From ef0c56d69f37485aa428dffa56eb7fd13ebd57c8 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Fri, 13 Jan 2023 16:30:50 -0600 Subject: [PATCH 001/143] mm/hugetlb: convert isolate_hugetlb to folios Patch series "continue hugetlb folio conversion", v3. This series continues the conversion of core hugetlb functions to use folios. This series converts many helper funtions in the hugetlb fault path. This is in preparation for another series to convert the hugetlb fault code paths to operate on folios. This patch (of 8): Convert isolate_hugetlb() to take in a folio and convert its callers to pass a folio. Use page_folio() to convert the callers to use a folio is safe as isolate_hugetlb() operates on a head page. Link: https://lkml.kernel.org/r/20230113223057.173292-1-sidhartha.kumar@oracle.com Link: https://lkml.kernel.org/r/20230113223057.173292-2-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: John Hubbard Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ++-- mm/gup.c | 2 +- mm/hugetlb.c | 16 ++++++++-------- mm/memory-failure.c | 2 +- mm/memory_hotplug.c | 2 +- mm/mempolicy.c | 2 +- mm/migrate.c | 2 +- 7 files changed, 15 insertions(+), 15 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index a51e6daacac64..6e38a019f654b 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -171,7 +171,7 @@ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, vm_flags_t vm_flags); long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); -int isolate_hugetlb(struct page *page, struct list_head *list); +int isolate_hugetlb(struct folio *folio, struct list_head *list); int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison); int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); @@ -413,7 +413,7 @@ static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, return NULL; } -static inline int isolate_hugetlb(struct page *page, struct list_head *list) +static inline int isolate_hugetlb(struct folio *folio, struct list_head *list) { return -EBUSY; } diff --git a/mm/gup.c b/mm/gup.c index 7708ffac318f4..918c364d01acb 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1905,7 +1905,7 @@ static unsigned long collect_longterm_unpinnable_pages( continue; if (folio_test_hugetlb(folio)) { - isolate_hugetlb(&folio->page, movable_page_list); + isolate_hugetlb(folio, movable_page_list); continue; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0f9df01437722..c4a63605b006f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2925,7 +2925,7 @@ static int alloc_and_dissolve_hugetlb_folio(struct hstate *h, * Fail with -EBUSY if not possible. */ spin_unlock_irq(&hugetlb_lock); - ret = isolate_hugetlb(&old_folio->page, list); + ret = isolate_hugetlb(old_folio, list); spin_lock_irq(&hugetlb_lock); goto free_new; } else if (!folio_test_hugetlb_freed(old_folio)) { @@ -3000,7 +3000,7 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) if (hstate_is_gigantic(h)) return -ENOMEM; - if (folio_ref_count(folio) && !isolate_hugetlb(&folio->page, list)) + if (folio_ref_count(folio) && !isolate_hugetlb(folio, list)) ret = 0; else if (!folio_ref_count(folio)) ret = alloc_and_dissolve_hugetlb_folio(h, folio, list); @@ -7250,19 +7250,19 @@ __weak unsigned long hugetlb_mask_last_page(struct hstate *h) * These functions are overwritable if your architecture needs its own * behavior. */ -int isolate_hugetlb(struct page *page, struct list_head *list) +int isolate_hugetlb(struct folio *folio, struct list_head *list) { int ret = 0; spin_lock_irq(&hugetlb_lock); - if (!PageHeadHuge(page) || - !HPageMigratable(page) || - !get_page_unless_zero(page)) { + if (!folio_test_hugetlb(folio) || + !folio_test_hugetlb_migratable(folio) || + !folio_try_get(folio)) { ret = -EBUSY; goto unlock; } - ClearHPageMigratable(page); - list_move_tail(&page->lru, list); + folio_clear_hugetlb_migratable(folio); + list_move_tail(&folio->lru, list); unlock: spin_unlock_irq(&hugetlb_lock); return ret; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index b4b30d9b07827..db85c2d37f70a 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2508,7 +2508,7 @@ static bool isolate_page(struct page *page, struct list_head *pagelist) bool isolated = false; if (PageHuge(page)) { - isolated = !isolate_hugetlb(page, pagelist); + isolated = !isolate_hugetlb(page_folio(page), pagelist); } else { bool lru = !__PageMovable(page); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index fd40f7e9f1763..a1e8c3e9ab080 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1641,7 +1641,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (PageHuge(page)) { pfn = page_to_pfn(head) + compound_nr(head) - 1; - isolate_hugetlb(head, &source); + isolate_hugetlb(folio, &source); continue; } else if (PageTransHuge(page)) pfn = page_to_pfn(head) + thp_nr_pages(page) - 1; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 72142fbe76528..b7d85b88ee872 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -602,7 +602,7 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, if (flags & (MPOL_MF_MOVE_ALL) || (flags & MPOL_MF_MOVE && page_mapcount(page) == 1 && !hugetlb_pmd_shared(pte))) { - if (isolate_hugetlb(page, qp->pagelist) && + if (isolate_hugetlb(page_folio(page), qp->pagelist) && (flags & MPOL_MF_STRICT)) /* * Failed to isolate page but allow migrating pages diff --git a/mm/migrate.c b/mm/migrate.c index 206fcdbe67f3d..f6464bce76780 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1773,7 +1773,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, if (PageHuge(page)) { if (PageHead(page)) { - err = isolate_hugetlb(page, pagelist); + err = isolate_hugetlb(page_folio(page), pagelist); if (!err) err = 1; } From f38bc5d6159c823d317129764ee49e5af92cc816 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Fri, 13 Jan 2023 16:30:51 -0600 Subject: [PATCH 002/143] mm/hugetlb: convert __update_and_free_page() to folios Change __update_and_free_page() to __update_and_free_hugetlb_folio() by changing its callers to pass in a folio. Link: https://lkml.kernel.org/r/20230113223057.173292-3-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: John Hubbard Cc: Matthew Wilcox Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c4a63605b006f..24c027582454e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1698,10 +1698,10 @@ static void add_hugetlb_folio(struct hstate *h, struct folio *folio, enqueue_hugetlb_folio(h, folio); } -static void __update_and_free_page(struct hstate *h, struct page *page) +static void __update_and_free_hugetlb_folio(struct hstate *h, + struct folio *folio) { int i; - struct folio *folio = page_folio(page); struct page *subpage; if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) @@ -1714,7 +1714,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page) if (folio_test_hugetlb_raw_hwp_unreliable(folio)) return; - if (hugetlb_vmemmap_restore(h, page)) { + if (hugetlb_vmemmap_restore(h, &folio->page)) { spin_lock_irq(&hugetlb_lock); /* * If we cannot allocate vmemmap pages, just refuse to free the @@ -1750,7 +1750,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page) destroy_compound_gigantic_folio(folio, huge_page_order(h)); free_gigantic_folio(folio, huge_page_order(h)); } else { - __free_pages(page, huge_page_order(h)); + __free_pages(&folio->page, huge_page_order(h)); } } @@ -1790,7 +1790,7 @@ static void free_hpage_workfn(struct work_struct *work) */ h = size_to_hstate(page_size(page)); - __update_and_free_page(h, page); + __update_and_free_hugetlb_folio(h, page_folio(page)); cond_resched(); } @@ -1807,7 +1807,7 @@ static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio, bool atomic) { if (!folio_test_hugetlb_vmemmap_optimized(folio) || !atomic) { - __update_and_free_page(h, &folio->page); + __update_and_free_hugetlb_folio(h, folio); return; } From 3f8cbe901bb3d0843f946e1dc3693504d59c495e Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Fri, 13 Jan 2023 16:30:52 -0600 Subject: [PATCH 003/143] mm/hugetlb: convert dequeue_hugetlb_page functions to folios dequeue_huge_page_node_exact() is changed to dequeue_hugetlb_folio_node_ exact() and dequeue_huge_page_nodemask() is changed to dequeue_hugetlb_ folio_nodemask(). Update their callers to pass in a folio. Link: https://lkml.kernel.org/r/20230113223057.173292-4-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Cc: John Hubbard Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 56 ++++++++++++++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 26 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 24c027582454e..04e47a69ac226 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1282,32 +1282,33 @@ static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio) folio_set_hugetlb_freed(folio); } -static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) +static struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h, + int nid) { - struct page *page; + struct folio *folio; bool pin = !!(current->flags & PF_MEMALLOC_PIN); lockdep_assert_held(&hugetlb_lock); - list_for_each_entry(page, &h->hugepage_freelists[nid], lru) { - if (pin && !is_longterm_pinnable_page(page)) + list_for_each_entry(folio, &h->hugepage_freelists[nid], lru) { + if (pin && !folio_is_longterm_pinnable(folio)) continue; - if (PageHWPoison(page)) + if (folio_test_hwpoison(folio)) continue; - list_move(&page->lru, &h->hugepage_activelist); - set_page_refcounted(page); - ClearHPageFreed(page); + list_move(&folio->lru, &h->hugepage_activelist); + folio_ref_unfreeze(folio, 1); + folio_clear_hugetlb_freed(folio); h->free_huge_pages--; h->free_huge_pages_node[nid]--; - return page; + return folio; } return NULL; } -static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid, - nodemask_t *nmask) +static struct folio *dequeue_hugetlb_folio_nodemask(struct hstate *h, gfp_t gfp_mask, + int nid, nodemask_t *nmask) { unsigned int cpuset_mems_cookie; struct zonelist *zonelist; @@ -1320,7 +1321,7 @@ static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) { - struct page *page; + struct folio *folio; if (!cpuset_zone_allowed(zone, gfp_mask)) continue; @@ -1332,9 +1333,9 @@ static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, continue; node = zone_to_nid(zone); - page = dequeue_huge_page_node_exact(h, node); - if (page) - return page; + folio = dequeue_hugetlb_folio_node_exact(h, node); + if (folio) + return folio; } if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; @@ -1352,7 +1353,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, unsigned long address, int avoid_reserve, long chg) { - struct page *page = NULL; + struct folio *folio = NULL; struct mempolicy *mpol; gfp_t gfp_mask; nodemask_t *nodemask; @@ -1374,22 +1375,24 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask); if (mpol_is_preferred_many(mpol)) { - page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask); + folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, + nid, nodemask); /* Fallback to all nodes if page==NULL */ nodemask = NULL; } - if (!page) - page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask); + if (!folio) + folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, + nid, nodemask); - if (page && !avoid_reserve && vma_has_reserves(vma, chg)) { - SetHPageRestoreReserve(page); + if (folio && !avoid_reserve && vma_has_reserves(vma, chg)) { + folio_set_hugetlb_restore_reserve(folio); h->resv_huge_pages--; } mpol_cond_put(mpol); - return page; + return &folio->page; err: return NULL; @@ -2475,12 +2478,13 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, { spin_lock_irq(&hugetlb_lock); if (available_huge_pages(h)) { - struct page *page; + struct folio *folio; - page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask); - if (page) { + folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, + preferred_nid, nmask); + if (folio) { spin_unlock_irq(&hugetlb_lock); - return page; + return &folio->page; } } spin_unlock_irq(&hugetlb_lock); From 0d24b7b68de1e2c863ed2e777f8969be7296f7b5 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Fri, 13 Jan 2023 16:30:53 -0600 Subject: [PATCH 004/143] mm/hugetlb: convert alloc_surplus_huge_page() to folios Change alloc_surplus_huge_page() to alloc_surplus_hugetlb_folio() and update its callers. Link: https://lkml.kernel.org/r/20230113223057.173292-5-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: John Hubbard Cc: Matthew Wilcox Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 04e47a69ac226..5bee90a2c1b31 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2378,8 +2378,8 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) /* * Allocates a fresh surplus page from the page allocator. */ -static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, - int nid, nodemask_t *nmask) +static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h, + gfp_t gfp_mask, int nid, nodemask_t *nmask) { struct folio *folio = NULL; @@ -2416,7 +2416,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, out_unlock: spin_unlock_irq(&hugetlb_lock); - return &folio->page; + return folio; } static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, @@ -2449,7 +2449,7 @@ static struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { - struct page *page = NULL; + struct folio *folio = NULL; struct mempolicy *mpol; gfp_t gfp_mask = htlb_alloc_mask(h); int nid; @@ -2460,16 +2460,16 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, gfp_t gfp = gfp_mask | __GFP_NOWARN; gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); - page = alloc_surplus_huge_page(h, gfp, nid, nodemask); + folio = alloc_surplus_hugetlb_folio(h, gfp, nid, nodemask); /* Fallback to all nodes if page==NULL */ nodemask = NULL; } - if (!page) - page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask); + if (!folio) + folio = alloc_surplus_hugetlb_folio(h, gfp_mask, nid, nodemask); mpol_cond_put(mpol); - return page; + return &folio->page; } /* page migration callback function */ @@ -2518,6 +2518,7 @@ static int gather_surplus_pages(struct hstate *h, long delta) __must_hold(&hugetlb_lock) { LIST_HEAD(surplus_list); + struct folio *folio; struct page *page, *tmp; int ret; long i; @@ -2537,13 +2538,13 @@ static int gather_surplus_pages(struct hstate *h, long delta) retry: spin_unlock_irq(&hugetlb_lock); for (i = 0; i < needed; i++) { - page = alloc_surplus_huge_page(h, htlb_alloc_mask(h), + folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h), NUMA_NO_NODE, NULL); - if (!page) { + if (!folio) { alloc_ok = false; break; } - list_add(&page->lru, &surplus_list); + list_add(&folio->lru, &surplus_list); cond_resched(); } allocated += i; @@ -3496,7 +3497,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, * First take pages out of surplus state. Then make up the * remaining difference by allocating fresh huge pages. * - * We might race with alloc_surplus_huge_page() here and be unable + * We might race with alloc_surplus_hugetlb_folio() here and be unable * to convert a surplus huge page to a normal huge page. That is * not critical, though, it just means the overall size of the * pool might be one hugepage larger than it needs to be, but @@ -3539,7 +3540,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, * By placing pages into the surplus state independent of the * overcommit value, we are allowing the surplus pool size to * exceed overcommit. There are few sane options here. Since - * alloc_surplus_huge_page() is checking the global counter, + * alloc_surplus_hugetlb_folio() is checking the global counter, * though, we'll note that we're not allowed to exceed surplus * and won't grow the pool anywhere else. Not until one of the * sysctls are changed, or the surplus pages go out of use. From 44815b049009d1f98b4444de9985b225eef3025a Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Fri, 13 Jan 2023 16:30:54 -0600 Subject: [PATCH 005/143] mm/hugetlb: increase use of folios in alloc_huge_page() Change hugetlb_cgroup_commit_charge{,_rsvd}(), dequeue_huge_page_vma() and alloc_buddy_huge_page_with_mpol() to use folios so alloc_huge_page() is cleaned by operating on folios until its return. Link: https://lkml.kernel.org/r/20230113223057.173292-6-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: John Hubbard Cc: Matthew Wilcox Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/hugetlb_cgroup.h | 8 ++++---- mm/hugetlb.c | 33 ++++++++++++++++----------------- mm/hugetlb_cgroup.c | 8 ++------ 3 files changed, 22 insertions(+), 27 deletions(-) diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index f706626a80635..3d82d91f49acb 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -141,10 +141,10 @@ extern int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr); extern void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, - struct page *page); + struct folio *folio); extern void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, - struct page *page); + struct folio *folio); extern void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, struct folio *folio); extern void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages, @@ -230,14 +230,14 @@ static inline int hugetlb_cgroup_charge_cgroup_rsvd(int idx, static inline void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, - struct page *page) + struct folio *folio) { } static inline void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, - struct page *page) + struct folio *folio) { } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5bee90a2c1b31..d6d78b0bd5827 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1348,7 +1348,7 @@ static unsigned long available_huge_pages(struct hstate *h) return h->free_huge_pages - h->resv_huge_pages; } -static struct page *dequeue_huge_page_vma(struct hstate *h, +static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address, int avoid_reserve, long chg) @@ -1392,7 +1392,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, } mpol_cond_put(mpol); - return &folio->page; + return folio; err: return NULL; @@ -2446,7 +2446,7 @@ static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, * Use the VMA's mpolicy to allocate a huge page from the buddy. */ static -struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, +struct folio *alloc_buddy_hugetlb_folio_with_mpol(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { struct folio *folio = NULL; @@ -2469,7 +2469,7 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, if (!folio) folio = alloc_surplus_hugetlb_folio(h, gfp_mask, nid, nodemask); mpol_cond_put(mpol); - return &folio->page; + return folio; } /* page migration callback function */ @@ -3018,7 +3018,6 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, { struct hugepage_subpool *spool = subpool_vma(vma); struct hstate *h = hstate_vma(vma); - struct page *page; struct folio *folio; long map_chg, map_commit; long gbl_chg; @@ -3082,34 +3081,34 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, * from the global free pool (global change). gbl_chg == 0 indicates * a reservation exists for the allocation. */ - page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg); - if (!page) { + folio = dequeue_hugetlb_folio_vma(h, vma, addr, avoid_reserve, gbl_chg); + if (!folio) { spin_unlock_irq(&hugetlb_lock); - page = alloc_buddy_huge_page_with_mpol(h, vma, addr); - if (!page) + folio = alloc_buddy_hugetlb_folio_with_mpol(h, vma, addr); + if (!folio) goto out_uncharge_cgroup; spin_lock_irq(&hugetlb_lock); if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { - SetHPageRestoreReserve(page); + folio_set_hugetlb_restore_reserve(folio); h->resv_huge_pages--; } - list_add(&page->lru, &h->hugepage_activelist); - set_page_refcounted(page); + list_add(&folio->lru, &h->hugepage_activelist); + folio_ref_unfreeze(folio, 1); /* Fall through */ } - folio = page_folio(page); - hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page); + + hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, folio); /* If allocation is not consuming a reservation, also store the * hugetlb_cgroup pointer on the page. */ if (deferred_reserve) { hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h), - h_cg, page); + h_cg, folio); } spin_unlock_irq(&hugetlb_lock); - hugetlb_set_page_subpool(page, spool); + hugetlb_set_folio_subpool(folio, spool); map_commit = vma_commit_reservation(h, vma, addr); if (unlikely(map_chg > map_commit)) { @@ -3130,7 +3129,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), pages_per_huge_page(h), folio); } - return page; + return &folio->page; out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index d9e4425d81ac4..dedd2edb076ec 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -331,19 +331,15 @@ static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, - struct page *page) + struct folio *folio) { - struct folio *folio = page_folio(page); - __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, false); } void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, - struct page *page) + struct folio *folio) { - struct folio *folio = page_folio(page); - __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, true); } From 2e6efda5ef1de344413f3e232485cb828aa3efb8 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Fri, 13 Jan 2023 16:30:55 -0600 Subject: [PATCH 006/143] mm/hugetlb: convert alloc_migrate_huge_page to folios Change alloc_huge_page_nodemask() to alloc_hugetlb_folio_nodemask() and alloc_migrate_huge_page() to alloc_migrate_hugetlb_folio(). Both functions now return a folio rather than a page. Link: https://lkml.kernel.org/r/20230113223057.173292-7-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: John Hubbard Cc: Matthew Wilcox Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 6 +++--- mm/hugetlb.c | 18 +++++++++--------- mm/migrate.c | 5 ++++- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 6e38a019f654b..2375c62c61a46 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -719,7 +719,7 @@ struct huge_bootmem_page { int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list); struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve); -struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, +struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask); struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address); @@ -1040,8 +1040,8 @@ static inline struct page *alloc_huge_page(struct vm_area_struct *vma, return NULL; } -static inline struct page * -alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, +static inline struct folio * +alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask) { return NULL; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d6d78b0bd5827..8adc964e18289 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2419,7 +2419,7 @@ static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h, return folio; } -static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, +static struct folio *alloc_migrate_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask) { struct folio *folio; @@ -2439,7 +2439,7 @@ static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, */ folio_set_hugetlb_temporary(folio); - return &folio->page; + return folio; } /* @@ -2472,8 +2472,8 @@ struct folio *alloc_buddy_hugetlb_folio_with_mpol(struct hstate *h, return folio; } -/* page migration callback function */ -struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, +/* folio migration callback function */ +struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask) { spin_lock_irq(&hugetlb_lock); @@ -2484,12 +2484,12 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, preferred_nid, nmask); if (folio) { spin_unlock_irq(&hugetlb_lock); - return &folio->page; + return folio; } } spin_unlock_irq(&hugetlb_lock); - return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask); + return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask); } /* mempolicy aware migration callback */ @@ -2498,16 +2498,16 @@ struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, { struct mempolicy *mpol; nodemask_t *nodemask; - struct page *page; + struct folio *folio; gfp_t gfp_mask; int node; gfp_mask = htlb_alloc_mask(h); node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); - page = alloc_huge_page_nodemask(h, node, nodemask, gfp_mask); + folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask); mpol_cond_put(mpol); - return page; + return &folio->page; } /* diff --git a/mm/migrate.c b/mm/migrate.c index f6464bce76780..811e76c6fac1a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1663,6 +1663,7 @@ struct page *alloc_migration_target(struct page *page, unsigned long private) struct migration_target_control *mtc; gfp_t gfp_mask; unsigned int order = 0; + struct folio *hugetlb_folio = NULL; struct folio *new_folio = NULL; int nid; int zidx; @@ -1677,7 +1678,9 @@ struct page *alloc_migration_target(struct page *page, unsigned long private) struct hstate *h = folio_hstate(folio); gfp_mask = htlb_modify_alloc_mask(h, gfp_mask); - return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask); + hugetlb_folio = alloc_hugetlb_folio_nodemask(h, nid, + mtc->nmask, gfp_mask); + return &hugetlb_folio->page; } if (folio_test_large(folio)) { From 95fa1984fce6c51afd378cacc152420aa82622f4 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Fri, 13 Jan 2023 16:30:56 -0600 Subject: [PATCH 007/143] mm/hugetlb: convert restore_reserve_on_error() to folios Use the hugetlb folio flag macros inside restore_reserve_on_error() and update the comments to reflect the use of folios. Link: https://lkml.kernel.org/r/20230113223057.173292-8-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: John Hubbard Cc: Matthew Wilcox Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8adc964e18289..f149111466fa2 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2819,22 +2819,23 @@ static long vma_del_reservation(struct hstate *h, void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, unsigned long address, struct page *page) { + struct folio *folio = page_folio(page); long rc = vma_needs_reservation(h, vma, address); - if (HPageRestoreReserve(page)) { + if (folio_test_hugetlb_restore_reserve(folio)) { if (unlikely(rc < 0)) /* * Rare out of memory condition in reserve map - * manipulation. Clear HPageRestoreReserve so that - * global reserve count will not be incremented + * manipulation. Clear hugetlb_restore_reserve so + * that global reserve count will not be incremented * by free_huge_page. This will make it appear - * as though the reservation for this page was + * as though the reservation for this folio was * consumed. This may prevent the task from - * faulting in the page at a later time. This + * faulting in the folio at a later time. This * is better than inconsistent global huge page * accounting of reserve counts. */ - ClearHPageRestoreReserve(page); + folio_clear_hugetlb_restore_reserve(folio); else if (rc) (void)vma_add_reservation(h, vma, address); else @@ -2845,7 +2846,7 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, * This indicates there is an entry in the reserve map * not added by alloc_huge_page. We know it was added * before the alloc_huge_page call, otherwise - * HPageRestoreReserve would be set on the page. + * hugetlb_restore_reserve would be set on the folio. * Remove the entry so that a subsequent allocation * does not consume a reservation. */ @@ -2854,12 +2855,12 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, /* * VERY rare out of memory condition. Since * we can not delete the entry, set - * HPageRestoreReserve so that the reserve - * count will be incremented when the page + * hugetlb_restore_reserve so that the reserve + * count will be incremented when the folio * is freed. This reserve will be consumed * on a subsequent allocation. */ - SetHPageRestoreReserve(page); + folio_set_hugetlb_restore_reserve(folio); } else if (rc < 0) { /* * Rare out of memory condition from @@ -2875,12 +2876,12 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, /* * For private mappings, no entry indicates * a reservation is present. Since we can - * not add an entry, set SetHPageRestoreReserve - * on the page so reserve count will be + * not add an entry, set hugetlb_restore_reserve + * on the folio so reserve count will be * incremented when freed. This reserve will * be consumed on a subsequent allocation. */ - SetHPageRestoreReserve(page); + folio_set_hugetlb_restore_reserve(folio); } else /* * No reservation present, do nothing From 8ef8261295a07df1ff77ef1930b3f8b4a405813f Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Fri, 13 Jan 2023 16:30:57 -0600 Subject: [PATCH 008/143] mm/hugetlb: convert demote_free_huge_page to folios Change demote_free_huge_page to demote_free_hugetlb_folio() and change demote_pool_huge_page() pass in a folio. Link: https://lkml.kernel.org/r/20230113223057.173292-9-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Cc: John Hubbard Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f149111466fa2..d20c8b09890e1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3579,12 +3579,12 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, return 0; } -static int demote_free_huge_page(struct hstate *h, struct page *page) +static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio) { - int i, nid = page_to_nid(page); + int i, nid = folio_nid(folio); struct hstate *target_hstate; - struct folio *folio = page_folio(page); struct page *subpage; + struct folio *inner_folio; int rc = 0; target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order); @@ -3592,18 +3592,18 @@ static int demote_free_huge_page(struct hstate *h, struct page *page) remove_hugetlb_folio_for_demote(h, folio, false); spin_unlock_irq(&hugetlb_lock); - rc = hugetlb_vmemmap_restore(h, page); + rc = hugetlb_vmemmap_restore(h, &folio->page); if (rc) { - /* Allocation of vmemmmap failed, we can not demote page */ + /* Allocation of vmemmmap failed, we can not demote folio */ spin_lock_irq(&hugetlb_lock); - set_page_refcounted(page); - add_hugetlb_folio(h, page_folio(page), false); + folio_ref_unfreeze(folio, 1); + add_hugetlb_folio(h, folio, false); return rc; } /* * Use destroy_compound_hugetlb_folio_for_demote for all huge page - * sizes as it will not ref count pages. + * sizes as it will not ref count folios. */ destroy_compound_hugetlb_folio_for_demote(folio, huge_page_order(h)); @@ -3618,15 +3618,15 @@ static int demote_free_huge_page(struct hstate *h, struct page *page) mutex_lock(&target_hstate->resize_lock); for (i = 0; i < pages_per_huge_page(h); i += pages_per_huge_page(target_hstate)) { - subpage = nth_page(page, i); - folio = page_folio(subpage); + subpage = folio_page(folio, i); + inner_folio = page_folio(subpage); if (hstate_is_gigantic(target_hstate)) - prep_compound_gigantic_folio_for_demote(folio, + prep_compound_gigantic_folio_for_demote(inner_folio, target_hstate->order); else prep_compound_page(subpage, target_hstate->order); - set_page_private(subpage, 0); - prep_new_hugetlb_folio(target_hstate, folio, nid); + folio_change_private(inner_folio, NULL); + prep_new_hugetlb_folio(target_hstate, inner_folio, nid); free_huge_page(subpage); } mutex_unlock(&target_hstate->resize_lock); @@ -3648,7 +3648,7 @@ static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed) __must_hold(&hugetlb_lock) { int nr_nodes, node; - struct page *page; + struct folio *folio; lockdep_assert_held(&hugetlb_lock); @@ -3659,11 +3659,10 @@ static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed) } for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { - list_for_each_entry(page, &h->hugepage_freelists[node], lru) { - if (PageHWPoison(page)) + list_for_each_entry(folio, &h->hugepage_freelists[node], lru) { + if (folio_test_hwpoison(folio)) continue; - - return demote_free_huge_page(h, page); + return demote_free_hugetlb_folio(h, folio); } } From 443d24ed17e38abb274d15ddfdce070730b721fc Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 25 Jan 2023 09:05:30 -0800 Subject: [PATCH 009/143] mm/hugetlb: convert hugetlb_install_page to folios Patch series "convert hugetlb fault functions to folios", v2. This series converts the hugetlb page faulting functions to operate on folios. These include hugetlb_no_page(), hugetlb_wp(), copy_hugetlb_page_range(), and hugetlb_mcopy_atomic_pte(). This patch (of 8): Change hugetlb_install_page() to hugetlb_install_folio(). This reduces one user of the Huge Page flag macros which take in a page. Link: https://lkml.kernel.org/r/20230125170537.96973-1-sidhartha.kumar@oracle.com Link: https://lkml.kernel.org/r/20230125170537.96973-2-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: Gerald Schaefer Cc: John Hubbard Cc: Matthew Wilcox Cc: Muchun Song Cc: kernel test robot Signed-off-by: Andrew Morton --- mm/hugetlb.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d20c8b09890e1..849206e94742b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4946,14 +4946,14 @@ static bool is_hugetlb_entry_hwpoisoned(pte_t pte) } static void -hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr, - struct page *new_page) +hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr, + struct folio *new_folio) { - __SetPageUptodate(new_page); - hugepage_add_new_anon_rmap(new_page, vma, addr); - set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1)); + __folio_mark_uptodate(new_folio); + hugepage_add_new_anon_rmap(&new_folio->page, vma, addr); + set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, &new_folio->page, 1)); hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm); - SetHPageMigratable(new_page); + folio_set_hugetlb_migratable(new_folio); } int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, @@ -5107,7 +5107,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, /* huge_ptep of dst_pte won't change as in child */ goto again; } - hugetlb_install_page(dst_vma, dst_pte, addr, new); + hugetlb_install_folio(dst_vma, dst_pte, addr, page_folio(new)); spin_unlock(src_ptl); spin_unlock(dst_ptl); continue; From 64c13b9992f19592b3fcc796858a7f1471de9e6f Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 25 Jan 2023 09:05:31 -0800 Subject: [PATCH 010/143] mm/hugetlb: convert hugetlbfs_pagecache_present() to folios Refactor hugetlbfs_pagecache_present() to avoid getting and dropping a refcount on a page. Use RCU and page_cache_next_miss() instead. Link: https://lkml.kernel.org/r/20230125170537.96973-3-sidhartha.kumar@oracle.com Suggested-by: Matthew Wilcox Signed-off-by: Sidhartha Kumar Cc: Gerald Schaefer Cc: John Hubbard Cc: kernel test robot Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 849206e94742b..e506a46b78717 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5651,17 +5651,15 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, static bool hugetlbfs_pagecache_present(struct hstate *h, struct vm_area_struct *vma, unsigned long address) { - struct address_space *mapping; - pgoff_t idx; - struct page *page; + struct address_space *mapping = vma->vm_file->f_mapping; + pgoff_t idx = vma_hugecache_offset(h, vma, address); + bool present; - mapping = vma->vm_file->f_mapping; - idx = vma_hugecache_offset(h, vma, address); + rcu_read_lock(); + present = page_cache_next_miss(mapping, idx, 1) != idx; + rcu_read_unlock(); - page = find_get_page(mapping, idx); - if (page) - put_page(page); - return page != NULL; + return present; } int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping, From d991869123cd6fbb59e395dd7449408e7dc728ec Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 25 Jan 2023 09:05:32 -0800 Subject: [PATCH 011/143] mm/hugetlb: convert putback_active_hugepage to take in a folio Convert putback_active_hugepage() to folio_putback_active_hugetlb(), this removes one user of the Huge Page macros which take in a page. The callers in migrate.c are also cleaned up by being able to directly use the src and dst folio variables. Link: https://lkml.kernel.org/r/20230125170537.96973-4-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: Gerald Schaefer Cc: John Hubbard Cc: kernel test robot Cc: Matthew Wilcox Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ++-- mm/hugetlb.c | 8 ++++---- mm/migrate.c | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 2375c62c61a46..067906c5778e7 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -175,7 +175,7 @@ int isolate_hugetlb(struct folio *folio, struct list_head *list); int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison); int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); -void putback_active_hugepage(struct page *page); +void folio_putback_active_hugetlb(struct folio *folio); void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason); void free_huge_page(struct page *page); void hugetlb_fix_reserve_counts(struct inode *inode); @@ -429,7 +429,7 @@ static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags, return 0; } -static inline void putback_active_hugepage(struct page *page) +static inline void folio_putback_active_hugetlb(struct folio *folio) { } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e506a46b78717..ded5fe790c797 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -7300,13 +7300,13 @@ int get_huge_page_for_hwpoison(unsigned long pfn, int flags, return ret; } -void putback_active_hugepage(struct page *page) +void folio_putback_active_hugetlb(struct folio *folio) { spin_lock_irq(&hugetlb_lock); - SetHPageMigratable(page); - list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); + folio_set_hugetlb_migratable(folio); + list_move_tail(&folio->lru, &(folio_hstate(folio))->hugepage_activelist); spin_unlock_irq(&hugetlb_lock); - put_page(page); + folio_put(folio); } void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason) diff --git a/mm/migrate.c b/mm/migrate.c index 811e76c6fac1a..c09872cf41b76 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -151,7 +151,7 @@ void putback_movable_pages(struct list_head *l) list_for_each_entry_safe(page, page2, l, lru) { if (unlikely(PageHuge(page))) { - putback_active_hugepage(page); + folio_putback_active_hugetlb(page_folio(page)); continue; } list_del(&page->lru); @@ -1298,7 +1298,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (folio_ref_count(src) == 1) { /* page was freed from under us. So we are done. */ - putback_active_hugepage(hpage); + folio_putback_active_hugetlb(src); return MIGRATEPAGE_SUCCESS; } @@ -1383,7 +1383,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, folio_unlock(src); out: if (rc == MIGRATEPAGE_SUCCESS) - putback_active_hugepage(hpage); + folio_putback_active_hugetlb(src); else if (rc != -EAGAIN) list_move_tail(&src->lru, ret); @@ -1395,7 +1395,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (put_new_page) put_new_page(new_hpage, private); else - putback_active_hugepage(new_hpage); + folio_putback_active_hugetlb(dst); return rc; } From 949625bca5224c98e475d2234e0f22ce754d97d9 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 25 Jan 2023 09:05:33 -0800 Subject: [PATCH 012/143] mm/hugetlb: convert hugetlb fault paths to use alloc_hugetlb_folio() Change alloc_huge_page() to alloc_hugetlb_folio() by changing all callers to handle the now folio return type of the function. In this conversion, alloc_huge_page_vma() is also changed to alloc_hugetlb_folio_vma() and hugepage_add_new_anon_rmap() is changed to take in a folio directly. Many additions of '&folio->page' are cleaned up in subsequent patches. hugetlbfs_fallocate() is also refactored to use the RCU + page_cache_next_miss() API. Link: https://lkml.kernel.org/r/20230125170537.96973-5-sidhartha.kumar@oracle.com Suggested-by: Mike Kravetz Reported-by: kernel test robot Signed-off-by: Sidhartha Kumar Cc: Gerald Schaefer Cc: John Hubbard Cc: Matthew Wilcox Cc: Muchun Song Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 40 ++++---- include/linux/hugetlb.h | 8 +- include/linux/rmap.h | 2 +- mm/hugetlb.c | 201 ++++++++++++++++++++-------------------- mm/mempolicy.c | 6 +- mm/rmap.c | 6 +- 6 files changed, 133 insertions(+), 130 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 48f1a8ad22431..ac16ada48f754 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -819,8 +819,9 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, * This is supposed to be the vaddr where the page is being * faulted in, but we have no vaddr here. */ - struct page *page; + struct folio *folio; unsigned long addr; + bool present; cond_resched(); @@ -844,48 +845,49 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, mutex_lock(&hugetlb_fault_mutex_table[hash]); /* See if already present in mapping to avoid alloc/free */ - page = find_get_page(mapping, index); - if (page) { - put_page(page); + rcu_read_lock(); + present = page_cache_next_miss(mapping, index, 1) != index; + rcu_read_unlock(); + if (present) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); hugetlb_drop_vma_policy(&pseudo_vma); continue; } /* - * Allocate page without setting the avoid_reserve argument. + * Allocate folio without setting the avoid_reserve argument. * There certainly are no reserves associated with the * pseudo_vma. However, there could be shared mappings with * reserves for the file at the inode level. If we fallocate - * pages in these areas, we need to consume the reserves + * folios in these areas, we need to consume the reserves * to keep reservation accounting consistent. */ - page = alloc_huge_page(&pseudo_vma, addr, 0); + folio = alloc_hugetlb_folio(&pseudo_vma, addr, 0); hugetlb_drop_vma_policy(&pseudo_vma); - if (IS_ERR(page)) { + if (IS_ERR(folio)) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); - error = PTR_ERR(page); + error = PTR_ERR(folio); goto out; } - clear_huge_page(page, addr, pages_per_huge_page(h)); - __SetPageUptodate(page); - error = hugetlb_add_to_page_cache(page, mapping, index); + clear_huge_page(&folio->page, addr, pages_per_huge_page(h)); + __folio_mark_uptodate(folio); + error = hugetlb_add_to_page_cache(&folio->page, mapping, index); if (unlikely(error)) { - restore_reserve_on_error(h, &pseudo_vma, addr, page); - put_page(page); + restore_reserve_on_error(h, &pseudo_vma, addr, &folio->page); + folio_put(folio); mutex_unlock(&hugetlb_fault_mutex_table[hash]); goto out; } mutex_unlock(&hugetlb_fault_mutex_table[hash]); - SetHPageMigratable(page); + folio_set_hugetlb_migratable(folio); /* - * unlock_page because locked by hugetlb_add_to_page_cache() - * put_page() due to reference from alloc_huge_page() + * folio_unlock because locked by hugetlb_add_to_page_cache() + * folio_put() due to reference from alloc_hugetlb_folio() */ - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); } if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 067906c5778e7..6408f85e57545 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -717,11 +717,11 @@ struct huge_bootmem_page { }; int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list); -struct page *alloc_huge_page(struct vm_area_struct *vma, +struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve); struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask); -struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, +struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address); int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t idx); @@ -1033,7 +1033,7 @@ static inline int isolate_or_dissolve_huge_page(struct page *page, return -ENOMEM; } -static inline struct page *alloc_huge_page(struct vm_area_struct *vma, +static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) { @@ -1047,7 +1047,7 @@ alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, return NULL; } -static inline struct page *alloc_huge_page_vma(struct hstate *h, +static inline struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address) { diff --git a/include/linux/rmap.h b/include/linux/rmap.h index a6bd1f0a183d9..a4570da03e58a 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -203,7 +203,7 @@ void page_remove_rmap(struct page *, struct vm_area_struct *, void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address, rmap_t flags); -void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, +void hugepage_add_new_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address); static inline void __page_dup_rmap(struct page *page, bool compound) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ded5fe790c797..e104dba1e5ea4 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2493,7 +2493,7 @@ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, } /* mempolicy aware migration callback */ -struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, +struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address) { struct mempolicy *mpol; @@ -2507,7 +2507,7 @@ struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask); mpol_cond_put(mpol); - return &folio->page; + return folio; } /* @@ -2798,14 +2798,14 @@ static long vma_del_reservation(struct hstate *h, /* * This routine is called to restore reservation information on error paths. - * It should ONLY be called for pages allocated via alloc_huge_page(), and - * the hugetlb mutex should remain held when calling this routine. + * It should ONLY be called for folios allocated via alloc_hugetlb_folio(), + * and the hugetlb mutex should remain held when calling this routine. * * It handles two specific cases: * 1) A reservation was in place and the page consumed the reservation. * HPageRestoreReserve is set in the page. * 2) No reservation was in place for the page, so HPageRestoreReserve is - * not set. However, alloc_huge_page always updates the reserve map. + * not set. However, alloc_hugetlb_folio always updates the reserve map. * * In case 1, free_huge_page later in the error path will increment the * global reserve count. But, free_huge_page does not have enough context @@ -2814,7 +2814,7 @@ static long vma_del_reservation(struct hstate *h, * reserve count adjustments to be made by free_huge_page. Make sure the * reserve map indicates there is a reservation present. * - * In case 2, simply undo reserve map modifications done by alloc_huge_page. + * In case 2, simply undo reserve map modifications done by alloc_hugetlb_folio. */ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, unsigned long address, struct page *page) @@ -2844,8 +2844,8 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, if (!rc) { /* * This indicates there is an entry in the reserve map - * not added by alloc_huge_page. We know it was added - * before the alloc_huge_page call, otherwise + * not added by alloc_hugetlb_folio. We know it was added + * before the alloc_hugetlb_folio call, otherwise * hugetlb_restore_reserve would be set on the folio. * Remove the entry so that a subsequent allocation * does not consume a reservation. @@ -3014,7 +3014,7 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) return ret; } -struct page *alloc_huge_page(struct vm_area_struct *vma, +struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) { struct hugepage_subpool *spool = subpool_vma(vma); @@ -3023,7 +3023,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, long map_chg, map_commit; long gbl_chg; int ret, idx; - struct hugetlb_cgroup *h_cg; + struct hugetlb_cgroup *h_cg = NULL; bool deferred_reserve; idx = hstate_index(h); @@ -3130,7 +3130,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), pages_per_huge_page(h), folio); } - return &folio->page; + return folio; out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); @@ -4950,7 +4950,7 @@ hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long add struct folio *new_folio) { __folio_mark_uptodate(new_folio); - hugepage_add_new_anon_rmap(&new_folio->page, vma, addr); + hugepage_add_new_anon_rmap(new_folio, vma, addr); set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, &new_folio->page, 1)); hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm); folio_set_hugetlb_migratable(new_folio); @@ -5080,34 +5080,34 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, } else if (page_try_dup_anon_rmap(ptepage, true, src_vma)) { pte_t src_pte_old = entry; - struct page *new; + struct folio *new_folio; spin_unlock(src_ptl); spin_unlock(dst_ptl); /* Do not use reserve as it's private owned */ - new = alloc_huge_page(dst_vma, addr, 1); - if (IS_ERR(new)) { + new_folio = alloc_hugetlb_folio(dst_vma, addr, 1); + if (IS_ERR(new_folio)) { put_page(ptepage); - ret = PTR_ERR(new); + ret = PTR_ERR(new_folio); break; } - copy_user_huge_page(new, ptepage, addr, dst_vma, + copy_user_huge_page(&new_folio->page, ptepage, addr, dst_vma, npages); put_page(ptepage); - /* Install the new huge page if src pte stable */ + /* Install the new hugetlb folio if src pte stable */ dst_ptl = huge_pte_lock(h, dst, dst_pte); src_ptl = huge_pte_lockptr(h, src, src_pte); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); entry = huge_ptep_get(src_pte); if (!pte_same(src_pte_old, entry)) { restore_reserve_on_error(h, dst_vma, addr, - new); - put_page(new); + &new_folio->page); + folio_put(new_folio); /* huge_ptep of dst_pte won't change as in child */ goto again; } - hugetlb_install_folio(dst_vma, dst_pte, addr, page_folio(new)); + hugetlb_install_folio(dst_vma, dst_pte, addr, new_folio); spin_unlock(src_ptl); spin_unlock(dst_ptl); continue; @@ -5478,7 +5478,8 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, const bool unshare = flags & FAULT_FLAG_UNSHARE; pte_t pte; struct hstate *h = hstate_vma(vma); - struct page *old_page, *new_page; + struct page *old_page; + struct folio *new_folio; int outside_reserve = 0; vm_fault_t ret = 0; unsigned long haddr = address & huge_page_mask(h); @@ -5539,9 +5540,9 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, * be acquired again before returning to the caller, as expected. */ spin_unlock(ptl); - new_page = alloc_huge_page(vma, haddr, outside_reserve); + new_folio = alloc_hugetlb_folio(vma, haddr, outside_reserve); - if (IS_ERR(new_page)) { + if (IS_ERR(new_folio)) { /* * If a process owning a MAP_PRIVATE mapping fails to COW, * it is due to references held by a child and an insufficient @@ -5586,7 +5587,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, return 0; } - ret = vmf_error(PTR_ERR(new_page)); + ret = vmf_error(PTR_ERR(new_folio)); goto out_release_old; } @@ -5599,9 +5600,9 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, goto out_release_all; } - copy_user_huge_page(new_page, old_page, address, vma, + copy_user_huge_page(&new_folio->page, old_page, address, vma, pages_per_huge_page(h)); - __SetPageUptodate(new_page); + __folio_mark_uptodate(new_folio); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, haddr, haddr + huge_page_size(h)); @@ -5618,12 +5619,12 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, huge_ptep_clear_flush(vma, haddr, ptep); mmu_notifier_invalidate_range(mm, range.start, range.end); page_remove_rmap(old_page, vma, true); - hugepage_add_new_anon_rmap(new_page, vma, haddr); + hugepage_add_new_anon_rmap(new_folio, vma, haddr); set_huge_pte_at(mm, haddr, ptep, - make_huge_pte(vma, new_page, !unshare)); - SetHPageMigratable(new_page); + make_huge_pte(vma, &new_folio->page, !unshare)); + folio_set_hugetlb_migratable(new_folio); /* Make the old page be freed below */ - new_page = old_page; + new_folio = page_folio(old_page); } spin_unlock(ptl); mmu_notifier_invalidate_range_end(&range); @@ -5632,9 +5633,9 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, * No restore in case of successful pagetable update (Break COW or * unshare) */ - if (new_page != old_page) - restore_reserve_on_error(h, vma, haddr, new_page); - put_page(new_page); + if (new_folio != page_folio(old_page)) + restore_reserve_on_error(h, vma, haddr, &new_folio->page); + folio_put(new_folio); out_release_old: put_page(old_page); @@ -5753,11 +5754,11 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, vm_fault_t ret = VM_FAULT_SIGBUS; int anon_rmap = 0; unsigned long size; - struct page *page; + struct folio *folio; pte_t new_pte; spinlock_t *ptl; unsigned long haddr = address & huge_page_mask(h); - bool new_page, new_pagecache_page = false; + bool new_folio, new_pagecache_folio = false; u32 hash = hugetlb_fault_mutex_hash(mapping, idx); /* @@ -5776,9 +5777,9 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * Use page lock to guard against racing truncation * before we get page_table_lock. */ - new_page = false; - page = find_lock_page(mapping, idx); - if (!page) { + new_folio = false; + folio = filemap_lock_folio(mapping, idx); + if (!folio) { size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) goto out; @@ -5811,8 +5812,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, VM_UFFD_MISSING); } - page = alloc_huge_page(vma, haddr, 0); - if (IS_ERR(page)) { + folio = alloc_hugetlb_folio(vma, haddr, 0); + if (IS_ERR(folio)) { /* * Returning error will result in faulting task being * sent SIGBUS. The hugetlb fault mutex prevents two @@ -5826,17 +5827,17 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * sure there really is no pte entry. */ if (hugetlb_pte_stable(h, mm, ptep, old_pte)) - ret = vmf_error(PTR_ERR(page)); + ret = vmf_error(PTR_ERR(folio)); else ret = 0; goto out; } - clear_huge_page(page, address, pages_per_huge_page(h)); - __SetPageUptodate(page); - new_page = true; + clear_huge_page(&folio->page, address, pages_per_huge_page(h)); + __folio_mark_uptodate(folio); + new_folio = true; if (vma->vm_flags & VM_MAYSHARE) { - int err = hugetlb_add_to_page_cache(page, mapping, idx); + int err = hugetlb_add_to_page_cache(&folio->page, mapping, idx); if (err) { /* * err can't be -EEXIST which implies someone @@ -5845,13 +5846,13 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * to the page cache. So it's safe to call * restore_reserve_on_error() here. */ - restore_reserve_on_error(h, vma, haddr, page); - put_page(page); + restore_reserve_on_error(h, vma, haddr, &folio->page); + folio_put(folio); goto out; } - new_pagecache_page = true; + new_pagecache_folio = true; } else { - lock_page(page); + folio_lock(folio); if (unlikely(anon_vma_prepare(vma))) { ret = VM_FAULT_OOM; goto backout_unlocked; @@ -5864,7 +5865,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * don't have hwpoisoned swap entry for errored virtual address. * So we need to block hugepage fault by PG_hwpoison bit check. */ - if (unlikely(PageHWPoison(page))) { + if (unlikely(folio_test_hwpoison(folio))) { ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); goto backout_unlocked; @@ -5872,8 +5873,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, /* Check for page in userfault range. */ if (userfaultfd_minor(vma)) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); /* See comment in userfaultfd_missing() block above */ if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) { ret = 0; @@ -5907,10 +5908,10 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, goto backout; if (anon_rmap) - hugepage_add_new_anon_rmap(page, vma, haddr); + hugepage_add_new_anon_rmap(folio, vma, haddr); else - page_dup_file_rmap(page, true); - new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) + page_dup_file_rmap(&folio->page, true); + new_pte = make_huge_pte(vma, &folio->page, ((vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_SHARED))); /* * If this pte was previously wr-protected, keep it wr-protected even @@ -5923,20 +5924,20 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, hugetlb_count_add(pages_per_huge_page(h), mm); if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ - ret = hugetlb_wp(mm, vma, address, ptep, flags, page, ptl); + ret = hugetlb_wp(mm, vma, address, ptep, flags, &folio->page, ptl); } spin_unlock(ptl); /* - * Only set HPageMigratable in newly allocated pages. Existing pages - * found in the pagecache may not have HPageMigratableset if they have + * Only set hugetlb_migratable in newly allocated pages. Existing pages + * found in the pagecache may not have hugetlb_migratable if they have * been isolated for migration. */ - if (new_page) - SetHPageMigratable(page); + if (new_folio) + folio_set_hugetlb_migratable(folio); - unlock_page(page); + folio_unlock(folio); out: hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); @@ -5945,11 +5946,11 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, backout: spin_unlock(ptl); backout_unlocked: - if (new_page && !new_pagecache_page) - restore_reserve_on_error(h, vma, haddr, page); + if (new_folio && !new_pagecache_folio) + restore_reserve_on_error(h, vma, haddr, &folio->page); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); goto out; } @@ -6173,16 +6174,16 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t _dst_pte; spinlock_t *ptl; int ret = -ENOMEM; - struct page *page; + struct folio *folio; int writable; - bool page_in_pagecache = false; + bool folio_in_pagecache = false; if (is_continue) { ret = -EFAULT; - page = find_lock_page(mapping, idx); - if (!page) + folio = filemap_lock_folio(mapping, idx); + if (!folio) goto out; - page_in_pagecache = true; + folio_in_pagecache = true; } else if (!*pagep) { /* If a page already exists, then it's UFFDIO_COPY for * a non-missing case. Return -EEXIST. @@ -6193,34 +6194,34 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, goto out; } - page = alloc_huge_page(dst_vma, dst_addr, 0); - if (IS_ERR(page)) { + folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0); + if (IS_ERR(folio)) { ret = -ENOMEM; goto out; } - ret = copy_huge_page_from_user(page, + ret = copy_huge_page_from_user(&folio->page, (const void __user *) src_addr, pages_per_huge_page(h), false); /* fallback to copy_from_user outside mmap_lock */ if (unlikely(ret)) { ret = -ENOENT; - /* Free the allocated page which may have + /* Free the allocated folio which may have * consumed a reservation. */ - restore_reserve_on_error(h, dst_vma, dst_addr, page); - put_page(page); + restore_reserve_on_error(h, dst_vma, dst_addr, &folio->page); + folio_put(folio); - /* Allocate a temporary page to hold the copied + /* Allocate a temporary folio to hold the copied * contents. */ - page = alloc_huge_page_vma(h, dst_vma, dst_addr); - if (!page) { + folio = alloc_hugetlb_folio_vma(h, dst_vma, dst_addr); + if (!folio) { ret = -ENOMEM; goto out; } - *pagep = page; + *pagep = &folio->page; /* Set the outparam pagep and return to the caller to * copy the contents outside the lock. Don't free the * page. @@ -6236,25 +6237,25 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, goto out; } - page = alloc_huge_page(dst_vma, dst_addr, 0); - if (IS_ERR(page)) { + folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0); + if (IS_ERR(folio)) { put_page(*pagep); ret = -ENOMEM; *pagep = NULL; goto out; } - copy_user_huge_page(page, *pagep, dst_addr, dst_vma, + copy_user_huge_page(&folio->page, *pagep, dst_addr, dst_vma, pages_per_huge_page(h)); put_page(*pagep); *pagep = NULL; } /* - * The memory barrier inside __SetPageUptodate makes sure that + * The memory barrier inside __folio_mark_uptodate makes sure that * preceding stores to the page contents become visible before * the set_pte_at() write. */ - __SetPageUptodate(page); + __folio_mark_uptodate(folio); /* Add shared, newly allocated pages to the page cache. */ if (vm_shared && !is_continue) { @@ -6269,16 +6270,16 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, * hugetlb_fault_mutex_table that here must be hold by * the caller. */ - ret = hugetlb_add_to_page_cache(page, mapping, idx); + ret = hugetlb_add_to_page_cache(&folio->page, mapping, idx); if (ret) goto out_release_nounlock; - page_in_pagecache = true; + folio_in_pagecache = true; } ptl = huge_pte_lock(h, dst_mm, dst_pte); ret = -EIO; - if (PageHWPoison(page)) + if (folio_test_hwpoison(folio)) goto out_release_unlock; /* @@ -6290,10 +6291,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, if (!huge_pte_none_mostly(huge_ptep_get(dst_pte))) goto out_release_unlock; - if (page_in_pagecache) - page_dup_file_rmap(page, true); + if (folio_in_pagecache) + page_dup_file_rmap(&folio->page, true); else - hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); + hugepage_add_new_anon_rmap(folio, dst_vma, dst_addr); /* * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY @@ -6304,7 +6305,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, else writable = dst_vma->vm_flags & VM_WRITE; - _dst_pte = make_huge_pte(dst_vma, page, writable); + _dst_pte = make_huge_pte(dst_vma, &folio->page, writable); /* * Always mark UFFDIO_COPY page dirty; note that this may not be * extremely important for hugetlbfs for now since swapping is not @@ -6326,20 +6327,20 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, spin_unlock(ptl); if (!is_continue) - SetHPageMigratable(page); + folio_set_hugetlb_migratable(folio); if (vm_shared || is_continue) - unlock_page(page); + folio_unlock(folio); ret = 0; out: return ret; out_release_unlock: spin_unlock(ptl); if (vm_shared || is_continue) - unlock_page(page); + folio_unlock(folio); out_release_nounlock: - if (!page_in_pagecache) - restore_reserve_on_error(h, dst_vma, dst_addr, page); - put_page(page); + if (!folio_in_pagecache) + restore_reserve_on_error(h, dst_vma, dst_addr, &folio->page); + folio_put(folio); goto out; } #endif /* CONFIG_USERFAULTFD */ @@ -6871,7 +6872,7 @@ bool hugetlb_reserve_pages(struct inode *inode, /* * pages in this range were added to the reserve * map between region_chg and region_add. This - * indicates a race with alloc_huge_page. Adjust + * indicates a race with alloc_hugetlb_folio. Adjust * the subpool and reserve counts modified above * based on the difference. */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b7d85b88ee872..2202e76192b29 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1219,9 +1219,11 @@ static struct page *new_page(struct page *page, unsigned long start) break; } - if (folio_test_hugetlb(src)) - return alloc_huge_page_vma(page_hstate(&src->page), + if (folio_test_hugetlb(src)) { + dst = alloc_hugetlb_folio_vma(folio_hstate(src), vma, address); + return &dst->page; + } if (folio_test_large(src)) gfp = GFP_TRANSHUGE; diff --git a/mm/rmap.c b/mm/rmap.c index 43760d6220409..e143ab8ed5e22 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -2533,15 +2533,13 @@ void hugepage_add_anon_rmap(struct page *page, struct vm_area_struct *vma, !!(flags & RMAP_EXCLUSIVE)); } -void hugepage_add_new_anon_rmap(struct page *page, +void hugepage_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, unsigned long address) { - struct folio *folio = page_folio(page); - BUG_ON(address < vma->vm_start || address >= vma->vm_end); /* increment count (starts at -1) */ atomic_set(&folio->_entire_mapcount, 0); folio_clear_hugetlb_restore_reserve(folio); - __page_set_anon_rmap(folio, page, vma, address, 1); + __page_set_anon_rmap(folio, &folio->page, vma, address, 1); } #endif /* CONFIG_HUGETLB_PAGE */ From 5fd8d124d284c6d7c6e0fb0669f386d2da775eaa Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 25 Jan 2023 09:05:34 -0800 Subject: [PATCH 013/143] mm/hugetlb: convert restore_reserve_on_error to take in a folio Every caller of restore_reserve_on_error() is now passing in &folio->page, change the function to take in a folio directly and clean up the call sites. Link: https://lkml.kernel.org/r/20230125170537.96973-6-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Cc: Gerald Schaefer Cc: John Hubbard Cc: kernel test robot Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 2 +- include/linux/hugetlb.h | 2 +- mm/hugetlb.c | 21 ++++++++++----------- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index ac16ada48f754..e2b8a00696bda 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -873,7 +873,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, __folio_mark_uptodate(folio); error = hugetlb_add_to_page_cache(&folio->page, mapping, index); if (unlikely(error)) { - restore_reserve_on_error(h, &pseudo_vma, addr, &folio->page); + restore_reserve_on_error(h, &pseudo_vma, addr, folio); folio_put(folio); mutex_unlock(&hugetlb_fault_mutex_table[hash]); goto out; diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 6408f85e57545..20ceaaea1697a 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -726,7 +726,7 @@ struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *v int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t idx); void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, - unsigned long address, struct page *page); + unsigned long address, struct folio *folio); /* arch callback */ int __init __alloc_bootmem_huge_page(struct hstate *h, int nid); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e104dba1e5ea4..aad028706fbc8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2802,9 +2802,9 @@ static long vma_del_reservation(struct hstate *h, * and the hugetlb mutex should remain held when calling this routine. * * It handles two specific cases: - * 1) A reservation was in place and the page consumed the reservation. - * HPageRestoreReserve is set in the page. - * 2) No reservation was in place for the page, so HPageRestoreReserve is + * 1) A reservation was in place and the folio consumed the reservation. + * hugetlb_restore_reserve is set in the folio. + * 2) No reservation was in place for the page, so hugetlb_restore_reserve is * not set. However, alloc_hugetlb_folio always updates the reserve map. * * In case 1, free_huge_page later in the error path will increment the @@ -2817,9 +2817,8 @@ static long vma_del_reservation(struct hstate *h, * In case 2, simply undo reserve map modifications done by alloc_hugetlb_folio. */ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, - unsigned long address, struct page *page) + unsigned long address, struct folio *folio) { - struct folio *folio = page_folio(page); long rc = vma_needs_reservation(h, vma, address); if (folio_test_hugetlb_restore_reserve(folio)) { @@ -5102,7 +5101,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, entry = huge_ptep_get(src_pte); if (!pte_same(src_pte_old, entry)) { restore_reserve_on_error(h, dst_vma, addr, - &new_folio->page); + new_folio); folio_put(new_folio); /* huge_ptep of dst_pte won't change as in child */ goto again; @@ -5634,7 +5633,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, * unshare) */ if (new_folio != page_folio(old_page)) - restore_reserve_on_error(h, vma, haddr, &new_folio->page); + restore_reserve_on_error(h, vma, haddr, new_folio); folio_put(new_folio); out_release_old: put_page(old_page); @@ -5846,7 +5845,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * to the page cache. So it's safe to call * restore_reserve_on_error() here. */ - restore_reserve_on_error(h, vma, haddr, &folio->page); + restore_reserve_on_error(h, vma, haddr, folio); folio_put(folio); goto out; } @@ -5947,7 +5946,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, spin_unlock(ptl); backout_unlocked: if (new_folio && !new_pagecache_folio) - restore_reserve_on_error(h, vma, haddr, &folio->page); + restore_reserve_on_error(h, vma, haddr, folio); folio_unlock(folio); folio_put(folio); @@ -6210,7 +6209,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, /* Free the allocated folio which may have * consumed a reservation. */ - restore_reserve_on_error(h, dst_vma, dst_addr, &folio->page); + restore_reserve_on_error(h, dst_vma, dst_addr, folio); folio_put(folio); /* Allocate a temporary folio to hold the copied @@ -6339,7 +6338,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, folio_unlock(folio); out_release_nounlock: if (!folio_in_pagecache) - restore_reserve_on_error(h, dst_vma, dst_addr, &folio->page); + restore_reserve_on_error(h, dst_vma, dst_addr, folio); folio_put(folio); goto out; } From fd53bbe4fb3443aab0d815d5cac426d9cc323244 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 25 Jan 2023 09:05:35 -0800 Subject: [PATCH 014/143] mm/hugetlb: convert hugetlb_add_to_page_cache to take in a folio Every caller of hugetlb_add_to_page_cache() is now passing in &folio->page, change the function to take in a folio directly and clean up the call sites. Link: https://lkml.kernel.org/r/20230125170537.96973-7-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Cc: Gerald Schaefer Cc: John Hubbard Cc: kernel test robot Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 2 +- include/linux/hugetlb.h | 2 +- mm/hugetlb.c | 9 ++++----- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index e2b8a00696bda..43af1753de5f2 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -871,7 +871,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, } clear_huge_page(&folio->page, addr, pages_per_huge_page(h)); __folio_mark_uptodate(folio); - error = hugetlb_add_to_page_cache(&folio->page, mapping, index); + error = hugetlb_add_to_page_cache(folio, mapping, index); if (unlikely(error)) { restore_reserve_on_error(h, &pseudo_vma, addr, folio); folio_put(folio); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 20ceaaea1697a..df6dd624ccfe8 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -723,7 +723,7 @@ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask); struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address); -int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping, +int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping, pgoff_t idx); void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, unsigned long address, struct folio *folio); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index aad028706fbc8..ab30f85506311 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5662,10 +5662,9 @@ static bool hugetlbfs_pagecache_present(struct hstate *h, return present; } -int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping, +int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping, pgoff_t idx) { - struct folio *folio = page_folio(page); struct inode *inode = mapping->host; struct hstate *h = hstate_inode(inode); int err; @@ -5677,7 +5676,7 @@ int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping, __folio_clear_locked(folio); return err; } - ClearHPageRestoreReserve(page); + folio_clear_hugetlb_restore_reserve(folio); /* * mark folio dirty so that it will not be removed from cache/file @@ -5836,7 +5835,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, new_folio = true; if (vma->vm_flags & VM_MAYSHARE) { - int err = hugetlb_add_to_page_cache(&folio->page, mapping, idx); + int err = hugetlb_add_to_page_cache(folio, mapping, idx); if (err) { /* * err can't be -EEXIST which implies someone @@ -6269,7 +6268,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, * hugetlb_fault_mutex_table that here must be hold by * the caller. */ - ret = hugetlb_add_to_page_cache(&folio->page, mapping, idx); + ret = hugetlb_add_to_page_cache(folio, mapping, idx); if (ret) goto out_release_nounlock; folio_in_pagecache = true; From 9a1f0841dc2d672df72a218948bb426c34669433 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 25 Jan 2023 09:05:36 -0800 Subject: [PATCH 015/143] mm/hugetlb: convert hugetlb_wp() to take in a folio Change the pagecache_page argument of hugetlb_wp to pagecache_folio. Replaces a call to find_lock_page() with filemap_lock_folio(). Link: https://lkml.kernel.org/r/20230125170537.96973-8-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reported-by: gerald.schaefer@linux.ibm.com Cc: John Hubbard Cc: kernel test robot Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ab30f85506311..b6cbba105ffc0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5472,7 +5472,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, */ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, unsigned int flags, - struct page *pagecache_page, spinlock_t *ptl) + struct folio *pagecache_folio, spinlock_t *ptl) { const bool unshare = flags & FAULT_FLAG_UNSHARE; pte_t pte; @@ -5529,7 +5529,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, * of the full address range. */ if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && - old_page != pagecache_page) + page_folio(old_page) != pagecache_folio) outside_reserve = 1; get_page(old_page); @@ -5922,7 +5922,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, hugetlb_count_add(pages_per_huge_page(h), mm); if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ - ret = hugetlb_wp(mm, vma, address, ptep, flags, &folio->page, ptl); + ret = hugetlb_wp(mm, vma, address, ptep, flags, folio, ptl); } spin_unlock(ptl); @@ -5985,7 +5985,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, u32 hash; pgoff_t idx; struct page *page = NULL; - struct page *pagecache_page = NULL; + struct folio *pagecache_folio = NULL; struct hstate *h = hstate_vma(vma); struct address_space *mapping; int need_wait_lock = 0; @@ -6067,7 +6067,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* Just decrements count, does not deallocate */ vma_end_reservation(h, vma, haddr); - pagecache_page = find_lock_page(mapping, idx); + pagecache_folio = filemap_lock_folio(mapping, idx); } ptl = huge_pte_lock(h, mm, ptep); @@ -6087,9 +6087,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, }; spin_unlock(ptl); - if (pagecache_page) { - unlock_page(pagecache_page); - put_page(pagecache_page); + if (pagecache_folio) { + folio_unlock(pagecache_folio); + folio_put(pagecache_folio); } hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); @@ -6098,11 +6098,11 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* * hugetlb_wp() requires page locks of pte_page(entry) and - * pagecache_page, so here we need take the former one - * when page != pagecache_page or !pagecache_page. + * pagecache_folio, so here we need take the former one + * when page != pagecache_folio or !pagecache_folio. */ page = pte_page(entry); - if (page != pagecache_page) + if (page_folio(page) != pagecache_folio) if (!trylock_page(page)) { need_wait_lock = 1; goto out_ptl; @@ -6113,7 +6113,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) { if (!huge_pte_write(entry)) { ret = hugetlb_wp(mm, vma, address, ptep, flags, - pagecache_page, ptl); + pagecache_folio, ptl); goto out_put_page; } else if (likely(flags & FAULT_FLAG_WRITE)) { entry = huge_pte_mkdirty(entry); @@ -6124,15 +6124,15 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, flags & FAULT_FLAG_WRITE)) update_mmu_cache(vma, haddr, ptep); out_put_page: - if (page != pagecache_page) + if (page_folio(page) != pagecache_folio) unlock_page(page); put_page(page); out_ptl: spin_unlock(ptl); - if (pagecache_page) { - unlock_page(pagecache_page); - put_page(pagecache_page); + if (pagecache_folio) { + folio_unlock(pagecache_folio); + folio_put(pagecache_folio); } out_mutex: hugetlb_vma_unlock_read(vma); From ff09a0042b0ecfe6393e708715279791b36dbe29 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 25 Jan 2023 09:05:37 -0800 Subject: [PATCH 016/143] Documentation/mm: update hugetlbfs documentation to mention alloc_hugetlb_folio Link: https://lkml.kernel.org/r/20230125170537.96973-9-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Cc: Gerald Schaefer Cc: John Hubbard Cc: kernel test robot Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- Documentation/mm/hugetlbfs_reserv.rst | 21 ++++++++++--------- .../zh_CN/mm/hugetlbfs_reserv.rst | 14 ++++++------- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/Documentation/mm/hugetlbfs_reserv.rst b/Documentation/mm/hugetlbfs_reserv.rst index f143954e0d056..611728c49bff1 100644 --- a/Documentation/mm/hugetlbfs_reserv.rst +++ b/Documentation/mm/hugetlbfs_reserv.rst @@ -181,14 +181,14 @@ Consuming Reservations/Allocating a Huge Page Reservations are consumed when huge pages associated with the reservations are allocated and instantiated in the corresponding mapping. The allocation -is performed within the routine alloc_huge_page():: +is performed within the routine alloc_hugetlb_folio():: - struct page *alloc_huge_page(struct vm_area_struct *vma, + struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) -alloc_huge_page is passed a VMA pointer and a virtual address, so it can +alloc_hugetlb_folio is passed a VMA pointer and a virtual address, so it can consult the reservation map to determine if a reservation exists. In addition, -alloc_huge_page takes the argument avoid_reserve which indicates reserves +alloc_hugetlb_folio takes the argument avoid_reserve which indicates reserves should not be used even if it appears they have been set aside for the specified address. The avoid_reserve argument is most often used in the case of Copy on Write and Page Migration where additional copies of an existing @@ -208,7 +208,8 @@ a reservation for the allocation. After determining whether a reservation exists and can be used for the allocation, the routine dequeue_huge_page_vma() is called. This routine takes two arguments related to reservations: -- avoid_reserve, this is the same value/argument passed to alloc_huge_page() +- avoid_reserve, this is the same value/argument passed to + alloc_hugetlb_folio(). - chg, even though this argument is of type long only the values 0 or 1 are passed to dequeue_huge_page_vma. If the value is 0, it indicates a reservation exists (see the section "Memory Policy and Reservations" for @@ -233,9 +234,9 @@ the scope reservations. Even if a surplus page is allocated, the same reservation based adjustments as above will be made: SetPagePrivate(page) and resv_huge_pages--. -After obtaining a new huge page, (page)->private is set to the value of -the subpool associated with the page if it exists. This will be used for -subpool accounting when the page is freed. +After obtaining a new hugetlb folio, (folio)->_hugetlb_subpool is set to the +value of the subpool associated with the page if it exists. This will be used +for subpool accounting when the folio is freed. The routine vma_commit_reservation() is then called to adjust the reserve map based on the consumption of the reservation. In general, this involves @@ -246,8 +247,8 @@ was no reservation in a shared mapping or this was a private mapping a new entry must be created. It is possible that the reserve map could have been changed between the call -to vma_needs_reservation() at the beginning of alloc_huge_page() and the -call to vma_commit_reservation() after the page was allocated. This would +to vma_needs_reservation() at the beginning of alloc_hugetlb_folio() and the +call to vma_commit_reservation() after the folio was allocated. This would be possible if hugetlb_reserve_pages was called for the same page in a shared mapping. In such cases, the reservation count and subpool free page count will be off by one. This rare condition can be identified by comparing the diff --git a/Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst b/Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst index 752e5696cd476..826a50c47389e 100644 --- a/Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst +++ b/Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst @@ -142,14 +142,14 @@ HPAGE_RESV_OWNER标志被设置,以表明该VMA拥有预留。 消耗预留/分配一个巨页 =========================== -当与预留相关的巨页在相应的映射中被分配和实例化时,预留就被消耗了。该分配是在函数alloc_huge_page() +当与预留相关的巨页在相应的映射中被分配和实例化时,预留就被消耗了。该分配是在函数alloc_hugetlb_folio() 中进行的:: - struct page *alloc_huge_page(struct vm_area_struct *vma, + struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) -alloc_huge_page被传递给一个VMA指针和一个虚拟地址,因此它可以查阅预留映射以确定是否存在预留。 -此外,alloc_huge_page需要一个参数avoid_reserve,该参数表示即使看起来已经为指定的地址预留了 +alloc_hugetlb_folio被传递给一个VMA指针和一个虚拟地址,因此它可以查阅预留映射以确定是否存在预留。 +此外,alloc_hugetlb_folio需要一个参数avoid_reserve,该参数表示即使看起来已经为指定的地址预留了 预留,也不应该使用预留。avoid_reserve参数最常被用于写时拷贝和页面迁移的情况下,即现有页面的额 外拷贝被分配。 @@ -162,7 +162,7 @@ vma_needs_reservation()返回的值通常为0或1。如果该地址存在预留 确定预留是否存在并可用于分配后,调用dequeue_huge_page_vma()函数。这个函数需要两个与预留有关 的参数: -- avoid_reserve,这是传递给alloc_huge_page()的同一个值/参数。 +- avoid_reserve,这是传递给alloc_hugetlb_folio()的同一个值/参数。 - chg,尽管这个参数的类型是long,但只有0或1的值被传递给dequeue_huge_page_vma。如果该值为0, 则表明存在预留(关于可能的问题,请参见 “预留和内存策略” 一节)。如果值 为1,则表示不存在预留,如果可能的话,必须从全局空闲池中取出该页。 @@ -179,7 +179,7 @@ free_huge_pages的值被递减。如果有一个与该页相关的预留,将 的剩余巨页和超额分配的问题。即使分配了一个多余的页面,也会进行与上面一样的基于预留的调整: SetPagePrivate(page) 和 resv_huge_pages--. -在获得一个新的巨页后,(page)->private被设置为与该页面相关的子池的值,如果它存在的话。当页 +在获得一个新的巨页后,(folio)->_hugetlb_subpool被设置为与该页面相关的子池的值,如果它存在的话。当页 面被释放时,这将被用于子池的计数。 然后调用函数vma_commit_reservation(),根据预留的消耗情况调整预留映射。一般来说,这涉及 @@ -199,7 +199,7 @@ SetPagePrivate(page)和resv_huge_pages-。 已经存在,所以不做任何改变。然而,如果共享映射中没有预留,或者这是一个私有映射,则必须创建 一个新的条目。 -在alloc_huge_page()开始调用vma_needs_reservation()和页面分配后调用 +在alloc_hugetlb_folio()开始调用vma_needs_reservation()和页面分配后调用 vma_commit_reservation()之间,预留映射有可能被改变。如果hugetlb_reserve_pages在共 享映射中为同一页面被调用,这将是可能的。在这种情况下,预留计数和子池空闲页计数会有一个偏差。 这种罕见的情况可以通过比较vma_needs_reservation和vma_commit_reservation的返回值来 From d202979e26d6f2d2cb2b3bc5fbeb211c67134a49 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 3 Feb 2023 16:28:40 -0500 Subject: [PATCH 017/143] mm: fix memcpy_from_file_folio() integer underflow If we have a HIGHMEM system with a large folio, 'offset' may be larger than PAGE_SIZE, and so min_t will cap at 'len' instead of the intended end-of-page. That can overflow into the next page which is likely to be unmapped and fault, but could theoretically copy the wrong data. Link: https://lkml.kernel.org/r/Y919vmSrtAgsf6K3@casper.infradead.org Fixes: 00cdf76012ab ("mm: add memcpy_from_file_folio()") Signed-off-by: Matthew Wilcox (Oracle) Cc: "Fabio M. De Francesco" Cc: Ira Weiny Signed-off-by: Andrew Morton --- include/linux/highmem.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 348701dae77fd..b06254e76d99a 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -431,9 +431,10 @@ static inline size_t memcpy_from_file_folio(char *to, struct folio *folio, size_t offset = offset_in_folio(folio, pos); char *from = kmap_local_folio(folio, offset); - if (folio_test_highmem(folio)) + if (folio_test_highmem(folio)) { + offset = offset_in_page(offset); len = min_t(size_t, len, PAGE_SIZE - offset); - else + } else len = min(len, folio_size(folio) - offset); memcpy(to, from, len); From 149071d8352766a7750f446a937559501ed9897e Mon Sep 17 00:00:00 2001 From: Jiaqi Yan Date: Mon, 5 Dec 2022 15:40:58 -0800 Subject: [PATCH 018/143] mm/khugepaged: recover from poisoned anonymous memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Memory poison recovery in khugepaged collapsing", v9. Problem ======= Memory DIMMs are subject to multi-bit flips, i.e. memory errors. As memory size and density increase, the chances of and number of memory errors increase. The increasing size and density of server RAM in the data center and cloud have shown increased uncorrectable memory errors. There are already mechanisms in the kernel to recover from uncorrectable memory errors. This series of patches provides the recovery mechanism for the particular kernel agent khugepaged when it collapses memory pages. Impact ====== The main reason we chose to make khugepaged collapsing tolerant of memory failures was its high possibility of accessing poisoned memory while performing functionally optional compaction actions. Standard applications typically don't have strict requirements on the size of their pages. So they are given 4K pages by the kernel. The kernel is able to improve application performance by either 1) giving applications 2M pages to begin with, or 2) collapsing 4K pages into 2M pages when possible. This collapsing operation is done by khugepaged, a kernel agent that is constantly scanning memory. When collapsing 4K pages into a 2M page, it must copy the data from the 4K pages into a physically contiguous 2M page. Therefore, as long as there exists one poisoned cache line in collapsible 4K pages, khugepaged will eventually access it. The current impact to users is a machine check exception triggered kernel panic. However, khugepaged’s compaction operations are not functionally required kernel actions. Therefore making khugepaged tolerant to poisoned memory will greatly improve user experience. This patch series is for cases where khugepaged is the first guy that detects the memory errors on the poisoned pages. IOW, the pages are not known to have memory errors when khugepaged collapsing gets to them. In our observation, this happens frequently when the huge page ratio of the system is relatively low, which is fairly common in virtual machines running on cloud. Solution ======== As stated before, it is less desirable to crash the system only because khugepaged accesses poisoned pages while it is collapsing 4K pages. The high level idea of this patch series is to skip the group of pages (usually 512 4K-size pages) once khugepaged finds one of them is poisoned, as these pages have become ineligible to be collapsed. We are also careful to unwind operations khuagepaged has performed before it detects memory failures. For example, before copying and collapsing a group of anonymous pages into a huge page, the source pages will be isolated and their page table is unlinked from their PMD. These operations need to be undone in order to ensure these pages are not changed/lost from the perspective of other threads (both user and kernel space). As for file backed memory pages, there already exists a rollback case. This patch just extends it so that khugepaged also correctly rolls back when it fails to copy poisoned 4K pages. This patch (of 2): Make __collapse_huge_page_copy return whether copying anonymous pages succeeded, and make collapse_huge_page handle the return status. Break existing PTE scan loop into two for-loops. The first loop copies source pages into target huge page, and can fail gracefully when running into memory errors in source pages. If copying all pages succeeds, the second loop releases and clears up these normal pages. Otherwise, the second loop rolls back the page table and page states by: - re-establishing the original PTEs-to-PMD connection. - releasing source pages back to their LRU list. Tested manually: 0. Enable khugepaged on system under test. 1. Start a two-thread application. Each thread allocates a chunk of non-huge anonymous memory buffer. 2. Pick 4 random buffer locations (2 in each thread) and inject uncorrectable memory errors at corresponding physical addresses. 3. Signal both threads to make their memory buffer collapsible, i.e. calling madvise(MADV_HUGEPAGE). 4. Wait and check kernel log: khugepaged is able to recover from poisoned pages and skips collapsing them. 5. Signal both threads to inspect their buffer contents and make sure no data corruption. Link: https://lkml.kernel.org/r/20221205234059.42971-1-jiaqiyan@google.com Link: https://lkml.kernel.org/r/20221205234059.42971-2-jiaqiyan@google.com Signed-off-by: Jiaqi Yan Cc: Kefeng Wang Cc: Kirill A. Shutemov Cc: Miaohe Lin Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Tony Luck Cc: Yang Shi Cc: Signed-off-by: Andrew Morton --- include/trace/events/huge_memory.h | 3 +- mm/khugepaged.c | 179 ++++++++++++++++++++++------- 2 files changed, 139 insertions(+), 43 deletions(-) diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index 3e6fb05852f9a..46cce509957ba 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -36,7 +36,8 @@ EM( SCAN_ALLOC_HUGE_PAGE_FAIL, "alloc_huge_page_failed") \ EM( SCAN_CGROUP_CHARGE_FAIL, "ccgroup_charge_failed") \ EM( SCAN_TRUNCATED, "truncated") \ - EMe(SCAN_PAGE_HAS_PRIVATE, "page_has_private") \ + EM( SCAN_PAGE_HAS_PRIVATE, "page_has_private") \ + EMe(SCAN_COPY_MC, "copy_poisoned_page") \ #undef EM #undef EMe diff --git a/mm/khugepaged.c b/mm/khugepaged.c index b39ab219d5b75..1641938dc34c0 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -55,6 +56,7 @@ enum scan_result { SCAN_CGROUP_CHARGE_FAIL, SCAN_TRUNCATED, SCAN_PAGE_HAS_PRIVATE, + SCAN_COPY_MC, }; #define CREATE_TRACE_POINTS @@ -535,6 +537,27 @@ static bool is_refcount_suitable(struct page *page) return page_count(page) == expected_refcount; } +/* + * Copies memory with #MC in source page (@from) handled. Returns number + * of bytes not copied if there was an exception; otherwise 0 for success. + * Note handling #MC requires arch opt-in. + */ +static int copy_mc_page(struct page *to, struct page *from) +{ + char *vfrom, *vto; + unsigned long ret; + + vfrom = kmap_local_page(from); + vto = kmap_local_page(to); + ret = copy_mc_to_kernel(vto, vfrom, PAGE_SIZE); + if (ret == 0) + kmsan_copy_page_meta(to, from); + kunmap_local(vto); + kunmap_local(vfrom); + + return ret; +} + static int __collapse_huge_page_isolate(struct vm_area_struct *vma, unsigned long address, pte_t *pte, @@ -675,56 +698,124 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, return result; } -static void __collapse_huge_page_copy(pte_t *pte, struct page *page, - struct vm_area_struct *vma, - unsigned long address, - spinlock_t *ptl, - struct list_head *compound_pagelist) +/* + * __collapse_huge_page_copy - attempts to copy memory contents from normal + * pages to a hugepage. Cleans up the normal pages if copying succeeds; + * otherwise restores the original page table and releases isolated normal pages. + * Returns SCAN_SUCCEED if copying succeeds, otherwise returns SCAN_COPY_MC. + * + * @pte: starting of the PTEs to copy from + * @page: the new hugepage to copy contents to + * @pmd: pointer to the new hugepage's PMD + * @rollback: the original normal pages' PMD + * @vma: the original normal pages' virtual memory area + * @address: starting address to copy + * @pte_ptl: lock on normal pages' PTEs + * @compound_pagelist: list that stores compound pages + */ +static int __collapse_huge_page_copy(pte_t *pte, + struct page *page, + pmd_t *pmd, + pmd_t rollback, + struct vm_area_struct *vma, + unsigned long address, + spinlock_t *pte_ptl, + struct list_head *compound_pagelist) { struct page *src_page, *tmp; pte_t *_pte; - for (_pte = pte; _pte < pte + HPAGE_PMD_NR; - _pte++, page++, address += PAGE_SIZE) { - pte_t pteval = *_pte; + pte_t pteval; + unsigned long _address; + spinlock_t *pmd_ptl; + int result = SCAN_SUCCEED; - if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { - clear_user_highpage(page, address); - add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); - if (is_zero_pfn(pte_pfn(pteval))) { + /* + * Copying pages' contents is subject to memory poison at any iteration. + */ + for (_pte = pte, _address = address; _pte < pte + HPAGE_PMD_NR; + _pte++, page++, _address += PAGE_SIZE) { + pteval = *_pte; + + if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) + clear_user_highpage(page, _address); + else { + src_page = pte_page(pteval); + if (copy_mc_page(page, src_page) > 0) { + result = SCAN_COPY_MC; + break; + } + } + } + + if (likely(result == SCAN_SUCCEED)) { + for (_pte = pte, _address = address; _pte < pte + HPAGE_PMD_NR; + _pte++, _address += PAGE_SIZE) { + pteval = *_pte; + if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { + add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); + if (is_zero_pfn(pte_pfn(pteval))) { + /* + * pte_ptl mostly unnecessary. + */ + spin_lock(pte_ptl); + pte_clear(vma->vm_mm, _address, _pte); + spin_unlock(pte_ptl); + } + } else { + src_page = pte_page(pteval); + if (!PageCompound(src_page)) + release_pte_page(src_page); /* - * ptl mostly unnecessary. + * pte_ptl mostly unnecessary, but preempt has + * to be disabled to update the per-cpu stats + * inside page_remove_rmap(). */ - spin_lock(ptl); - ptep_clear(vma->vm_mm, address, _pte); - spin_unlock(ptl); + spin_lock(pte_ptl); + ptep_clear(vma->vm_mm, _address, _pte); + page_remove_rmap(src_page, vma, false); + spin_unlock(pte_ptl); + free_page_and_swap_cache(src_page); + } + } + list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) { + list_del(&src_page->lru); + mod_node_page_state(page_pgdat(src_page), + NR_ISOLATED_ANON + page_is_file_lru(src_page), + -compound_nr(src_page)); + unlock_page(src_page); + free_swap_cache(src_page); + putback_lru_page(src_page); + } + } else { + /* + * Re-establish the regular PMD that points to the regular + * page table. Restoring PMD needs to be done prior to + * releasing pages. Since pages are still isolated and + * locked here, acquiring anon_vma_lock_write is unnecessary. + */ + pmd_ptl = pmd_lock(vma->vm_mm, pmd); + pmd_populate(vma->vm_mm, pmd, pmd_pgtable(rollback)); + spin_unlock(pmd_ptl); + /* + * Release both raw and compound pages isolated + * in __collapse_huge_page_isolate. + */ + for (_pte = pte, _address = address; _pte < pte + HPAGE_PMD_NR; + _pte++, _address += PAGE_SIZE) { + pteval = *_pte; + if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval))) { + src_page = pte_page(pteval); + if (!PageCompound(src_page)) + release_pte_page(src_page); } - } else { - src_page = pte_page(pteval); - copy_user_highpage(page, src_page, address, vma); - if (!PageCompound(src_page)) - release_pte_page(src_page); - /* - * ptl mostly unnecessary, but preempt has to - * be disabled to update the per-cpu stats - * inside page_remove_rmap(). - */ - spin_lock(ptl); - ptep_clear(vma->vm_mm, address, _pte); - page_remove_rmap(src_page, vma, false); - spin_unlock(ptl); - free_page_and_swap_cache(src_page); + } + list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) { + list_del(&src_page->lru); + release_pte_page(src_page); } } - list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) { - list_del(&src_page->lru); - mod_node_page_state(page_pgdat(src_page), - NR_ISOLATED_ANON + page_is_file_lru(src_page), - -compound_nr(src_page)); - unlock_page(src_page); - free_swap_cache(src_page); - putback_lru_page(src_page); - } + return result; } static void khugepaged_alloc_sleep(void) @@ -1092,9 +1183,13 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, */ anon_vma_unlock_write(vma->anon_vma); - __collapse_huge_page_copy(pte, hpage, vma, address, pte_ptl, - &compound_pagelist); + result = __collapse_huge_page_copy(pte, hpage, pmd, _pmd, + vma, address, pte_ptl, + &compound_pagelist); pte_unmap(pte); + if (unlikely(result != SCAN_SUCCEED)) + goto out_up_write; + /* * spin_lock() below is not the equivalent of smp_wmb(), but * the smp_wmb() inside __SetPageUptodate() can be reused to From c3af2e105e8327219ecac1762ad684b916c62066 Mon Sep 17 00:00:00 2001 From: Jiaqi Yan Date: Mon, 5 Dec 2022 15:40:59 -0800 Subject: [PATCH 019/143] mm/khugepaged: recover from poisoned file-backed memory Make collapse_file roll back when copying pages failed. More concretely: - extract copying operations into a separate loop - postpone the updates for nr_none until both scanning and copying succeeded - postpone joining small xarray entries until both scanning and copying succeeded - postpone the update operations to NR_XXX_THPS until both scanning and copying succeeded - for non-SHMEM file, roll back filemap_nr_thps_inc if scan succeeded but copying failed Tested manually: 0. Enable khugepaged on system under test. Mount tmpfs at /mnt/ramdisk. 1. Start a two-thread application. Each thread allocates a chunk of non-huge memory buffer from /mnt/ramdisk. 2. Pick 4 random buffer address (2 in each thread) and inject uncorrectable memory errors at physical addresses. 3. Signal both threads to make their memory buffer collapsible, i.e. calling madvise(MADV_HUGEPAGE). 4. Wait and then check kernel log: khugepaged is able to recover from poisoned pages by skipping them. 5. Signal both threads to inspect their buffer contents and make sure no data corruption. Link: https://lkml.kernel.org/r/20221205234059.42971-3-jiaqiyan@google.com Signed-off-by: Jiaqi Yan Acked-by: Kirill A. Shutemov Cc: Kefeng Wang Cc: Miaohe Lin Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Cc: Tony Luck Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/khugepaged.c | 78 ++++++++++++++++++++++++++++++------------------- 1 file changed, 48 insertions(+), 30 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 1641938dc34c0..eb38bd1b1b2f0 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1862,6 +1862,9 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, { struct address_space *mapping = file->f_mapping; struct page *hpage; + struct page *page; + struct page *tmp; + struct folio *folio; pgoff_t index = 0, end = start + HPAGE_PMD_NR; LIST_HEAD(pagelist); XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); @@ -1906,8 +1909,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, xas_set(&xas, start); for (index = start; index < end; index++) { - struct page *page = xas_next(&xas); - struct folio *folio; + page = xas_next(&xas); VM_BUG_ON(index != xas.xa_index); if (is_shmem) { @@ -2089,10 +2091,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, } nr = thp_nr_pages(hpage); - if (is_shmem) - __mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr); - else { - __mod_lruvec_page_state(hpage, NR_FILE_THPS, nr); + if (!is_shmem) { filemap_nr_thps_inc(mapping); /* * Paired with smp_mb() in do_dentry_open() to ensure @@ -2103,21 +2102,10 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, smp_mb(); if (inode_is_open_for_write(mapping->host)) { result = SCAN_FAIL; - __mod_lruvec_page_state(hpage, NR_FILE_THPS, -nr); filemap_nr_thps_dec(mapping); goto xa_locked; } } - - if (nr_none) { - __mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none); - /* nr_none is always 0 for non-shmem. */ - __mod_lruvec_page_state(hpage, NR_SHMEM, nr_none); - } - - /* Join all the small entries into a single multi-index entry */ - xas_set_order(&xas, start, HPAGE_PMD_ORDER); - xas_store(&xas, hpage); xa_locked: xas_unlock_irq(&xas); xa_unlocked: @@ -2130,21 +2118,35 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, try_to_unmap_flush(); if (result == SCAN_SUCCEED) { - struct page *page, *tmp; - struct folio *folio; - /* * Replacing old pages with new one has succeeded, now we - * need to copy the content and free the old pages. + * attempt to copy the contents. */ index = start; - list_for_each_entry_safe(page, tmp, &pagelist, lru) { + list_for_each_entry(page, &pagelist, lru) { while (index < page->index) { clear_highpage(hpage + (index % HPAGE_PMD_NR)); index++; } - copy_highpage(hpage + (page->index % HPAGE_PMD_NR), - page); + if (copy_mc_page(hpage + (page->index % HPAGE_PMD_NR), + page) > 0) { + result = SCAN_COPY_MC; + break; + } + index++; + } + while (result == SCAN_SUCCEED && index < end) { + clear_highpage(hpage + (index % HPAGE_PMD_NR)); + index++; + } + } + + if (result == SCAN_SUCCEED) { + /* + * Copying old pages to huge one has succeeded, now we + * need to free the old pages. + */ + list_for_each_entry_safe(page, tmp, &pagelist, lru) { list_del(&page->lru); page->mapping = NULL; page_ref_unfreeze(page, 1); @@ -2152,12 +2154,23 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, ClearPageUnevictable(page); unlock_page(page); put_page(page); - index++; } - while (index < end) { - clear_highpage(hpage + (index % HPAGE_PMD_NR)); - index++; + + xas_lock_irq(&xas); + if (is_shmem) + __mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr); + else + __mod_lruvec_page_state(hpage, NR_FILE_THPS, nr); + + if (nr_none) { + __mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none); + /* nr_none is always 0 for non-shmem. */ + __mod_lruvec_page_state(hpage, NR_SHMEM, nr_none); } + /* Join all the small entries into a single multi-index entry. */ + xas_set_order(&xas, start, HPAGE_PMD_ORDER); + xas_store(&xas, hpage); + xas_unlock_irq(&xas); folio = page_folio(hpage); folio_mark_uptodate(folio); @@ -2175,8 +2188,6 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, unlock_page(hpage); hpage = NULL; } else { - struct page *page; - /* Something went wrong: roll back page cache changes */ xas_lock_irq(&xas); if (nr_none) { @@ -2210,6 +2221,13 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, xas_lock_irq(&xas); } VM_BUG_ON(nr_none); + /* + * Undo the updates of filemap_nr_thps_inc for non-SHMEM file only. + * This undo is not needed unless failure is due to SCAN_COPY_MC. + */ + if (!is_shmem && result == SCAN_COPY_MC) + filemap_nr_thps_dec(mapping); + xas_unlock_irq(&xas); hpage->mapping = NULL; From dbb72d694406b705e7a4840c85f41d70d52b7e8e Mon Sep 17 00:00:00 2001 From: xu xin Date: Fri, 30 Dec 2022 09:12:44 +0800 Subject: [PATCH 020/143] ksm: abstract the function try_to_get_old_rmap_item Patch series "ksm: support tracking KSM-placed zero-pages", v5. The core idea of this patch set is to enable users to perceive the number of any pages merged by KSM, regardless of whether use_zero_page switch has been turned on, so that users can know how much free memory increase is really due to their madvise(MERGEABLE) actions. But the problem is, when enabling use_zero_pages, all empty pages will be merged with kernel zero pages instead of with each other as use_zero_pages is disabled, and then these zero-pages are no longer monitored by KSM. The motivations for me to do this contains three points: 1) MADV_UNMERGEABLE and other ways to trigger unsharing will *not* unshare the shared zeropage as placed by KSM (which is against the MADV_UNMERGEABLE documentation at least); see the link: https://lore.kernel.org/lkml/4a3daba6-18f9-d252-697c-197f65578c44@redhat.com/ 2) We cannot know how many pages are zero pages placed by KSM when enabling use_zero_pages, which hides the critical information about how much actual memory are really saved by KSM. Knowing how many ksm-placed zero pages are helpful for user to use the policy of madvise (MERGEABLE) better because they can see the actual profit brought by KSM. 3) The zero pages placed-by KSM are different from those initial empty page (filled with zeros) which are never touched by applications. The former is active-merged by KSM while the later have never consume actual memory. use_zero_pages is useful, not only because of cache colouring as described in doc, but also because use_zero_pages can accelerate merging empty pages when there are plenty of empty pages (full of zeros) as the time of page-by-page comparisons (unstable_tree_search_insert) is saved. So we hope to implement the support for ksm zero page tracking without affecting the feature of use_zero_pages. Zero pages may be the most common merged pages in actual environment(not only VM but also including other application like containers). Enabling use_zero_pages in the environment with plenty of empty pages(full of zeros) will be very useful. Users and app developer can also benefit from knowing the proportion of zero pages in all merged pages to optimize applications. With the patch series, we can both unshare zero-pages(KSM-placed) accurately and count ksm zero pages with enabling use_zero_pages. This patch (of 6): A new function try_to_get_old_rmap_item is abstracted from get_next_rmap_item. This function will be reused by the subsequent patches about counting ksm_zero_pages. The patch improves the readability and reusability of KSM code. Link: https://lkml.kernel.org/r/202212300911327101708@zte.com.cn Link: https://lkml.kernel.org/r/202212300912449061763@zte.com.cn Signed-off-by: xu xin Reviewed-by: Xiaokai Ran Reviewed-by: Yang Yang Cc: David Hildenbrand Cc: Claudio Imbrenda Cc: Xuexin Jiang Signed-off-by: Andrew Morton --- mm/ksm.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index ad591b779d534..79ffadd18f8af 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2213,23 +2213,36 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite } } -static struct ksm_rmap_item *get_next_rmap_item(struct ksm_mm_slot *mm_slot, - struct ksm_rmap_item **rmap_list, - unsigned long addr) +static struct ksm_rmap_item *try_to_get_old_rmap_item(unsigned long addr, + struct ksm_rmap_item **rmap_list) { - struct ksm_rmap_item *rmap_item; - while (*rmap_list) { - rmap_item = *rmap_list; + struct ksm_rmap_item *rmap_item = *rmap_list; if ((rmap_item->address & PAGE_MASK) == addr) return rmap_item; if (rmap_item->address > addr) break; *rmap_list = rmap_item->rmap_list; + /* Running here indicates it's vma has been UNMERGEABLE */ remove_rmap_item_from_tree(rmap_item); free_rmap_item(rmap_item); } + return NULL; +} + +static struct ksm_rmap_item *get_next_rmap_item(struct ksm_mm_slot *mm_slot, + struct ksm_rmap_item **rmap_list, + unsigned long addr) +{ + struct ksm_rmap_item *rmap_item; + + /* lookup if we have a old rmap_item matching the addr*/ + rmap_item = try_to_get_old_rmap_item(addr, rmap_list); + if (rmap_item) + return rmap_item; + + /* Need to allocate a new rmap_item */ rmap_item = alloc_rmap_item(); if (rmap_item) { /* It has already been zeroed */ From 10bce5394c468e46ddc195f9fc98ac1c52b681e7 Mon Sep 17 00:00:00 2001 From: xu xin Date: Fri, 30 Dec 2022 09:13:57 +0800 Subject: [PATCH 021/143] ksm: support unsharing zero pages placed by KSM use_zero_pages may be very useful, not just because of cache colouring as described in doc, but also because use_zero_pages can accelerate merging empty pages when there are plenty of empty pages (full of zeros) as the time of page-by-page comparisons (unstable_tree_search_insert) is saved. But when enabling use_zero_pages, madvise(addr, len, MADV_UNMERGEABLE) and other ways (like write 2 to /sys/kernel/mm/ksm/run) to trigger unsharing will *not* actually unshare the shared zeropage as placed by KSM (which is against the MADV_UNMERGEABLE documentation). As these KSM-placed zero pages are out of the control of KSM, the related counts of ksm pages don't expose how many zero pages are placed by KSM (these special zero pages are different from those initially mapped zero pages, because the zero pages mapped to MADV_UNMERGEABLE areas are expected to be a complete and unshared page) To not blindly unshare all shared zero_pages in applicable VMAs, the patch introduces a dedicated flag ZERO_PAGE_FLAG to mark the rmap_items of those shared zero_pages. and guarantee that these rmap_items will be not freed during the time of zero_pages not being writing, so we can only unshare the *KSM-placed* zero_pages. The patch will not degrade the performance of use_zero_pages as it doesn't change the way of merging empty pages in use_zero_pages's feature. Link: https://lkml.kernel.org/r/202212300913573751808@zte.com.cn Signed-off-by: xu xin Reviewed-by: Xiaokai Ran Reviewed-by: Yang Yang Reported-by: David Hildenbrand Cc: Claudio Imbrenda Cc: Xuexin Jiang Signed-off-by: Andrew Morton --- mm/ksm.c | 141 +++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 111 insertions(+), 30 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index 79ffadd18f8af..cae86f71349f0 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -214,6 +214,7 @@ struct ksm_rmap_item { #define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */ #define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */ #define STABLE_FLAG 0x200 /* is listed from the stable tree */ +#define ZERO_PAGE_FLAG 0x400 /* is zero page placed by KSM */ /* The stable and unstable tree heads */ static struct rb_root one_stable_tree[1] = { RB_ROOT }; @@ -420,6 +421,11 @@ static inline bool ksm_test_exit(struct mm_struct *mm) return atomic_read(&mm->mm_users) == 0; } +enum break_ksm_pmd_entry_return_flag { + HAVE_KSM_PAGE = 1, + HAVE_ZERO_PAGE +}; + static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { @@ -427,6 +433,7 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex spinlock_t *ptl; pte_t *pte; int ret; + bool is_zero_page = false; if (pmd_leaf(*pmd) || !pmd_present(*pmd)) return 0; @@ -434,6 +441,8 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); if (pte_present(*pte)) { page = vm_normal_page(walk->vma, addr, *pte); + if (!page) + is_zero_page = is_zero_pfn(pte_pfn(*pte)); } else if (!pte_none(*pte)) { swp_entry_t entry = pte_to_swp_entry(*pte); @@ -444,7 +453,14 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex if (is_migration_entry(entry)) page = pfn_swap_entry_to_page(entry); } - ret = page && PageKsm(page); + + if (page && PageKsm(page)) + ret = HAVE_KSM_PAGE; + else if (is_zero_page) + ret = HAVE_ZERO_PAGE; + else + ret = 0; + pte_unmap_unlock(pte, ptl); return ret; } @@ -466,19 +482,22 @@ static const struct mm_walk_ops break_ksm_ops = { * of the process that owns 'vma'. We also do not want to enforce * protection keys here anyway. */ -static int break_ksm(struct vm_area_struct *vma, unsigned long addr) +static int break_ksm(struct vm_area_struct *vma, unsigned long addr, + bool unshare_zero_page) { vm_fault_t ret = 0; do { - int ksm_page; + int walk_result; cond_resched(); - ksm_page = walk_page_range_vma(vma, addr, addr + 1, + walk_result = walk_page_range_vma(vma, addr, addr + 1, &break_ksm_ops, NULL); - if (WARN_ON_ONCE(ksm_page < 0)) - return ksm_page; - if (!ksm_page) + if (WARN_ON_ONCE(walk_result < 0)) + return walk_result; + if (!walk_result) + return 0; + if (walk_result == HAVE_ZERO_PAGE && !unshare_zero_page) return 0; ret = handle_mm_fault(vma, addr, FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE, @@ -539,7 +558,7 @@ static void break_cow(struct ksm_rmap_item *rmap_item) mmap_read_lock(mm); vma = find_mergeable_vma(mm, addr); if (vma) - break_ksm(vma, addr); + break_ksm(vma, addr, false); mmap_read_unlock(mm); } @@ -764,6 +783,30 @@ static struct page *get_ksm_page(struct ksm_stable_node *stable_node, return NULL; } +/* + * Cleaning the rmap_item's ZERO_PAGE_FLAG + * This function will be called when unshare or writing on zero pages. + */ +static inline void clean_rmap_item_zero_flag(struct ksm_rmap_item *rmap_item) +{ + if (rmap_item->address & ZERO_PAGE_FLAG) + rmap_item->address &= PAGE_MASK; +} + +/* Only called when rmap_item is going to be freed */ +static inline void unshare_zero_pages(struct ksm_rmap_item *rmap_item) +{ + struct vm_area_struct *vma; + + if (rmap_item->address & ZERO_PAGE_FLAG) { + vma = vma_lookup(rmap_item->mm, rmap_item->address); + if (vma && !ksm_test_exit(rmap_item->mm)) + break_ksm(vma, rmap_item->address, true); + } + /* Put at last. */ + clean_rmap_item_zero_flag(rmap_item); +} + /* * Removing rmap_item from stable or unstable tree. * This function will clean the information from the stable/unstable tree. @@ -824,6 +867,7 @@ static void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list) struct ksm_rmap_item *rmap_item = *rmap_list; *rmap_list = rmap_item->rmap_list; remove_rmap_item_from_tree(rmap_item); + unshare_zero_pages(rmap_item); free_rmap_item(rmap_item); } } @@ -853,7 +897,7 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma, if (signal_pending(current)) err = -ERESTARTSYS; else - err = break_ksm(vma, addr); + err = break_ksm(vma, addr, false); } return err; } @@ -2043,6 +2087,39 @@ static void stable_tree_append(struct ksm_rmap_item *rmap_item, rmap_item->mm->ksm_merging_pages++; } +static int try_to_merge_with_kernel_zero_page(struct ksm_rmap_item *rmap_item, + struct page *page) +{ + struct mm_struct *mm = rmap_item->mm; + int err = 0; + + /* + * It should not take ZERO_PAGE_FLAG because on one hand, + * get_next_rmap_item don't return zero pages' rmap_item. + * On the other hand, even if zero page was writen as + * anonymous page, rmap_item has been cleaned after + * stable_tree_search + */ + if (!WARN_ON_ONCE(rmap_item->address & ZERO_PAGE_FLAG)) { + struct vm_area_struct *vma; + + mmap_read_lock(mm); + vma = find_mergeable_vma(mm, rmap_item->address); + if (vma) { + err = try_to_merge_one_page(vma, page, + ZERO_PAGE(rmap_item->address)); + if (!err) + rmap_item->address |= ZERO_PAGE_FLAG; + } else { + /* If the vma is out of date, we do not need to continue. */ + err = 0; + } + mmap_read_unlock(mm); + } + + return err; +} + /* * cmp_and_merge_page - first see if page can be merged into the stable tree; * if not, compare checksum to previous and if it's the same, see if page can @@ -2054,7 +2131,6 @@ static void stable_tree_append(struct ksm_rmap_item *rmap_item, */ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_item) { - struct mm_struct *mm = rmap_item->mm; struct ksm_rmap_item *tree_rmap_item; struct page *tree_page = NULL; struct ksm_stable_node *stable_node; @@ -2091,6 +2167,7 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite } remove_rmap_item_from_tree(rmap_item); + clean_rmap_item_zero_flag(rmap_item); if (kpage) { if (PTR_ERR(kpage) == -EBUSY) @@ -2127,29 +2204,16 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite * Same checksum as an empty page. We attempt to merge it with the * appropriate zero page if the user enabled this via sysfs. */ - if (ksm_use_zero_pages && (checksum == zero_checksum)) { - struct vm_area_struct *vma; - - mmap_read_lock(mm); - vma = find_mergeable_vma(mm, rmap_item->address); - if (vma) { - err = try_to_merge_one_page(vma, page, - ZERO_PAGE(rmap_item->address)); - } else { + if (ksm_use_zero_pages) { + if (checksum == zero_checksum) /* - * If the vma is out of date, we do not need to - * continue. + * In case of failure, the page was not really empty, so we + * need to continue. Otherwise we're done. */ - err = 0; - } - mmap_read_unlock(mm); - /* - * In case of failure, the page was not really empty, so we - * need to continue. Otherwise we're done. - */ - if (!err) - return; + if (!try_to_merge_with_kernel_zero_page(rmap_item, page)) + return; } + tree_rmap_item = unstable_tree_search_insert(rmap_item, page, &tree_page); if (tree_rmap_item) { @@ -2225,6 +2289,7 @@ static struct ksm_rmap_item *try_to_get_old_rmap_item(unsigned long addr, *rmap_list = rmap_item->rmap_list; /* Running here indicates it's vma has been UNMERGEABLE */ remove_rmap_item_from_tree(rmap_item); + unshare_zero_pages(rmap_item); free_rmap_item(rmap_item); } @@ -2349,6 +2414,22 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) } if (is_zone_device_page(*page)) goto next_page; + if (is_zero_pfn(page_to_pfn(*page))) { + /* + * To monitor ksm zero pages which becomes non-anonymous, + * we have to save each rmap_item of zero pages by + * try_to_get_old_rmap_item() walking on + * ksm_scan.rmap_list, otherwise their rmap_items will be + * freed by the next turn of get_next_rmap_item(). The + * function get_next_rmap_item() will free all "skipped" + * rmap_items because it thinks its areas as UNMERGEABLE. + */ + rmap_item = try_to_get_old_rmap_item(ksm_scan.address, + ksm_scan.rmap_list); + if (rmap_item && (rmap_item->address & ZERO_PAGE_FLAG)) + ksm_scan.rmap_list = &rmap_item->rmap_list; + goto next_page; + } if (PageAnon(*page)) { flush_anon_page(vma, *page, ksm_scan.address); flush_dcache_page(*page); From 1a577b906ef72e21692879ec46e87479eb2b2725 Mon Sep 17 00:00:00 2001 From: xu xin Date: Fri, 30 Dec 2022 09:15:14 +0800 Subject: [PATCH 022/143] ksm: count all zero pages placed by KSM As pages_sharing and pages_shared don't include the number of zero pages merged by KSM, we cannot know how many pages are zero pages placed by KSM when enabling use_zero_pages, which leads to KSM not being transparent with all actual merged pages by KSM. In the early days of use_zero_pages, zero-pages was unable to get unshared by the ways like MADV_UNMERGEABLE so it's hard to count how many times one of those zeropages was then unmerged. But now, unsharing KSM-placed zero page accurately has been achieved, so we can easily count both how many times a page full of zeroes was merged with zero-page and how many times one of those pages was then unmerged. and so, it helps to estimate memory demands when each and every shared page could get unshared. So we add zero_pages_sharing under /sys/kernel/mm/ksm/ to show the number of all zero pages placed by KSM. Link: https://lkml.kernel.org/r/202212300915147801864@zte.com.cn Signed-off-by: xu xin Reviewed-by: Xiaokai Ran Reviewed-by: Yang Yang Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Xuexin Jiang Signed-off-by: Andrew Morton --- mm/ksm.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index cae86f71349f0..cd6dd98b8e8ec 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -276,6 +276,9 @@ static unsigned int zero_checksum __read_mostly; /* Whether to merge empty (zeroed) pages with actual zero pages */ static bool ksm_use_zero_pages __read_mostly; +/* The number of zero pages placed by KSM use_zero_pages */ +static unsigned long ksm_zero_pages_sharing; + #ifdef CONFIG_NUMA /* Zeroed when merging across nodes is not allowed */ static unsigned int ksm_merge_across_nodes = 1; @@ -789,8 +792,10 @@ static struct page *get_ksm_page(struct ksm_stable_node *stable_node, */ static inline void clean_rmap_item_zero_flag(struct ksm_rmap_item *rmap_item) { - if (rmap_item->address & ZERO_PAGE_FLAG) + if (rmap_item->address & ZERO_PAGE_FLAG) { + ksm_zero_pages_sharing--; rmap_item->address &= PAGE_MASK; + } } /* Only called when rmap_item is going to be freed */ @@ -2108,8 +2113,10 @@ static int try_to_merge_with_kernel_zero_page(struct ksm_rmap_item *rmap_item, if (vma) { err = try_to_merge_one_page(vma, page, ZERO_PAGE(rmap_item->address)); - if (!err) + if (!err) { rmap_item->address |= ZERO_PAGE_FLAG; + ksm_zero_pages_sharing++; + } } else { /* If the vma is out of date, we do not need to continue. */ err = 0; @@ -3226,6 +3233,13 @@ static ssize_t pages_volatile_show(struct kobject *kobj, } KSM_ATTR_RO(pages_volatile); +static ssize_t zero_pages_sharing_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%ld\n", ksm_zero_pages_sharing); +} +KSM_ATTR_RO(zero_pages_sharing); + static ssize_t stable_node_dups_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -3281,6 +3295,7 @@ static struct attribute *ksm_attrs[] = { &pages_sharing_attr.attr, &pages_unshared_attr.attr, &pages_volatile_attr.attr, + &zero_pages_sharing_attr.attr, &full_scans_attr.attr, #ifdef CONFIG_NUMA &merge_across_nodes_attr.attr, From 1d0ddd308e3cf4d55cc94de598f6c76400f6cc1d Mon Sep 17 00:00:00 2001 From: xu xin Date: Fri, 30 Dec 2022 09:16:29 +0800 Subject: [PATCH 023/143] ksm: count zero pages for each process As the number of ksm zero pages is not included in ksm_merging_pages per process when enabling use_zero_pages, it's unclear of how many actual pages are merged by KSM. To let users accurately estimate their memory demands when unsharing KSM zero-pages, it's necessary to show KSM zero- pages per process. Since unsharing zero pages placed by KSM accurately is achieved, then tracking empty pages merging and unmerging is not a difficult thing any longer. Since we already have /proc//ksm_stat, just add the information of zero_pages_sharing in it. Link: https://lkml.kernel.org/r/202212300916292181912@zte.com.cn Signed-off-by: xu xin Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Xuexin Jiang Cc: Xiaokai Ran Cc: Yang Yang Signed-off-by: Andrew Morton --- fs/proc/base.c | 1 + include/linux/mm_types.h | 7 ++++++- mm/ksm.c | 2 ++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 9e479d7d202b1..ac9ebe972be0b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3207,6 +3207,7 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, mm = get_task_mm(task); if (mm) { seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items); + seq_printf(m, "zero_pages_sharing %lu\n", mm->ksm_zero_pages_sharing); mmput(mm); } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 452920467223b..a689198caf740 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -721,7 +721,7 @@ struct mm_struct { #ifdef CONFIG_KSM /* * Represent how many pages of this process are involved in KSM - * merging. + * merging (not including ksm_zero_pages_sharing). */ unsigned long ksm_merging_pages; /* @@ -729,6 +729,11 @@ struct mm_struct { * including merged and not merged. */ unsigned long ksm_rmap_items; + /* + * Represent how many empty pages are merged with kernel zero + * pages when enabling KSM use_zero_pages. + */ + unsigned long ksm_zero_pages_sharing; #endif #ifdef CONFIG_LRU_GEN struct { diff --git a/mm/ksm.c b/mm/ksm.c index cd6dd98b8e8ec..56808e3bfd194 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -794,6 +794,7 @@ static inline void clean_rmap_item_zero_flag(struct ksm_rmap_item *rmap_item) { if (rmap_item->address & ZERO_PAGE_FLAG) { ksm_zero_pages_sharing--; + rmap_item->mm->ksm_zero_pages_sharing--; rmap_item->address &= PAGE_MASK; } } @@ -2116,6 +2117,7 @@ static int try_to_merge_with_kernel_zero_page(struct ksm_rmap_item *rmap_item, if (!err) { rmap_item->address |= ZERO_PAGE_FLAG; ksm_zero_pages_sharing++; + rmap_item->mm->ksm_zero_pages_sharing++; } } else { /* If the vma is out of date, we do not need to continue. */ From 8b721d1675865bc631f0043b093211dc19506ae2 Mon Sep 17 00:00:00 2001 From: xu xin Date: Fri, 30 Dec 2022 09:17:28 +0800 Subject: [PATCH 024/143] ksm: add zero_pages_sharing documentation When enabling use_zero_pages, pages_sharing cannot represent how much memory saved indeed. zero_pages_sharing + pages_sharing does. add the description of zero_pages_sharing. Link: https://lkml.kernel.org/r/202212300917284911971@zte.com.cn Signed-off-by: xu xin Cc: Xiaokai Ran Cc: Yang Yang Cc: Jiang Xuexin Cc: Claudio Imbrenda Cc: David Hildenbrand Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/ksm.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Documentation/admin-guide/mm/ksm.rst b/Documentation/admin-guide/mm/ksm.rst index fb6ba2002a4b2..f160f9487a90f 100644 --- a/Documentation/admin-guide/mm/ksm.rst +++ b/Documentation/admin-guide/mm/ksm.rst @@ -173,6 +173,13 @@ stable_node_chains the number of KSM pages that hit the ``max_page_sharing`` limit stable_node_dups number of duplicated KSM pages +zero_pages_sharing + how many empty pages are sharing kernel zero page(s) instead of + with each other as it would happen normally. Only effective when + enabling ``use_zero_pages`` knob. + +When enabling ``use_zero_pages``, the sum of ``pages_sharing`` + +``zero_pages_sharing`` represents how much really saved by KSM. A high ratio of ``pages_sharing`` to ``pages_shared`` indicates good sharing, but a high ratio of ``pages_unshared`` to ``pages_sharing`` From d8c9c370612c550a0da6544edb786f40fb2651e8 Mon Sep 17 00:00:00 2001 From: xu xin Date: Fri, 30 Dec 2022 09:18:47 +0800 Subject: [PATCH 025/143] selftest: add testing unsharing and counting ksm zero page Add a function test_unmerge_zero_page() to test the functionality on unsharing and counting ksm-placed zero pages and counting of this patch series. test_unmerge_zero_page() actually contains three subjct test objects: 1) whether the count of ksm zero page can react correctly to cow (copy on write); 2) whether the count of ksm zero page can react correctly to unmerge; 3) whether ksm zero pages are really unmerged. Link: https://lkml.kernel.org/r/202212300918477352037@zte.com.cn Signed-off-by: xu xin Reviewed-by: Xiaokai Ran Reviewed-by: Yang Yang Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Xuexin Jiang Signed-off-by: Andrew Morton --- .../selftests/mm/ksm_functional_tests.c | 103 +++++++++++++++++- 1 file changed, 99 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c index d8b5b49304121..6acbd08e73e7b 100644 --- a/tools/testing/selftests/mm/ksm_functional_tests.c +++ b/tools/testing/selftests/mm/ksm_functional_tests.c @@ -27,6 +27,8 @@ static int ksm_fd; static int ksm_full_scans_fd; +static int ksm_zero_pages_fd; +static int ksm_use_zero_pages_fd; static int pagemap_fd; static size_t pagesize; @@ -57,6 +59,22 @@ static bool range_maps_duplicates(char *addr, unsigned long size) return false; } +static bool check_ksm_zero_pages_count(unsigned long zero_size) +{ + unsigned long pages_expected = zero_size / (4 * KiB); + char buf[20]; + ssize_t read_size; + unsigned long ksm_zero_pages; + + read_size = pread(ksm_zero_pages_fd, buf, sizeof(buf) - 1, 0); + if (read_size < 0) + return -errno; + buf[read_size] = 0; + ksm_zero_pages = strtol(buf, NULL, 10); + + return ksm_zero_pages == pages_expected; +} + static long ksm_get_full_scans(void) { char buf[10]; @@ -70,15 +88,12 @@ static long ksm_get_full_scans(void) return strtol(buf, NULL, 10); } -static int ksm_merge(void) +static int wait_two_full_scans(void) { long start_scans, end_scans; - /* Wait for two full scans such that any possible merging happened. */ start_scans = ksm_get_full_scans(); if (start_scans < 0) - return start_scans; - if (write(ksm_fd, "1", 1) != 1) return -errno; do { end_scans = ksm_get_full_scans(); @@ -89,6 +104,34 @@ static int ksm_merge(void) return 0; } +static inline int ksm_merge(void) +{ + /* Wait for two full scans such that any possible merging happened. */ + if (write(ksm_fd, "1", 1) != 1) + return -errno; + return wait_two_full_scans(); +} + +static inline int make_cow(char *map, char val, unsigned long size) +{ + + memset(map, val, size); + return wait_two_full_scans(); +} + +static int unmerge_zero_page(char *start, unsigned long size) +{ + int ret; + + ret = madvise(start, size, MADV_UNMERGEABLE); + if (ret) { + ksft_test_result_fail("MADV_UNMERGEABLE failed\n"); + return ret; + } + + return wait_two_full_scans(); +} + static char *mmap_and_merge_range(char val, unsigned long size) { char *map; @@ -146,6 +189,56 @@ static void test_unmerge(void) munmap(map, size); } +static void test_unmerge_zero_pages(void) +{ + const unsigned int size = 2 * MiB; + char *map; + + ksft_print_msg("[RUN] %s\n", __func__); + + /* Confirm the interfaces*/ + ksm_zero_pages_fd = open("/sys/kernel/mm/ksm/zero_pages_sharing", O_RDONLY); + if (ksm_zero_pages_fd < 0) { + ksft_test_result_skip("open(\"/sys/kernel/mm/ksm/zero_pages_sharing\") failed\n"); + return; + } + ksm_use_zero_pages_fd = open("/sys/kernel/mm/ksm/use_zero_pages", O_RDWR); + if (ksm_use_zero_pages_fd < 0) { + ksft_test_result_skip("open \"/sys/kernel/mm/ksm/use_zero_pages\" failed\n"); + return; + } + if (write(ksm_use_zero_pages_fd, "1", 1) != 1) { + ksft_test_result_skip("write \"/sys/kernel/mm/ksm/use_zero_pages\" failed\n"); + return; + } + + /* Mmap zero pages*/ + map = mmap_and_merge_range(0x00, size); + + /* Case 1: make Writing on ksm zero pages (COW) */ + if (make_cow(map, 0xcf, size / 2)) { + ksft_test_result_fail("COW failed\n"); + goto unmap; + } + ksft_test_result(check_ksm_zero_pages_count(size / 2), + "zero page count react to cow\n"); + + /* Case 2: Call madvise(xxx, MADV_UNMERGEABLE)*/ + if (unmerge_zero_page(map + size / 2, size / 4)) { + ksft_test_result_fail("unmerge_zero_page failed\n"); + goto unmap; + } + ksft_test_result(check_ksm_zero_pages_count(size / 4), + "zero page count react to unmerge\n"); + + /*Check if ksm pages are really unmerged */ + ksft_test_result(!range_maps_duplicates(map + size / 2, size / 4), + "KSM zero pages were unmerged\n"); + +unmap: + munmap(map, size); +} + static void test_unmerge_discarded(void) { const unsigned int size = 2 * MiB; @@ -261,11 +354,13 @@ int main(int argc, char **argv) ksm_full_scans_fd = open("/sys/kernel/mm/ksm/full_scans", O_RDONLY); if (ksm_full_scans_fd < 0) ksft_exit_skip("open(\"/sys/kernel/mm/ksm/full_scans\") failed\n"); + pagemap_fd = open("/proc/self/pagemap", O_RDONLY); if (pagemap_fd < 0) ksft_exit_skip("open(\"/proc/self/pagemap\") failed\n"); test_unmerge(); + test_unmerge_zero_pages(); test_unmerge_discarded(); #ifdef __NR_userfaultfd test_unmerge_uffd_wp(); From 6929b54806879d4b1ed124fb837e71b999c19ed5 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:02 -0500 Subject: [PATCH 026/143] maple_tree: add mas_init() function Patch series "VMA tree type safety and remove __vma_adjust()", v4. This patchset does two things: 1. Clean up, including removal of __vma_adjust() and 2. Extends the VMA iterator API to provide type safety to the VMA operations using the maple tree, as requested by Linus [1]. It also addresses another issue of usability brought up by Linus about needing to modify the maple state within the loops. The maple state has been replaced by the VMA iterator and the iterator is now modified within the MM code so the caller should not need to worry about doing the work themselves when tree modifications occur. This brought up a potential inconsistency of the iterator state and what the user expects, so the inconsistency is addressed to keep the VMA iterator safe for use after the looping over a VMA range. This is addressed in patch 3 ("maple_tree: Reduce user error potential") and 4 ("test_maple_tree: Test modifications while iterating"). While cleaning up the state, the duplicate locking code in mm/mmap.c introduced by the maple tree has been address by abstracting it to two functions: vma_prepare() and vma_complete(). These abstractions allowed for a much simpler __vma_adjust(), which eventually leads to the removal of the __vma_adjust() function by placing the logic into the vma_merge() function itself. 1. https://lore.kernel.org/linux-mm/CAHk-=wg9WQXBGkNdKD2bqocnN73rDswuWsavBB7T-tekykEn_A@mail.gmail.com/ This patch (of 49): Add a function that will zero out the maple state struct and set some basic defaults. Link: https://lkml.kernel.org/r/20230120162650.984577-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20230120162650.984577-2-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index a7bf58fd7cc6d..1fadb5f5978b6 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -432,6 +432,7 @@ struct ma_wr_state { .min = 0, \ .max = ULONG_MAX, \ .alloc = NULL, \ + .mas_flags = 0, \ } #define MA_WR_STATE(name, ma_state, wr_entry) \ @@ -470,6 +471,16 @@ void *mas_next(struct ma_state *mas, unsigned long max); int mas_empty_area(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size); +static inline void mas_init(struct ma_state *mas, struct maple_tree *tree, + unsigned long addr) +{ + memset(mas, 0, sizeof(struct ma_state)); + mas->tree = tree; + mas->index = mas->last = addr; + mas->max = ULONG_MAX; + mas->node = MAS_START; +} + /* Checks if a mas has not found anything */ static inline bool mas_is_none(struct ma_state *mas) { From 8d1df0d609381e845be1ad78ebceb9e51a4f23da Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:03 -0500 Subject: [PATCH 027/143] maple_tree: fix potential rcu issue Ensure the node isn't dead after reading the node end. Link: https://lkml.kernel.org/r/20230120162650.984577-3-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 1c5d3b640a246..7e3cf5b7e68b3 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4655,13 +4655,13 @@ static inline void *mas_next_nentry(struct ma_state *mas, pivots = ma_pivots(node, type); slots = ma_slots(node, type); mas->index = mas_safe_min(mas, pivots, mas->offset); + count = ma_data_end(node, type, pivots, mas->max); if (ma_dead_node(node)) return NULL; if (mas->index > max) return NULL; - count = ma_data_end(node, type, pivots, mas->max); if (mas->offset > count) return NULL; From 9d8c311d66d1cb0c224cc681f27287733b6f9fc3 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:04 -0500 Subject: [PATCH 028/143] maple_tree: reduce user error potential When iterating, a user may operate on the tree and cause the maple state to be altered and left in an unintuitive state. Detect this scenario and correct it by setting to the limit and invalidating the state. Link: https://lkml.kernel.org/r/20230120162650.984577-4-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 7e3cf5b7e68b3..5804c5997598d 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4732,6 +4732,11 @@ static inline void *mas_next_entry(struct ma_state *mas, unsigned long limit) unsigned long last; enum maple_type mt; + if (mas->index > limit) { + mas->index = mas->last = limit; + mas_pause(mas); + return NULL; + } last = mas->last; retry: offset = mas->offset; @@ -4838,6 +4843,11 @@ static inline void *mas_prev_entry(struct ma_state *mas, unsigned long min) { void *entry; + if (mas->index < min) { + mas->index = mas->last = min; + mas_pause(mas); + return NULL; + } retry: while (likely(!mas_is_none(mas))) { entry = mas_prev_nentry(mas, min, mas->index); From 516651d84dbd2c0379a486ff16b7b7589e167fd2 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:05 -0500 Subject: [PATCH 029/143] test_maple_tree: test modifications while iterating Add a testcase to ensure the iterator detects bad states on modifications and does what the user expects Link: https://lkml.kernel.org/r/20230120162650.984577-5-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/test_maple_tree.c | 72 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index ec847bf4dcb4d..3d19b1f78d718 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -1709,6 +1709,74 @@ static noinline void check_forking(struct maple_tree *mt) mtree_destroy(&newmt); } +static noinline void check_iteration(struct maple_tree *mt) +{ + int i, nr_entries = 125; + void *val; + MA_STATE(mas, mt, 0, 0); + + for (i = 0; i <= nr_entries; i++) + mtree_store_range(mt, i * 10, i * 10 + 9, + xa_mk_value(i), GFP_KERNEL); + + mt_set_non_kernel(99999); + + i = 0; + mas_lock(&mas); + mas_for_each(&mas, val, 925) { + MT_BUG_ON(mt, mas.index != i * 10); + MT_BUG_ON(mt, mas.last != i * 10 + 9); + /* Overwrite end of entry 92 */ + if (i == 92) { + mas.index = 925; + mas.last = 929; + mas_store(&mas, val); + } + i++; + } + /* Ensure mas_find() gets the next value */ + val = mas_find(&mas, ULONG_MAX); + MT_BUG_ON(mt, val != xa_mk_value(i)); + + mas_set(&mas, 0); + i = 0; + mas_for_each(&mas, val, 785) { + MT_BUG_ON(mt, mas.index != i * 10); + MT_BUG_ON(mt, mas.last != i * 10 + 9); + /* Overwrite start of entry 78 */ + if (i == 78) { + mas.index = 780; + mas.last = 785; + mas_store(&mas, val); + } else { + i++; + } + } + val = mas_find(&mas, ULONG_MAX); + MT_BUG_ON(mt, val != xa_mk_value(i)); + + mas_set(&mas, 0); + i = 0; + mas_for_each(&mas, val, 765) { + MT_BUG_ON(mt, mas.index != i * 10); + MT_BUG_ON(mt, mas.last != i * 10 + 9); + /* Overwrite end of entry 76 and advance to the end */ + if (i == 76) { + mas.index = 760; + mas.last = 765; + mas_store(&mas, val); + mas_next(&mas, ULONG_MAX); + } + i++; + } + /* Make sure the next find returns the one after 765, 766-769 */ + val = mas_find(&mas, ULONG_MAX); + MT_BUG_ON(mt, val != xa_mk_value(76)); + mas_unlock(&mas); + mas_destroy(&mas); + mt_set_non_kernel(0); +} + static noinline void check_mas_store_gfp(struct maple_tree *mt) { @@ -2659,6 +2727,10 @@ static int maple_tree_seed(void) goto skip; #endif + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + check_iteration(&tree); + mtree_destroy(&tree); + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); check_forking(&tree); mtree_destroy(&tree); From faf2cd3e91bcd65f3f63d4800981d99e5a7b6573 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:06 -0500 Subject: [PATCH 030/143] maple_tree: fix handle of invalidated state in mas_wr_store_setup() If an invalidated maple state is encountered during write, reset the maple state to MAS_START. This will result in a re-walk of the tree to the correct location for the write. Link: https://lore.kernel.org/all/20230107020126.1627-1-sj@kernel.org/ Link: https://lkml.kernel.org/r/20230120162650.984577-6-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Reported-by: SeongJae Park Signed-off-by: Andrew Morton --- lib/maple_tree.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 5804c5997598d..7c786bd5e575a 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5609,6 +5609,9 @@ static inline void mte_destroy_walk(struct maple_enode *enode, static void mas_wr_store_setup(struct ma_wr_state *wr_mas) { + if (unlikely(mas_is_paused(wr_mas->mas))) + mas_reset(wr_mas->mas); + if (!mas_is_start(wr_mas->mas)) { if (mas_is_none(wr_mas->mas)) { mas_reset(wr_mas->mas); From c07eabf5bc0931537f66becc5b92ad904187dd3c Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:07 -0500 Subject: [PATCH 031/143] maple_tree: fix mas_prev() and mas_find() state handling When mas_prev() does not find anything, set the state to MAS_NONE. Handle the MAS_NONE in mas_find() like a MAS_START. Link: https://lkml.kernel.org/r/20230120162650.984577-7-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Reported-by: Signed-off-by: Andrew Morton --- lib/maple_tree.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 7c786bd5e575a..5e97031892590 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4845,7 +4845,7 @@ static inline void *mas_prev_entry(struct ma_state *mas, unsigned long min) if (mas->index < min) { mas->index = mas->last = min; - mas_pause(mas); + mas->node = MAS_NONE; return NULL; } retry: @@ -5919,6 +5919,7 @@ void *mas_prev(struct ma_state *mas, unsigned long min) if (!mas->index) { /* Nothing comes before 0 */ mas->last = 0; + mas->node = MAS_NONE; return NULL; } @@ -6009,6 +6010,9 @@ void *mas_find(struct ma_state *mas, unsigned long max) mas->index = ++mas->last; } + if (unlikely(mas_is_none(mas))) + mas->node = MAS_START; + if (unlikely(mas_is_start(mas))) { /* First run or continue */ void *entry; From 3b6328430c308663a0f46dcec47a995fc4980d1b Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:08 -0500 Subject: [PATCH 032/143] mm: expand vma iterator interface Add wrappers for the maple tree to the vma iterator. This will provide type safety at compile time. Link: https://lkml.kernel.org/r/20230120162650.984577-8-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- include/linux/mm.h | 46 ++++++++++++++++++++++++++--- include/linux/mm_types.h | 4 +-- mm/internal.h | 64 ++++++++++++++++++++++++++++++++++++++++ mm/mmap.c | 18 +++++++++++ 4 files changed, 125 insertions(+), 7 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 2cabf3ab4157f..54a3f117b48fd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -676,16 +676,16 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma) static inline struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max) { - return mas_find(&vmi->mas, max); + return mas_find(&vmi->mas, max - 1); } static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi) { /* - * Uses vma_find() to get the first VMA when the iterator starts. + * Uses mas_find() to get the first VMA when the iterator starts. * Calling mas_next() could skip the first entry. */ - return vma_find(vmi, ULONG_MAX); + return mas_find(&vmi->mas, ULONG_MAX); } static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi) @@ -698,12 +698,50 @@ static inline unsigned long vma_iter_addr(struct vma_iterator *vmi) return vmi->mas.index; } +static inline unsigned long vma_iter_end(struct vma_iterator *vmi) +{ + return vmi->mas.last + 1; +} +static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi, + unsigned long count) +{ + return mas_expected_entries(&vmi->mas, count); +} + +/* Free any unused preallocations */ +static inline void vma_iter_free(struct vma_iterator *vmi) +{ + mas_destroy(&vmi->mas); +} + +static inline int vma_iter_bulk_store(struct vma_iterator *vmi, + struct vm_area_struct *vma) +{ + vmi->mas.index = vma->vm_start; + vmi->mas.last = vma->vm_end - 1; + mas_store(&vmi->mas, vma); + if (unlikely(mas_is_err(&vmi->mas))) + return -ENOMEM; + + return 0; +} + +static inline void vma_iter_invalidate(struct vma_iterator *vmi) +{ + mas_pause(&vmi->mas); +} + +static inline void vma_iter_set(struct vma_iterator *vmi, unsigned long addr) +{ + mas_set(&vmi->mas, addr); +} + #define for_each_vma(__vmi, __vma) \ while (((__vma) = vma_next(&(__vmi))) != NULL) /* The MM code likes to work with exclusive end addresses */ #define for_each_vma_range(__vmi, __vma, __end) \ - while (((__vma) = vma_find(&(__vmi), (__end) - 1)) != NULL) + while (((__vma) = vma_find(&(__vmi), (__end))) != NULL) #ifdef CONFIG_SHMEM /* diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index a689198caf740..2d6d790d9bed8 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -854,9 +854,7 @@ struct vma_iterator { static inline void vma_iter_init(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long addr) { - vmi->mas.tree = &mm->mm_mt; - vmi->mas.index = addr; - vmi->mas.node = MAS_START; + mas_init(&vmi->mas, &mm->mm_mt, addr); } struct mmu_gather; diff --git a/mm/internal.h b/mm/internal.h index 2d1b9fa8083e4..ffd65248f266c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -877,4 +877,68 @@ static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma) return !(vma->vm_flags & VM_SOFTDIRTY); } +/* + * VMA Iterator functions shared between nommu and mmap + */ +static inline int vma_iter_prealloc(struct vma_iterator *vmi) +{ + return mas_preallocate(&vmi->mas, GFP_KERNEL); +} + +static inline void vma_iter_clear(struct vma_iterator *vmi, + unsigned long start, unsigned long end) +{ + mas_set_range(&vmi->mas, start, end - 1); + mas_store_prealloc(&vmi->mas, NULL); +} + +static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi) +{ + return mas_walk(&vmi->mas); +} + +/* Store a VMA with preallocated memory */ +static inline void vma_iter_store(struct vma_iterator *vmi, + struct vm_area_struct *vma) +{ + +#if defined(CONFIG_DEBUG_VM_MAPLE_TREE) + if (WARN_ON(vmi->mas.node != MAS_START && vmi->mas.index > vma->vm_start)) { + printk("%lu > %lu\n", vmi->mas.index, vma->vm_start); + printk("store of vma %lu-%lu", vma->vm_start, vma->vm_end); + printk("into slot %lu-%lu", vmi->mas.index, vmi->mas.last); + mt_dump(vmi->mas.tree); + } + if (WARN_ON(vmi->mas.node != MAS_START && vmi->mas.last < vma->vm_start)) { + printk("%lu < %lu\n", vmi->mas.last, vma->vm_start); + printk("store of vma %lu-%lu", vma->vm_start, vma->vm_end); + printk("into slot %lu-%lu", vmi->mas.index, vmi->mas.last); + mt_dump(vmi->mas.tree); + } +#endif + + if (vmi->mas.node != MAS_START && + ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) + vma_iter_invalidate(vmi); + + vmi->mas.index = vma->vm_start; + vmi->mas.last = vma->vm_end - 1; + mas_store_prealloc(&vmi->mas, vma); +} + +static inline int vma_iter_store_gfp(struct vma_iterator *vmi, + struct vm_area_struct *vma, gfp_t gfp) +{ + if (vmi->mas.node != MAS_START && + ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) + vma_iter_invalidate(vmi); + + vmi->mas.index = vma->vm_start; + vmi->mas.last = vma->vm_end - 1; + mas_store_gfp(&vmi->mas, vma, gfp); + if (unlikely(mas_is_err(&vmi->mas))) + return -ENOMEM; + + return 0; +} #endif /* __MM_INTERNAL_H */ diff --git a/mm/mmap.c b/mm/mmap.c index ffc0815cd7fb7..db70f3e2181e1 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -144,6 +144,24 @@ static void remove_vma(struct vm_area_struct *vma) vm_area_free(vma); } +static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi, + unsigned long min) +{ + return mas_prev(&vmi->mas, min); +} + +static inline int vma_iter_clear_gfp(struct vma_iterator *vmi, + unsigned long start, unsigned long end, gfp_t gfp) +{ + vmi->mas.index = start; + vmi->mas.last = end - 1; + mas_store_gfp(&vmi->mas, NULL, gfp); + if (unlikely(mas_is_err(&vmi->mas))) + return -ENOMEM; + + return 0; +} + /* * check_brk_limits() - Use platform specific check of range & verify mlock * limits. From 1a9120ca6134120112495c81d4e602475dec5631 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:09 -0500 Subject: [PATCH 033/143] mm/mmap: convert brk to use vma iterator Use the vma iterator API for the brk() system call. This will provide type safety at compile time. Link: https://lkml.kernel.org/r/20230120162650.984577-9-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 48 +++++++++++++++++++++++------------------------- 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index db70f3e2181e1..94a477a551094 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -180,10 +180,10 @@ static int check_brk_limits(unsigned long addr, unsigned long len) return mlock_future_check(current->mm, current->mm->def_flags, len); } -static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma, +static int do_brk_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long newbrk, unsigned long oldbrk, struct list_head *uf); -static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *brkvma, +static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma, unsigned long addr, unsigned long request, unsigned long flags); SYSCALL_DEFINE1(brk, unsigned long, brk) { @@ -194,7 +194,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) bool populate; bool downgraded = false; LIST_HEAD(uf); - MA_STATE(mas, &mm->mm_mt, 0, 0); + struct vma_iterator vmi; if (mmap_write_lock_killable(mm)) return -EINTR; @@ -242,8 +242,8 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) int ret; /* Search one past newbrk */ - mas_set(&mas, newbrk); - brkvma = mas_find(&mas, oldbrk); + vma_iter_init(&vmi, mm, newbrk); + brkvma = vma_find(&vmi, oldbrk); if (!brkvma || brkvma->vm_start >= oldbrk) goto out; /* mapping intersects with an existing non-brk vma. */ /* @@ -252,7 +252,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) * before calling do_brk_munmap(). */ mm->brk = brk; - ret = do_brk_munmap(&mas, brkvma, newbrk, oldbrk, &uf); + ret = do_brk_munmap(&vmi, brkvma, newbrk, oldbrk, &uf); if (ret == 1) { downgraded = true; goto success; @@ -270,14 +270,14 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) * Only check if the next VMA is within the stack_guard_gap of the * expansion area */ - mas_set(&mas, oldbrk); - next = mas_find(&mas, newbrk - 1 + PAGE_SIZE + stack_guard_gap); + vma_iter_init(&vmi, mm, oldbrk); + next = vma_find(&vmi, newbrk + PAGE_SIZE + stack_guard_gap); if (next && newbrk + PAGE_SIZE > vm_start_gap(next)) goto out; - brkvma = mas_prev(&mas, mm->start_brk); + brkvma = vma_prev_limit(&vmi, mm->start_brk); /* Ok, looks good - let it rip. */ - if (do_brk_flags(&mas, brkvma, oldbrk, newbrk - oldbrk, 0) < 0) + if (do_brk_flags(&vmi, brkvma, oldbrk, newbrk - oldbrk, 0) < 0) goto out; mm->brk = brk; @@ -2917,8 +2917,8 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, } /* - * brk_munmap() - Unmap a partial vma. - * @mas: The maple tree state. + * brk_munmap() - Unmap a full or partial vma. + * @vmi: The vma iterator * @vma: The vma to be modified * @newbrk: the start of the address to unmap * @oldbrk: The end of the address to unmap @@ -2928,7 +2928,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, * unmaps a partial VMA mapping. Does not handle alignment, downgrades lock if * possible. */ -static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma, +static int do_brk_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long newbrk, unsigned long oldbrk, struct list_head *uf) { @@ -2936,14 +2936,14 @@ static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma, int ret; arch_unmap(mm, newbrk, oldbrk); - ret = do_mas_align_munmap(mas, vma, mm, newbrk, oldbrk, uf, true); + ret = do_mas_align_munmap(&vmi->mas, vma, mm, newbrk, oldbrk, uf, true); validate_mm_mt(mm); return ret; } /* * do_brk_flags() - Increase the brk vma if the flags match. - * @mas: The maple tree state. + * @vmi: The vma iterator * @addr: The start address * @len: The length of the increase * @vma: The vma, @@ -2953,7 +2953,7 @@ static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma, * do not match then create a new anonymous VMA. Eventually we may be able to * do some brk-specific accounting here. */ -static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, +static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, unsigned long len, unsigned long flags) { struct mm_struct *mm = current->mm; @@ -2980,8 +2980,7 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, if (vma && vma->vm_end == addr && !vma_policy(vma) && can_vma_merge_after(vma, flags, NULL, NULL, addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL)) { - mas_set_range(mas, vma->vm_start, addr + len - 1); - if (mas_preallocate(mas, GFP_KERNEL)) + if (vma_iter_prealloc(vmi)) goto unacct_fail; vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0); @@ -2991,7 +2990,7 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, } vma->vm_end = addr + len; vma->vm_flags |= VM_SOFTDIRTY; - mas_store_prealloc(mas, vma); + vma_iter_store(vmi, vma); if (vma->anon_vma) { anon_vma_interval_tree_post_update_vma(vma); @@ -3012,8 +3011,7 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, vma->vm_pgoff = addr >> PAGE_SHIFT; vma->vm_flags = flags; vma->vm_page_prot = vm_get_page_prot(flags); - mas_set_range(mas, vma->vm_start, addr + len - 1); - if (mas_store_gfp(mas, vma, GFP_KERNEL)) + if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL)) goto mas_store_fail; mm->map_count++; @@ -3042,7 +3040,7 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) int ret; bool populate; LIST_HEAD(uf); - MA_STATE(mas, &mm->mm_mt, addr, addr); + VMA_ITERATOR(vmi, mm, addr); len = PAGE_ALIGN(request); if (len < request) @@ -3061,12 +3059,12 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) if (ret) goto limits_failed; - ret = do_mas_munmap(&mas, mm, addr, len, &uf, 0); + ret = do_mas_munmap(&vmi.mas, mm, addr, len, &uf, 0); if (ret) goto munmap_failed; - vma = mas_prev(&mas, 0); - ret = do_brk_flags(&mas, vma, addr, len, flags); + vma = vma_prev(&vmi); + ret = do_brk_flags(&vmi, vma, addr, len, flags); populate = ((mm->def_flags & VM_LOCKED) != 0); mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); From e210668ef7bae043e4de172eeca987737860314e Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:10 -0500 Subject: [PATCH 034/143] kernel/fork: convert forking to using the vmi iterator Avoid using the maple tree interface directly. This gains type safety. Link: https://lkml.kernel.org/r/20230120162650.984577-10-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- kernel/fork.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 9f7fe35418978..441dcec60aae7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -585,8 +585,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, int retval; unsigned long charge = 0; LIST_HEAD(uf); - MA_STATE(old_mas, &oldmm->mm_mt, 0, 0); - MA_STATE(mas, &mm->mm_mt, 0, 0); + VMA_ITERATOR(old_vmi, oldmm, 0); + VMA_ITERATOR(vmi, mm, 0); uprobe_start_dup_mmap(); if (mmap_write_lock_killable(oldmm)) { @@ -613,11 +613,11 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, goto out; khugepaged_fork(mm, oldmm); - retval = mas_expected_entries(&mas, oldmm->map_count); + retval = vma_iter_bulk_alloc(&vmi, oldmm->map_count); if (retval) goto out; - mas_for_each(&old_mas, mpnt, ULONG_MAX) { + for_each_vma(old_vmi, mpnt) { struct file *file; if (mpnt->vm_flags & VM_DONTCOPY) { @@ -683,11 +683,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, hugetlb_dup_vma_private(tmp); /* Link the vma into the MT */ - mas.index = tmp->vm_start; - mas.last = tmp->vm_end - 1; - mas_store(&mas, tmp); - if (mas_is_err(&mas)) - goto fail_nomem_mas_store; + if (vma_iter_bulk_store(&vmi, tmp)) + goto fail_nomem_vmi_store; mm->map_count++; if (!(tmp->vm_flags & VM_WIPEONFORK)) @@ -702,7 +699,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, /* a new mm has just been created */ retval = arch_dup_mmap(oldmm, mm); loop_out: - mas_destroy(&mas); + vma_iter_free(&vmi); out: mmap_write_unlock(mm); flush_tlb_mm(oldmm); @@ -712,7 +709,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, uprobe_end_dup_mmap(); return retval; -fail_nomem_mas_store: +fail_nomem_vmi_store: unlink_anon_vmas(tmp); fail_nomem_anon_vma_fork: mpol_put(vma_policy(tmp)); From 06381963e09d32f3ef9d8d14fb08a8d47560dc77 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:11 -0500 Subject: [PATCH 035/143] mmap: convert vma_link() vma iterator Avoid using the maple tree interface directly. Link: https://lkml.kernel.org/r/20230120162650.984577-11-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 94a477a551094..17de99f31ff50 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -487,10 +487,10 @@ static inline void vma_mas_szero(struct ma_state *mas, unsigned long start, static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) { - MA_STATE(mas, &mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, mm, 0); struct address_space *mapping = NULL; - if (mas_preallocate(&mas, GFP_KERNEL)) + if (vma_iter_prealloc(&vmi)) return -ENOMEM; if (vma->vm_file) { @@ -498,7 +498,7 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) i_mmap_lock_write(mapping); } - vma_mas_store(vma, &mas); + vma_iter_store(&vmi, vma); if (mapping) { __vma_link_file(vma, mapping); From f0e3a34e09dbc7d11d8519639585d40cfa0c1674 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:12 -0500 Subject: [PATCH 036/143] mm/mmap: remove preallocation from do_mas_align_munmap() In preparation of passing the vma state through split, the pre-allocation that occurs before the split has to be moved to after. Since the preallocation would then live right next to the store, just call store instead of preallocating. This effectively restores the potential error path of splitting and not munmap'ing which pre-dates the maple tree. Link: https://lkml.kernel.org/r/20230120162650.984577-12-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 17de99f31ff50..d46a798c38ab3 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2329,9 +2329,6 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, mt_init_flags(&mt_detach, MT_FLAGS_LOCK_EXTERN); mt_set_external_lock(&mt_detach, &mm->mmap_lock); - if (mas_preallocate(mas, GFP_KERNEL)) - return -ENOMEM; - mas->last = end - 1; /* * If we need to split any vma, do it now to save pain later. @@ -2422,8 +2419,6 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, goto userfaultfd_error; } - /* Point of no return */ - mas_set_range(mas, start, end - 1); #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) /* Make sure no VMAs are about to be lost. */ { @@ -2431,6 +2426,7 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, struct vm_area_struct *vma_mas, *vma_test; int test_count = 0; + mas_set_range(mas, start, end - 1); rcu_read_lock(); vma_test = mas_find(&test, end - 1); mas_for_each(mas, vma_mas, end - 1) { @@ -2440,10 +2436,13 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, } rcu_read_unlock(); BUG_ON(count != test_count); - mas_set_range(mas, start, end - 1); } #endif - mas_store_prealloc(mas, NULL); + /* Point of no return */ + mas_set_range(mas, start, end - 1); + if (mas_store_gfp(mas, NULL, GFP_KERNEL)) + return -ENOMEM; + mm->map_count -= count; /* * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or @@ -2475,7 +2474,6 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, __mt_destroy(&mt_detach); start_split_failed: map_count_exceeded: - mas_destroy(mas); return error; } From 20180ba011e911a3c4e895494df775245fb694a0 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:13 -0500 Subject: [PATCH 037/143] mmap: change do_mas_munmap and do_mas_aligned_munmap() to use vma iterator Start passing the vma iterator through the mm code. This will allow for reuse of the state and cleaner invalidation if necessary. Link: https://lkml.kernel.org/r/20230120162650.984577-13-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- mm/mmap.c | 77 +++++++++++++++++++++------------------------- mm/mremap.c | 6 ++-- 3 files changed, 39 insertions(+), 46 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 54a3f117b48fd..e46b8f8b6b355 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2911,7 +2911,7 @@ extern unsigned long mmap_region(struct file *file, unsigned long addr, extern unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long pgoff, unsigned long *populate, struct list_head *uf); -extern int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm, +extern int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf, bool downgrade); extern int do_munmap(struct mm_struct *, unsigned long, size_t, diff --git a/mm/mmap.c b/mm/mmap.c index d46a798c38ab3..5b83023ba6a6d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2305,8 +2305,8 @@ static inline int munmap_sidetree(struct vm_area_struct *vma, } /* - * do_mas_align_munmap() - munmap the aligned region from @start to @end. - * @mas: The maple_state, ideally set up to alter the correct tree location. + * do_vmi_align_munmap() - munmap the aligned region from @start to @end. + * @vmi: The vma iterator * @vma: The starting vm_area_struct * @mm: The mm_struct * @start: The aligned start address to munmap. @@ -2317,7 +2317,7 @@ static inline int munmap_sidetree(struct vm_area_struct *vma, * If @downgrade is true, check return code for potential release of the lock. */ static int -do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, +do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, struct mm_struct *mm, unsigned long start, unsigned long end, struct list_head *uf, bool downgrade) { @@ -2329,7 +2329,6 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, mt_init_flags(&mt_detach, MT_FLAGS_LOCK_EXTERN); mt_set_external_lock(&mt_detach, &mm->mmap_lock); - mas->last = end - 1; /* * If we need to split any vma, do it now to save pain later. * @@ -2349,27 +2348,23 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) goto map_count_exceeded; - /* - * mas_pause() is not needed since mas->index needs to be set - * differently than vma->vm_end anyways. - */ error = __split_vma(mm, vma, start, 0); if (error) goto start_split_failed; - mas_set(mas, start); - vma = mas_walk(mas); + vma_iter_set(vmi, start); + vma = vma_find(vmi, end); } - prev = mas_prev(mas, 0); + prev = vma_prev(vmi); if (unlikely((!prev))) - mas_set(mas, start); + vma_iter_set(vmi, start); /* * Detach a range of VMAs from the mm. Using next as a temp variable as * it is always overwritten. */ - mas_for_each(mas, next, end - 1) { + for_each_vma_range(*vmi, next, end) { /* Does it split the end? */ if (next->vm_end > end) { struct vm_area_struct *split; @@ -2378,8 +2373,8 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, if (error) goto end_split_failed; - mas_set(mas, end); - split = mas_prev(mas, 0); + vma_iter_set(vmi, end); + split = vma_prev(vmi); error = munmap_sidetree(split, &mas_detach); if (error) goto munmap_sidetree_failed; @@ -2401,7 +2396,7 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, } if (!next) - next = mas_next(mas, ULONG_MAX); + next = vma_next(vmi); if (unlikely(uf)) { /* @@ -2426,10 +2421,10 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, struct vm_area_struct *vma_mas, *vma_test; int test_count = 0; - mas_set_range(mas, start, end - 1); + vma_iter_set(vmi, start); rcu_read_lock(); vma_test = mas_find(&test, end - 1); - mas_for_each(mas, vma_mas, end - 1) { + for_each_vma_range(*vmi, vma_mas, end) { BUG_ON(vma_mas != vma_test); test_count++; vma_test = mas_next(&test, end - 1); @@ -2439,8 +2434,8 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, } #endif /* Point of no return */ - mas_set_range(mas, start, end - 1); - if (mas_store_gfp(mas, NULL, GFP_KERNEL)) + vma_iter_set(vmi, start); + if (vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL)) return -ENOMEM; mm->map_count -= count; @@ -2478,8 +2473,8 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, } /* - * do_mas_munmap() - munmap a given range. - * @mas: The maple state + * do_vmi_munmap() - munmap a given range. + * @vmi: The vma iterator * @mm: The mm_struct * @start: The start address to munmap * @len: The length of the range to munmap @@ -2493,7 +2488,7 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, * * Returns: -EINVAL on failure, 1 on success and unlock, 0 otherwise. */ -int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm, +int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf, bool downgrade) { @@ -2511,11 +2506,11 @@ int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm, arch_unmap(mm, start, end); /* Find the first overlapping VMA */ - vma = mas_find(mas, end - 1); + vma = vma_find(vmi, end); if (!vma) return 0; - return do_mas_align_munmap(mas, vma, mm, start, end, uf, downgrade); + return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, downgrade); } /* do_munmap() - Wrapper function for non-maple tree aware do_munmap() calls. @@ -2527,9 +2522,9 @@ int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm, int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf) { - MA_STATE(mas, &mm->mm_mt, start, start); + VMA_ITERATOR(vmi, mm, start); - return do_mas_munmap(&mas, mm, start, len, uf, false); + return do_vmi_munmap(&vmi, mm, start, len, uf, false); } unsigned long mmap_region(struct file *file, unsigned long addr, @@ -2545,7 +2540,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long merge_start = addr, merge_end = end; pgoff_t vm_pgoff; int error; - MA_STATE(mas, &mm->mm_mt, addr, end - 1); + VMA_ITERATOR(vmi, mm, addr); /* Check against address space limit. */ if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) { @@ -2563,7 +2558,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, } /* Unmap any existing mapping in the area */ - if (do_mas_munmap(&mas, mm, addr, len, uf, false)) + if (do_vmi_munmap(&vmi, mm, addr, len, uf, false)) return -ENOMEM; /* @@ -2576,8 +2571,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, vm_flags |= VM_ACCOUNT; } - next = mas_next(&mas, ULONG_MAX); - prev = mas_prev(&mas, 0); + next = vma_next(&vmi); + prev = vma_prev(&vmi); if (vm_flags & VM_SPECIAL) goto cannot_expand; @@ -2605,13 +2600,11 @@ unsigned long mmap_region(struct file *file, unsigned long addr, /* Actually expand, if possible */ if (vma && - !vma_expand(&mas, vma, merge_start, merge_end, vm_pgoff, next)) { + !vma_expand(&vmi.mas, vma, merge_start, merge_end, vm_pgoff, next)) { khugepaged_enter_vma(vma, vm_flags); goto expanded; } - mas.index = addr; - mas.last = end - 1; cannot_expand: /* * Determine the object being mapped and call the appropriate @@ -2650,7 +2643,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, error = -EINVAL; goto close_and_free_vma; } - mas_reset(&mas); + vma_iter_set(&vmi, addr); /* * If vm_flags changed after call_mmap(), we should try merge @@ -2706,7 +2699,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, goto free_vma; } - if (mas_preallocate(&mas, GFP_KERNEL)) { + if (vma_iter_prealloc(&vmi)) { error = -ENOMEM; if (file) goto close_and_free_vma; @@ -2719,7 +2712,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, if (vma->vm_file) i_mmap_lock_write(vma->vm_file->f_mapping); - vma_mas_store(vma, &mas); + vma_iter_store(&vmi, vma); mm->map_count++; if (vma->vm_file) { if (vma->vm_flags & VM_SHARED) @@ -2780,7 +2773,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, vma->vm_file = NULL; /* Undo any partial mapping done by a device driver. */ - unmap_region(mm, mas.tree, vma, prev, next, vma->vm_start, vma->vm_end); + unmap_region(mm, &mm->mm_mt, vma, prev, next, vma->vm_start, vma->vm_end); if (file && (vm_flags & VM_SHARED)) mapping_unmap_writable(file->f_mapping); free_vma: @@ -2797,12 +2790,12 @@ static int __vm_munmap(unsigned long start, size_t len, bool downgrade) int ret; struct mm_struct *mm = current->mm; LIST_HEAD(uf); - MA_STATE(mas, &mm->mm_mt, start, start); + VMA_ITERATOR(vmi, mm, start); if (mmap_write_lock_killable(mm)) return -EINTR; - ret = do_mas_munmap(&mas, mm, start, len, &uf, downgrade); + ret = do_vmi_munmap(&vmi, mm, start, len, &uf, downgrade); /* * Returning 1 indicates mmap_lock is downgraded. * But 1 is not legal return value of vm_munmap() and munmap(), reset @@ -2934,7 +2927,7 @@ static int do_brk_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, int ret; arch_unmap(mm, newbrk, oldbrk); - ret = do_mas_align_munmap(&vmi->mas, vma, mm, newbrk, oldbrk, uf, true); + ret = do_vmi_align_munmap(vmi, vma, mm, newbrk, oldbrk, uf, true); validate_mm_mt(mm); return ret; } @@ -3057,7 +3050,7 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) if (ret) goto limits_failed; - ret = do_mas_munmap(&vmi.mas, mm, addr, len, &uf, 0); + ret = do_vmi_munmap(&vmi, mm, addr, len, &uf, 0); if (ret) goto munmap_failed; diff --git a/mm/mremap.c b/mm/mremap.c index 05f90f47e1494..3cc64c3f8bdb3 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -978,14 +978,14 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, /* * Always allow a shrinking remap: that just unmaps * the unnecessary pages.. - * do_mas_munmap does all the needed commit accounting, and + * do_vmi_munmap does all the needed commit accounting, and * downgrades mmap_lock to read if so directed. */ if (old_len >= new_len) { int retval; - MA_STATE(mas, &mm->mm_mt, addr + new_len, addr + new_len); + VMA_ITERATOR(vmi, mm, addr + new_len); - retval = do_mas_munmap(&mas, mm, addr + new_len, + retval = do_vmi_munmap(&vmi, mm, addr + new_len, old_len - new_len, &uf_unmap, true); /* Returning 1 indicates mmap_lock is downgraded to read. */ if (retval == 1) { From c3e752d9436daa83c150b011baba4f018bf86b02 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:14 -0500 Subject: [PATCH 038/143] mmap: convert vma_expand() to use vma iterator Use the vma iterator instead of the maple state for type safety and for consistency through the mm code. Link: https://lkml.kernel.org/r/20230120162650.984577-14-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 5b83023ba6a6d..7e406416cf47f 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -527,7 +527,7 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) * * Returns: 0 on success */ -inline int vma_expand(struct ma_state *mas, struct vm_area_struct *vma, +inline int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff, struct vm_area_struct *next) { @@ -556,7 +556,7 @@ inline int vma_expand(struct ma_state *mas, struct vm_area_struct *vma, /* Only handles expanding */ VM_BUG_ON(vma->vm_start < start || vma->vm_end > end); - if (mas_preallocate(mas, GFP_KERNEL)) + if (vma_iter_prealloc(vmi)) goto nomem; vma_adjust_trans_huge(vma, start, end, 0); @@ -581,8 +581,7 @@ inline int vma_expand(struct ma_state *mas, struct vm_area_struct *vma, vma->vm_start = start; vma->vm_end = end; vma->vm_pgoff = pgoff; - /* Note: mas must be pointing to the expanding VMA */ - vma_mas_store(vma, mas); + vma_iter_store(vmi, vma); if (file) { vma_interval_tree_insert(vma, root); @@ -2600,7 +2599,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, /* Actually expand, if possible */ if (vma && - !vma_expand(&vmi.mas, vma, merge_start, merge_end, vm_pgoff, next)) { + !vma_expand(&vmi, vma, merge_start, merge_end, vm_pgoff, next)) { khugepaged_enter_vma(vma, vm_flags); goto expanded; } From d8d04862842ea460055cdbee8d11f172a70a0bfa Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:15 -0500 Subject: [PATCH 039/143] mm: add temporary vma iterator versions of vma_merge(), split_vma(), and __split_vma() These wrappers are short-lived in this patch set so that each user can be converted on its own. In the end, these functions are renamed in one commit. Link: https://lkml.kernel.org/r/20230120162650.984577-15-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- include/linux/mm.h | 11 ++++++++++- mm/mmap.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index e46b8f8b6b355..ae89f1b9497f0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2849,11 +2849,20 @@ extern struct vm_area_struct *vma_merge(struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, struct mempolicy *, struct vm_userfaultfd_ctx, struct anon_vma_name *); +extern struct vm_area_struct *vmi_vma_merge(struct vma_iterator *vmi, + struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, + unsigned long end, unsigned long vm_flags, struct anon_vma *, + struct file *, pgoff_t, struct mempolicy *, struct vm_userfaultfd_ctx, + struct anon_vma_name *); extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); extern int __split_vma(struct mm_struct *, struct vm_area_struct *, - unsigned long addr, int new_below); + unsigned long addr, int new_below); +extern int vmi__split_vma(struct vma_iterator *vmi, struct mm_struct *, + struct vm_area_struct *, unsigned long addr, int new_below); extern int split_vma(struct mm_struct *, struct vm_area_struct *, unsigned long addr, int new_below); +extern int vmi_split_vma(struct vma_iterator *vmi, struct mm_struct *, + struct vm_area_struct *, unsigned long addr, int new_below); extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); extern void unlink_file_vma(struct vm_area_struct *); extern struct vm_area_struct *copy_vma(struct vm_area_struct **, diff --git a/mm/mmap.c b/mm/mmap.c index 7e406416cf47f..894017841d5d7 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1091,6 +1091,25 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, return res; } +struct vm_area_struct *vmi_vma_merge(struct vma_iterator *vmi, + struct mm_struct *mm, + struct vm_area_struct *prev, unsigned long addr, + unsigned long end, unsigned long vm_flags, + struct anon_vma *anon_vma, struct file *file, + pgoff_t pgoff, struct mempolicy *policy, + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + struct anon_vma_name *anon_name) +{ + struct vm_area_struct *tmp; + + tmp = vma_merge(mm, prev, addr, end, vm_flags, anon_vma, file, pgoff, + policy, vm_userfaultfd_ctx, anon_name); + if (tmp) + vma_iter_set(vmi, end); + + return tmp; +} + /* * Rough compatibility check to quickly see if it's even worth looking * at sharing an anon_vma. @@ -2276,6 +2295,18 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, validate_mm_mt(mm); return err; } +int vmi__split_vma(struct vma_iterator *vmi, struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long addr, int new_below) +{ + int ret; + unsigned long end = vma->vm_end; + + ret = __split_vma(mm, vma, addr, new_below); + if (!ret) + vma_iter_set(vmi, end); + + return ret; +} /* * Split a vma into two pieces at address 'addr', a new vma is allocated @@ -2290,6 +2321,19 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, return __split_vma(mm, vma, addr, new_below); } +int vmi_split_vma(struct vma_iterator *vmi, struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long addr, int new_below) +{ + int ret; + unsigned long end = vma->vm_end; + + ret = split_vma(mm, vma, addr, new_below); + if (!ret) + vma_iter_set(vmi, end); + + return ret; +} + static inline int munmap_sidetree(struct vm_area_struct *vma, struct ma_state *mas_detach) { From 3be0b9277bdcb52fd0f73c00891764111079c776 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:16 -0500 Subject: [PATCH 040/143] ipc/shm: use the vma iterator for munmap calls Pass through the vma iterator to do_vmi_munmap() to handle the iterator state internally Link: https://lkml.kernel.org/r/20230120162650.984577-16-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- ipc/shm.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/ipc/shm.c b/ipc/shm.c index bd2fcc4d454e0..1c6a6b319a49e 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -1786,8 +1786,8 @@ long ksys_shmdt(char __user *shmaddr) */ file = vma->vm_file; size = i_size_read(file_inode(vma->vm_file)); - do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL); - mas_pause(&vmi.mas); + do_vmi_munmap(&vmi, mm, vma->vm_start, + vma->vm_end - vma->vm_start, NULL, false); /* * We discovered the size of the shm segment, so * break out of here and fall through to the next @@ -1810,10 +1810,9 @@ long ksys_shmdt(char __user *shmaddr) /* finding a matching vma now does not alter retval */ if ((vma->vm_ops == &shm_vm_ops) && ((vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) && - (vma->vm_file == file)) { - do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL); - mas_pause(&vmi.mas); - } + (vma->vm_file == file)) + do_vmi_munmap(&vmi, mm, vma->vm_start, + vma->vm_end - vma->vm_start, NULL, false); vma = vma_next(&vmi); } From a256ad0ca0b660cc3a9d21432bdff1c708f2c4ff Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Thu, 26 Jan 2023 16:20:49 -0500 Subject: [PATCH 041/143] ipc/shm: introduce new do_vma_munmap() to munmap The shm already has the vma iterator in position for a write. do_vmi_munmap() searches for the correct position and aligns the write, so it is not the right function to use in this case. The shm VMA tree modification is similar to the brk munmap situation, the vma iterator is in position and the VMA is already known. This patch generalizes the brk munmap function do_brk_munmap() to be used for any other callers with the vma iterator already in position to munmap a VMA. Link: https://lkml.kernel.org/r/20230126212049.980501-1-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Reported-by: Sven Schnelle Link: https://lore.kernel.org/linux-mm/yt9dh6wec21a.fsf@linux.ibm.com/ Cc: Arnd Bergmann Signed-off-by: Andrew Morton --- include/linux/mm.h | 3 +++ ipc/shm.c | 11 ++++++----- mm/mmap.c | 38 ++++++++++++++++++-------------------- 3 files changed, 27 insertions(+), 25 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index ae89f1b9497f0..6cb7c24f2cf2b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2928,6 +2928,9 @@ extern int do_munmap(struct mm_struct *, unsigned long, size_t, extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior); #ifdef CONFIG_MMU +extern int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct list_head *uf, bool downgrade); extern int __mm_populate(unsigned long addr, unsigned long len, int ignore_errors); static inline void mm_populate(unsigned long addr, unsigned long len) diff --git a/ipc/shm.c b/ipc/shm.c index 1c6a6b319a49e..60e45e7045d44 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -1786,8 +1786,8 @@ long ksys_shmdt(char __user *shmaddr) */ file = vma->vm_file; size = i_size_read(file_inode(vma->vm_file)); - do_vmi_munmap(&vmi, mm, vma->vm_start, - vma->vm_end - vma->vm_start, NULL, false); + do_vma_munmap(&vmi, vma, vma->vm_start, vma->vm_end, + NULL, false); /* * We discovered the size of the shm segment, so * break out of here and fall through to the next @@ -1810,9 +1810,10 @@ long ksys_shmdt(char __user *shmaddr) /* finding a matching vma now does not alter retval */ if ((vma->vm_ops == &shm_vm_ops) && ((vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) && - (vma->vm_file == file)) - do_vmi_munmap(&vmi, mm, vma->vm_start, - vma->vm_end - vma->vm_start, NULL, false); + (vma->vm_file == file)) { + do_vma_munmap(&vmi, vma, vma->vm_start, vma->vm_end, + NULL, false); + } vma = vma_next(&vmi); } diff --git a/mm/mmap.c b/mm/mmap.c index 894017841d5d7..408e9cc47333c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -180,9 +180,6 @@ static int check_brk_limits(unsigned long addr, unsigned long len) return mlock_future_check(current->mm, current->mm->def_flags, len); } -static int do_brk_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long newbrk, unsigned long oldbrk, - struct list_head *uf); static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma, unsigned long addr, unsigned long request, unsigned long flags); SYSCALL_DEFINE1(brk, unsigned long, brk) @@ -236,7 +233,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) /* * Always allow shrinking brk. - * do_brk_munmap() may downgrade mmap_lock to read. + * do_vma_munmap() may downgrade mmap_lock to read. */ if (brk <= mm->brk) { int ret; @@ -248,11 +245,11 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) goto out; /* mapping intersects with an existing non-brk vma. */ /* * mm->brk must be protected by write mmap_lock. - * do_brk_munmap() may downgrade the lock, so update it - * before calling do_brk_munmap(). + * do_vma_munmap() may downgrade the lock, so update it + * before calling do_vma_munmap(). */ mm->brk = brk; - ret = do_brk_munmap(&vmi, brkvma, newbrk, oldbrk, &uf); + ret = do_vma_munmap(&vmi, brkvma, newbrk, oldbrk, &uf, true); if (ret == 1) { downgraded = true; goto success; @@ -2951,26 +2948,27 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, } /* - * brk_munmap() - Unmap a full or partial vma. - * @vmi: The vma iterator - * @vma: The vma to be modified - * @newbrk: the start of the address to unmap - * @oldbrk: The end of the address to unmap + * do_vma_munmap() - Unmap a full or partial vma. + * @vmi: The vma iterator pointing at the vma + * @vma: The first vma to be munmapped + * @start: the start of the address to unmap + * @end: The end of the address to unmap * @uf: The userfaultfd list_head + * @downgrade: Attempt to downgrade or not * - * Returns: 1 on success. - * unmaps a partial VMA mapping. Does not handle alignment, downgrades lock if - * possible. + * Returns: 0 on success and not downgraded, 1 on success and downgraded. + * unmaps a VMA mapping when the vma iterator is already in position. + * Does not handle alignment. */ -static int do_brk_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long newbrk, unsigned long oldbrk, - struct list_head *uf) +int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct list_head *uf, bool downgrade) { struct mm_struct *mm = vma->vm_mm; int ret; - arch_unmap(mm, newbrk, oldbrk); - ret = do_vmi_align_munmap(vmi, vma, mm, newbrk, oldbrk, uf, true); + arch_unmap(mm, start, end); + ret = do_vmi_align_munmap(vmi, vma, mm, start, end, uf, downgrade); validate_mm_mt(mm); return ret; } From 4f1f7c046d34ac46c3c2dbdad64772b0174ebfcd Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:17 -0500 Subject: [PATCH 042/143] userfaultfd: use vma iterator Use the vma iterator so that the iterator can be invalidated or updated to avoid each caller doing so. Link: https://lkml.kernel.org/r/20230120162650.984577-17-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 87 ++++++++++++++++++------------------------------ 1 file changed, 33 insertions(+), 54 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 15a5bf765d432..4334bd35984d6 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -883,7 +883,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) /* len == 0 means wake all */ struct userfaultfd_wake_range range = { .len = 0, }; unsigned long new_flags; - MA_STATE(mas, &mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, mm, 0); WRITE_ONCE(ctx->released, true); @@ -900,7 +900,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) */ mmap_write_lock(mm); prev = NULL; - mas_for_each(&mas, vma, ULONG_MAX) { + for_each_vma(vmi, vma) { cond_resched(); BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^ !!(vma->vm_flags & __VM_UFFD_FLAGS)); @@ -909,13 +909,12 @@ static int userfaultfd_release(struct inode *inode, struct file *file) continue; } new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; - prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end, + prev = vmi_vma_merge(&vmi, mm, prev, vma->vm_start, vma->vm_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), NULL_VM_UFFD_CTX, anon_vma_name(vma)); if (prev) { - mas_pause(&mas); vma = prev; } else { prev = vma; @@ -1302,7 +1301,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, bool found; bool basic_ioctls; unsigned long start, end, vma_end; - MA_STATE(mas, &mm->mm_mt, 0, 0); + struct vma_iterator vmi; user_uffdio_register = (struct uffdio_register __user *) arg; @@ -1344,17 +1343,13 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, if (!mmget_not_zero(mm)) goto out; + ret = -EINVAL; mmap_write_lock(mm); - mas_set(&mas, start); - vma = mas_find(&mas, ULONG_MAX); + vma_iter_init(&vmi, mm, start); + vma = vma_find(&vmi, end); if (!vma) goto out_unlock; - /* check that there's at least one vma in the range */ - ret = -EINVAL; - if (vma->vm_start >= end) - goto out_unlock; - /* * If the first vma contains huge pages, make sure start address * is aligned to huge page size. @@ -1371,7 +1366,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, */ found = false; basic_ioctls = false; - for (cur = vma; cur; cur = mas_next(&mas, end - 1)) { + cur = vma; + do { cond_resched(); BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ @@ -1428,16 +1424,14 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, basic_ioctls = true; found = true; - } + } for_each_vma_range(vmi, cur, end); BUG_ON(!found); - mas_set(&mas, start); - prev = mas_prev(&mas, 0); - if (prev != vma) - mas_next(&mas, ULONG_MAX); + vma_iter_set(&vmi, start); + prev = vma_prev(&vmi); ret = 0; - do { + for_each_vma_range(vmi, vma, end) { cond_resched(); BUG_ON(!vma_can_userfault(vma, vm_flags)); @@ -1458,30 +1452,25 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, vma_end = min(end, vma->vm_end); new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags; - prev = vma_merge(mm, prev, start, vma_end, new_flags, + prev = vmi_vma_merge(&vmi, mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), ((struct vm_userfaultfd_ctx){ ctx }), anon_vma_name(vma)); if (prev) { /* vma_merge() invalidated the mas */ - mas_pause(&mas); vma = prev; goto next; } if (vma->vm_start < start) { - ret = split_vma(mm, vma, start, 1); + ret = vmi_split_vma(&vmi, mm, vma, start, 1); if (ret) break; - /* split_vma() invalidated the mas */ - mas_pause(&mas); } if (vma->vm_end > end) { - ret = split_vma(mm, vma, end, 0); + ret = vmi_split_vma(&vmi, mm, vma, end, 0); if (ret) break; - /* split_vma() invalidated the mas */ - mas_pause(&mas); } next: /* @@ -1498,8 +1487,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, skip: prev = vma; start = vma->vm_end; - vma = mas_next(&mas, end - 1); - } while (vma); + } + out_unlock: mmap_write_unlock(mm); mmput(mm); @@ -1543,7 +1532,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, bool found; unsigned long start, end, vma_end; const void __user *buf = (void __user *)arg; - MA_STATE(mas, &mm->mm_mt, 0, 0); + struct vma_iterator vmi; ret = -EFAULT; if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister))) @@ -1562,14 +1551,10 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, goto out; mmap_write_lock(mm); - mas_set(&mas, start); - vma = mas_find(&mas, ULONG_MAX); - if (!vma) - goto out_unlock; - - /* check that there's at least one vma in the range */ ret = -EINVAL; - if (vma->vm_start >= end) + vma_iter_init(&vmi, mm, start); + vma = vma_find(&vmi, end); + if (!vma) goto out_unlock; /* @@ -1587,8 +1572,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, * Search for not compatible vmas. */ found = false; - ret = -EINVAL; - for (cur = vma; cur; cur = mas_next(&mas, end - 1)) { + cur = vma; + do { cond_resched(); BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ @@ -1605,16 +1590,13 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, goto out_unlock; found = true; - } + } for_each_vma_range(vmi, cur, end); BUG_ON(!found); - mas_set(&mas, start); - prev = mas_prev(&mas, 0); - if (prev != vma) - mas_next(&mas, ULONG_MAX); - + vma_iter_set(&vmi, start); + prev = vma_prev(&vmi); ret = 0; - do { + for_each_vma_range(vmi, vma, end) { cond_resched(); BUG_ON(!vma_can_userfault(vma, vma->vm_flags)); @@ -1650,26 +1632,23 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, uffd_wp_range(mm, vma, start, vma_end - start, false); new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; - prev = vma_merge(mm, prev, start, vma_end, new_flags, + prev = vmi_vma_merge(&vmi, mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), NULL_VM_UFFD_CTX, anon_vma_name(vma)); if (prev) { vma = prev; - mas_pause(&mas); goto next; } if (vma->vm_start < start) { - ret = split_vma(mm, vma, start, 1); + ret = vmi_split_vma(&vmi, mm, vma, start, 1); if (ret) break; - mas_pause(&mas); } if (vma->vm_end > end) { - ret = split_vma(mm, vma, end, 0); + ret = vmi_split_vma(&vmi, mm, vma, end, 0); if (ret) break; - mas_pause(&mas); } next: /* @@ -1683,8 +1662,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, skip: prev = vma; start = vma->vm_end; - vma = mas_next(&mas, end - 1); - } while (vma); + } + out_unlock: mmap_write_unlock(mm); mmput(mm); From 7b241693f7d07ce5436928ac0d0ab08443b3ab26 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:18 -0500 Subject: [PATCH 043/143] mm: change mprotect_fixup to vma iterator Use the vma iterator so that the iterator can be invalidated or updated to avoid each caller doing so. Link: https://lkml.kernel.org/r/20230120162650.984577-18-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- fs/exec.c | 5 ++++- include/linux/mm.h | 6 +++--- mm/mprotect.c | 47 ++++++++++++++++++++++------------------------ 3 files changed, 29 insertions(+), 29 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index ab913243a367b..b98647eeae9fe 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -758,6 +758,7 @@ int setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_expand; unsigned long rlim_stack; struct mmu_gather tlb; + struct vma_iterator vmi; #ifdef CONFIG_STACK_GROWSUP /* Limit stack size */ @@ -812,8 +813,10 @@ int setup_arg_pages(struct linux_binprm *bprm, vm_flags |= mm->def_flags; vm_flags |= VM_STACK_INCOMPLETE_SETUP; + vma_iter_init(&vmi, mm, vma->vm_start); + tlb_gather_mmu(&tlb, mm); - ret = mprotect_fixup(&tlb, vma, &prev, vma->vm_start, vma->vm_end, + ret = mprotect_fixup(&vmi, &tlb, vma, &prev, vma->vm_start, vma->vm_end, vm_flags); tlb_finish_mmu(&tlb); diff --git a/include/linux/mm.h b/include/linux/mm.h index 6cb7c24f2cf2b..843e8ada86efb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2203,9 +2203,9 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, extern long change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long cp_flags); -extern int mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma, - struct vm_area_struct **pprev, unsigned long start, - unsigned long end, unsigned long newflags); +extern int mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, + struct vm_area_struct *vma, struct vm_area_struct **pprev, + unsigned long start, unsigned long end, unsigned long newflags); /* * doesn't attempt to fault and will return short. diff --git a/mm/mprotect.c b/mm/mprotect.c index 6a22f3ad9b84d..39b6335b8813b 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -585,9 +585,9 @@ static const struct mm_walk_ops prot_none_walk_ops = { }; int -mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma, - struct vm_area_struct **pprev, unsigned long start, - unsigned long end, unsigned long newflags) +mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, + struct vm_area_struct *vma, struct vm_area_struct **pprev, + unsigned long start, unsigned long end, unsigned long newflags) { struct mm_struct *mm = vma->vm_mm; unsigned long oldflags = vma->vm_flags; @@ -642,7 +642,7 @@ mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma, * First try to merge with previous and/or next vma. */ pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - *pprev = vma_merge(mm, *pprev, start, end, newflags, + *pprev = vmi_vma_merge(vmi, mm, *pprev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (*pprev) { @@ -654,13 +654,13 @@ mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma, *pprev = vma; if (start != vma->vm_start) { - error = split_vma(mm, vma, start, 1); + error = vmi_split_vma(vmi, mm, vma, start, 1); if (error) goto fail; } if (end != vma->vm_end) { - error = split_vma(mm, vma, end, 0); + error = vmi_split_vma(vmi, mm, vma, end, 0); if (error) goto fail; } @@ -709,7 +709,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, const bool rier = (current->personality & READ_IMPLIES_EXEC) && (prot & PROT_READ); struct mmu_gather tlb; - MA_STATE(mas, ¤t->mm->mm_mt, 0, 0); + struct vma_iterator vmi; start = untagged_addr(start); @@ -741,8 +741,8 @@ static int do_mprotect_pkey(unsigned long start, size_t len, if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey)) goto out; - mas_set(&mas, start); - vma = mas_find(&mas, ULONG_MAX); + vma_iter_init(&vmi, current->mm, start); + vma = vma_find(&vmi, end); error = -ENOMEM; if (!vma) goto out; @@ -765,18 +765,22 @@ static int do_mprotect_pkey(unsigned long start, size_t len, } } + prev = vma_prev(&vmi); if (start > vma->vm_start) prev = vma; - else - prev = mas_prev(&mas, 0); tlb_gather_mmu(&tlb, current->mm); - for (nstart = start ; ; ) { + nstart = start; + tmp = vma->vm_start; + for_each_vma_range(vmi, vma, end) { unsigned long mask_off_old_flags; unsigned long newflags; int new_vma_pkey; - /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ + if (vma->vm_start != tmp) { + error = -ENOMEM; + break; + } /* Does the application expect PROT_READ to imply PROT_EXEC */ if (rier && (vma->vm_flags & VM_MAYEXEC)) @@ -824,25 +828,18 @@ static int do_mprotect_pkey(unsigned long start, size_t len, break; } - error = mprotect_fixup(&tlb, vma, &prev, nstart, tmp, newflags); + error = mprotect_fixup(&vmi, &tlb, vma, &prev, nstart, tmp, newflags); if (error) break; nstart = tmp; - - if (nstart < prev->vm_end) - nstart = prev->vm_end; - if (nstart >= end) - break; - - vma = find_vma(current->mm, prev->vm_end); - if (!vma || vma->vm_start != nstart) { - error = -ENOMEM; - break; - } prot = reqprot; } tlb_finish_mmu(&tlb); + + if (vma_iter_end(&vmi) < end) + error = -ENOMEM; + out: mmap_write_unlock(current->mm); return error; From 1eed48f873fe0f90c8e55008ef0ab41d6b5f2c02 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:19 -0500 Subject: [PATCH 044/143] mlock: convert mlock to vma iterator Use the vma iterator so that the iterator can be invalidated or updated to avoid each caller doing so. Link: https://lkml.kernel.org/r/20230120162650.984577-19-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mlock.c | 57 +++++++++++++++++++++++++++--------------------------- 1 file changed, 28 insertions(+), 29 deletions(-) diff --git a/mm/mlock.c b/mm/mlock.c index b680f11879c3d..0d09b9070071c 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -401,8 +401,9 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma, * * For vmas that pass the filters, merge/split as appropriate. */ -static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, - unsigned long start, unsigned long end, vm_flags_t newflags) +static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, + struct vm_area_struct **prev, unsigned long start, + unsigned long end, vm_flags_t newflags) { struct mm_struct *mm = vma->vm_mm; pgoff_t pgoff; @@ -417,22 +418,22 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, goto out; pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, - vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, anon_vma_name(vma)); + *prev = vmi_vma_merge(vmi, mm, *prev, start, end, newflags, + vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (*prev) { vma = *prev; goto success; } if (start != vma->vm_start) { - ret = split_vma(mm, vma, start, 1); + ret = vmi_split_vma(vmi, mm, vma, start, 1); if (ret) goto out; } if (end != vma->vm_end) { - ret = split_vma(mm, vma, end, 0); + ret = vmi_split_vma(vmi, mm, vma, end, 0); if (ret) goto out; } @@ -471,7 +472,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, unsigned long nstart, end, tmp; struct vm_area_struct *vma, *prev; int error; - MA_STATE(mas, ¤t->mm->mm_mt, start, start); + VMA_ITERATOR(vmi, current->mm, start); VM_BUG_ON(offset_in_page(start)); VM_BUG_ON(len != PAGE_ALIGN(len)); @@ -480,39 +481,37 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, return -EINVAL; if (end == start) return 0; - vma = mas_walk(&mas); + vma = vma_iter_load(&vmi); if (!vma) return -ENOMEM; + prev = vma_prev(&vmi); if (start > vma->vm_start) prev = vma; - else - prev = mas_prev(&mas, 0); - for (nstart = start ; ; ) { - vm_flags_t newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; + nstart = start; + tmp = vma->vm_start; + for_each_vma_range(vmi, vma, end) { + vm_flags_t newflags; - newflags |= flags; + if (vma->vm_start != tmp) + return -ENOMEM; + newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; + newflags |= flags; /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ tmp = vma->vm_end; if (tmp > end) tmp = end; - error = mlock_fixup(vma, &prev, nstart, tmp, newflags); + error = mlock_fixup(&vmi, vma, &prev, nstart, tmp, newflags); if (error) break; nstart = tmp; - if (nstart < prev->vm_end) - nstart = prev->vm_end; - if (nstart >= end) - break; - - vma = find_vma(prev->vm_mm, prev->vm_end); - if (!vma || vma->vm_start != nstart) { - error = -ENOMEM; - break; - } } + + if (vma_iter_end(&vmi) < end) + return -ENOMEM; + return error; } @@ -658,7 +657,7 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) */ static int apply_mlockall_flags(int flags) { - MA_STATE(mas, ¤t->mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, current->mm, 0); struct vm_area_struct *vma, *prev = NULL; vm_flags_t to_add = 0; @@ -679,15 +678,15 @@ static int apply_mlockall_flags(int flags) to_add |= VM_LOCKONFAULT; } - mas_for_each(&mas, vma, ULONG_MAX) { + for_each_vma(vmi, vma) { vm_flags_t newflags; newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; newflags |= to_add; /* Ignore errors */ - mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); - mas_pause(&mas); + mlock_fixup(&vmi, vma, &prev, vma->vm_start, vma->vm_end, + newflags); cond_resched(); } out: From 7851626868879736181ecf3318267c3617705e68 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:20 -0500 Subject: [PATCH 045/143] coredump: convert to vma iterator Use the vma iterator so that the iterator can be invalidated or updated to avoid each caller doing so. Link: https://lkml.kernel.org/r/20230120162650.984577-20-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- fs/coredump.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index de78bde2991be..f27d734f31029 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -1111,14 +1111,14 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, * Helper function for iterating across a vma list. It ensures that the caller * will visit `gate_vma' prior to terminating the search. */ -static struct vm_area_struct *coredump_next_vma(struct ma_state *mas, +static struct vm_area_struct *coredump_next_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct *gate_vma) { if (gate_vma && (vma == gate_vma)) return NULL; - vma = mas_next(mas, ULONG_MAX); + vma = vma_next(vmi); if (vma) return vma; return gate_vma; @@ -1146,7 +1146,7 @@ static bool dump_vma_snapshot(struct coredump_params *cprm) { struct vm_area_struct *gate_vma, *vma = NULL; struct mm_struct *mm = current->mm; - MA_STATE(mas, &mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, mm, 0); int i = 0; /* @@ -1167,7 +1167,7 @@ static bool dump_vma_snapshot(struct coredump_params *cprm) return false; } - while ((vma = coredump_next_vma(&mas, vma, gate_vma)) != NULL) { + while ((vma = coredump_next_vma(&vmi, vma, gate_vma)) != NULL) { struct core_vma_metadata *m = cprm->vma_meta + i; m->start = vma->vm_start; From abb27118682a9b0b127f71841fee591049a20223 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:21 -0500 Subject: [PATCH 046/143] mempolicy: convert to vma iterator Use the vma iterator so that the iterator can be invalidated or updated to avoid each caller doing so. Link: https://lkml.kernel.org/r/20230120162650.984577-21-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mempolicy.c | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 2202e76192b29..4f627b1bf0a1b 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -787,24 +787,21 @@ static int vma_replace_policy(struct vm_area_struct *vma, static int mbind_range(struct mm_struct *mm, unsigned long start, unsigned long end, struct mempolicy *new_pol) { - MA_STATE(mas, &mm->mm_mt, start, start); + VMA_ITERATOR(vmi, mm, start); struct vm_area_struct *prev; struct vm_area_struct *vma; int err = 0; pgoff_t pgoff; - prev = mas_prev(&mas, 0); - if (unlikely(!prev)) - mas_set(&mas, start); - - vma = mas_find(&mas, end - 1); + prev = vma_prev(&vmi); + vma = vma_find(&vmi, end); if (WARN_ON(!vma)) return 0; if (start > vma->vm_start) prev = vma; - for (; vma; vma = mas_next(&mas, end - 1)) { + do { unsigned long vmstart = max(start, vma->vm_start); unsigned long vmend = min(end, vma->vm_end); @@ -813,29 +810,23 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT); - prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, + prev = vmi_vma_merge(&vmi, mm, prev, vmstart, vmend, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, new_pol, vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (prev) { - /* vma_merge() invalidated the mas */ - mas_pause(&mas); vma = prev; goto replace; } if (vma->vm_start != vmstart) { - err = split_vma(vma->vm_mm, vma, vmstart, 1); + err = vmi_split_vma(&vmi, vma->vm_mm, vma, vmstart, 1); if (err) goto out; - /* split_vma() invalidated the mas */ - mas_pause(&mas); } if (vma->vm_end != vmend) { - err = split_vma(vma->vm_mm, vma, vmend, 0); + err = vmi_split_vma(&vmi, vma->vm_mm, vma, vmend, 0); if (err) goto out; - /* split_vma() invalidated the mas */ - mas_pause(&mas); } replace: err = vma_replace_policy(vma, new_pol); @@ -843,7 +834,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, goto out; next: prev = vma; - } + } for_each_vma_range(vmi, vma, end); out: return err; From ac93d821e1298093eabf64032cb9cc2df0e5c84c Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:22 -0500 Subject: [PATCH 047/143] task_mmu: convert to vma iterator Use the vma iterator so that the iterator can be invalidated or updated to avoid each caller doing so. Update the comments to how the vma iterator works. The vma iterator will keep track of the last vm_end and start the search from vm_end + 1. Link: https://lkml.kernel.org/r/20230120162650.984577-22-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index a44339a77a75e..a944e1816364d 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -890,7 +890,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v) struct vm_area_struct *vma; unsigned long vma_start = 0, last_vma_end = 0; int ret = 0; - MA_STATE(mas, &mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, mm, 0); priv->task = get_proc_task(priv->inode); if (!priv->task) @@ -908,7 +908,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v) goto out_put_mm; hold_task_mempolicy(priv); - vma = mas_find(&mas, ULONG_MAX); + vma = vma_next(&vmi); if (unlikely(!vma)) goto empty_set; @@ -923,7 +923,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v) * access it for write request. */ if (mmap_lock_is_contended(mm)) { - mas_pause(&mas); + vma_iter_invalidate(&vmi); mmap_read_unlock(mm); ret = mmap_read_lock_killable(mm); if (ret) { @@ -948,31 +948,31 @@ static int show_smaps_rollup(struct seq_file *m, void *v) * * 1) VMA2 is freed, but VMA3 exists: * - * find_vma(mm, 16k - 1) will return VMA3. + * vma_next(vmi) will return VMA3. * In this case, just continue from VMA3. * * 2) VMA2 still exists: * - * find_vma(mm, 16k - 1) will return VMA2. - * Iterate the loop like the original one. + * vma_next(vmi) will return VMA3. + * In this case, just continue from VMA3. * * 3) No more VMAs can be found: * - * find_vma(mm, 16k - 1) will return NULL. + * vma_next(vmi) will return NULL. * No more things to do, just break. * * 4) (last_vma_end - 1) is the middle of a vma (VMA'): * - * find_vma(mm, 16k - 1) will return VMA' whose range + * vma_next(vmi) will return VMA' whose range * contains last_vma_end. * Iterate VMA' from last_vma_end. */ - vma = mas_find(&mas, ULONG_MAX); + vma = vma_next(&vmi); /* Case 3 above */ if (!vma) break; - /* Case 1 above */ + /* Case 1 and 2 above */ if (vma->vm_start >= last_vma_end) continue; @@ -980,8 +980,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v) if (vma->vm_end > last_vma_end) smap_gather_stats(vma, &mss, last_vma_end); } - /* Case 2 above */ - } while ((vma = mas_find(&mas, ULONG_MAX)) != NULL); + } for_each_vma(vmi, vma); empty_set: show_vma_header_prefix(m, vma_start, last_vma_end, 0, 0, 0, 0); @@ -1277,7 +1276,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, return -ESRCH; mm = get_task_mm(task); if (mm) { - MA_STATE(mas, &mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, mm, 0); struct mmu_notifier_range range; struct clear_refs_private cp = { .type = type, @@ -1297,7 +1296,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, } if (type == CLEAR_REFS_SOFT_DIRTY) { - mas_for_each(&mas, vma, ULONG_MAX) { + for_each_vma(vmi, vma) { if (!(vma->vm_flags & VM_SOFTDIRTY)) continue; vma->vm_flags &= ~VM_SOFTDIRTY; From 119a68c1a73d14ed9cd27093d4ddd41d854387e9 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:23 -0500 Subject: [PATCH 048/143] sched: convert to vma iterator Use the vma iterator so that the iterator can be invalidated or updated to avoid each caller doing so. Link: https://lkml.kernel.org/r/20230120162650.984577-23-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- kernel/sched/fair.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c36aa54ae071a..9c9950249d7b2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2938,11 +2938,11 @@ static void task_numa_work(struct callback_head *work) struct task_struct *p = current; struct mm_struct *mm = p->mm; u64 runtime = p->se.sum_exec_runtime; - MA_STATE(mas, &mm->mm_mt, 0, 0); struct vm_area_struct *vma; unsigned long start, end; unsigned long nr_pte_updates = 0; long pages, virtpages; + struct vma_iterator vmi; SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work)); @@ -2995,16 +2995,16 @@ static void task_numa_work(struct callback_head *work) if (!mmap_read_trylock(mm)) return; - mas_set(&mas, start); - vma = mas_find(&mas, ULONG_MAX); + vma_iter_init(&vmi, mm, start); + vma = vma_next(&vmi); if (!vma) { reset_ptenuma_scan(p); start = 0; - mas_set(&mas, start); - vma = mas_find(&mas, ULONG_MAX); + vma_iter_set(&vmi, start); + vma = vma_next(&vmi); } - for (; vma; vma = mas_find(&mas, ULONG_MAX)) { + do { if (!vma_migratable(vma) || !vma_policy_mof(vma) || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) { continue; @@ -3051,7 +3051,7 @@ static void task_numa_work(struct callback_head *work) cond_resched(); } while (end != vma->vm_end); - } + } for_each_vma(vmi, vma); out: /* From dd860aa958397e996986a6187473403aff9e194b Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:24 -0500 Subject: [PATCH 049/143] madvise: use vmi iterator for __split_vma() and vma_merge() Use the vma iterator so that the iterator can be invalidated or updated to avoid each caller doing so. Link: https://lkml.kernel.org/r/20230120162650.984577-24-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/madvise.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 92a3c6bd84c1c..4d4471916465f 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -142,6 +142,7 @@ static int madvise_update_vma(struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; int error; pgoff_t pgoff; + VMA_ITERATOR(vmi, mm, 0); if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) { *prev = vma; @@ -149,8 +150,8 @@ static int madvise_update_vma(struct vm_area_struct *vma, } pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma, - vma->vm_file, pgoff, vma_policy(vma), + *prev = vmi_vma_merge(&vmi, mm, *prev, start, end, new_flags, + vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_name); if (*prev) { vma = *prev; @@ -162,7 +163,7 @@ static int madvise_update_vma(struct vm_area_struct *vma, if (start != vma->vm_start) { if (unlikely(mm->map_count >= sysctl_max_map_count)) return -ENOMEM; - error = __split_vma(mm, vma, start, 1); + error = vmi__split_vma(&vmi, mm, vma, start, 1); if (error) return error; } @@ -170,7 +171,7 @@ static int madvise_update_vma(struct vm_area_struct *vma, if (end != vma->vm_end) { if (unlikely(mm->map_count >= sysctl_max_map_count)) return -ENOMEM; - error = __split_vma(mm, vma, end, 0); + error = vmi__split_vma(&vmi, mm, vma, end, 0); if (error) return error; } From 8810f5a4e352a881f19e4eeae24fe0a0ae3905d5 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:25 -0500 Subject: [PATCH 050/143] mmap: pass through vmi iterator to __split_vma() Use the vma iterator so that the iterator can be invalidated or updated to avoid each caller doing so. Link: https://lkml.kernel.org/r/20230120162650.984577-25-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 408e9cc47333c..2b588b831ead6 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2388,7 +2388,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) goto map_count_exceeded; - error = __split_vma(mm, vma, start, 0); + error = vmi__split_vma(vmi, mm, vma, start, 0); if (error) goto start_split_failed; @@ -2409,7 +2409,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, if (next->vm_end > end) { struct vm_area_struct *split; - error = __split_vma(mm, next, end, 1); + error = vmi__split_vma(vmi, mm, next, end, 1); if (error) goto end_split_failed; From e4542928d9244bbb8160e1e0fd0feec3a7e0f48a Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:26 -0500 Subject: [PATCH 051/143] mmap: use vmi version of vma_merge() Use the vma iterator so that the iterator can be invalidated or updated to avoid each caller doing so. Link: https://lkml.kernel.org/r/20230120162650.984577-26-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 2b588b831ead6..8806bfbaa5055 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2690,8 +2690,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr, * vma again as we may succeed this time. */ if (unlikely(vm_flags != vma->vm_flags && prev)) { - merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags, - NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL); + merge = vmi_vma_merge(&vmi, mm, prev, vma->vm_start, + vma->vm_end, vma->vm_flags, NULL, vma->vm_file, + vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL); if (merge) { /* * ->mmap() can change vma->vm_file and fput @@ -3232,6 +3233,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *new_vma, *prev; bool faulted_in_anon_vma = true; + VMA_ITERATOR(vmi, mm, addr); validate_mm_mt(mm); /* @@ -3247,7 +3249,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, if (new_vma && new_vma->vm_start < addr + len) return NULL; /* should never get here */ - new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, + new_vma = vmi_vma_merge(&vmi, mm, prev, addr, addr + len, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (new_vma) { From 512776f84e99c226ab42a92e3cd219894fd7b3de Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:27 -0500 Subject: [PATCH 052/143] mm/mremap: use vmi version of vma_merge() Use the vma iterator so that the iterator can be invalidated or updated to avoid each caller doing so. Link: https://lkml.kernel.org/r/20230120162650.984577-27-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mremap.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/mm/mremap.c b/mm/mremap.c index 3cc64c3f8bdb3..f161516ab3c1e 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -1018,6 +1018,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, unsigned long extension_end = addr + new_len; pgoff_t extension_pgoff = vma->vm_pgoff + ((extension_start - vma->vm_start) >> PAGE_SHIFT); + VMA_ITERATOR(vmi, mm, extension_start); if (vma->vm_flags & VM_ACCOUNT) { if (security_vm_enough_memory_mm(mm, pages)) { @@ -1042,10 +1043,12 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, * when a vma would be actually removed due to a merge. */ if (!vma->vm_ops || !vma->vm_ops->close) { - vma = vma_merge(mm, vma, extension_start, extension_end, - vma->vm_flags, vma->anon_vma, vma->vm_file, - extension_pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, anon_vma_name(vma)); + vma = vmi_vma_merge(&vmi, mm, vma, + extension_start, extension_end, + vma->vm_flags, vma->anon_vma, + vma->vm_file, extension_pgoff, + vma_policy(vma), vma->vm_userfaultfd_ctx, + anon_vma_name(vma)); } else if (vma_adjust(vma, vma->vm_start, addr + new_len, vma->vm_pgoff, NULL)) { vma = NULL; From 2c03c1fa7b01fa9cb34d15d32f38689d1d470226 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:28 -0500 Subject: [PATCH 053/143] nommu: convert nommu to using the vma iterator Gain type safety in nommu by using the vma_iterator and not the maple tree directly. Link: https://lkml.kernel.org/r/20230120162650.984577-28-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/nommu.c | 79 +++++++++++++++++++++--------------------------------- 1 file changed, 31 insertions(+), 48 deletions(-) diff --git a/mm/nommu.c b/mm/nommu.c index 0481922fe66e4..7a52a7c370099 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -544,19 +544,6 @@ static void put_nommu_region(struct vm_region *region) __put_nommu_region(region); } -void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas) -{ - mas_set_range(mas, vma->vm_start, vma->vm_end - 1); - mas_store_prealloc(mas, vma); -} - -void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas) -{ - mas->index = vma->vm_start; - mas->last = vma->vm_end - 1; - mas_store_prealloc(mas, NULL); -} - static void setup_vma_to_mm(struct vm_area_struct *vma, struct mm_struct *mm) { vma->vm_mm = mm; @@ -574,13 +561,13 @@ static void setup_vma_to_mm(struct vm_area_struct *vma, struct mm_struct *mm) } /* - * mas_add_vma_to_mm() - Maple state variant of add_mas_to_mm(). - * @mas: The maple state with preallocations. + * vmi_add_vma_to_mm() - VMA Iterator variant of add_vmi_to_mm(). + * @vmi: The VMA iterator * @mm: The mm_struct * @vma: The vma to add * */ -static void mas_add_vma_to_mm(struct ma_state *mas, struct mm_struct *mm, +static void vmi_add_vma_to_mm(struct vma_iterator *vmi, struct mm_struct *mm, struct vm_area_struct *vma) { BUG_ON(!vma->vm_region); @@ -589,7 +576,7 @@ static void mas_add_vma_to_mm(struct ma_state *mas, struct mm_struct *mm, mm->map_count++; /* add the VMA to the tree */ - vma_mas_store(vma, mas); + vma_iter_store(vmi, vma); } /* @@ -600,14 +587,14 @@ static void mas_add_vma_to_mm(struct ma_state *mas, struct mm_struct *mm, */ static int add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) { - MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end); + VMA_ITERATOR(vmi, mm, vma->vm_start); - if (mas_preallocate(&mas, GFP_KERNEL)) { + if (vma_iter_prealloc(&vmi)) { pr_warn("Allocation of vma tree for process %d failed\n", current->pid); return -ENOMEM; } - mas_add_vma_to_mm(&mas, mm, vma); + vmi_add_vma_to_mm(&vmi, mm, vma); return 0; } @@ -626,14 +613,15 @@ static void cleanup_vma_from_mm(struct vm_area_struct *vma) i_mmap_unlock_write(mapping); } } + /* * delete a VMA from its owning mm_struct and address space */ static int delete_vma_from_mm(struct vm_area_struct *vma) { - MA_STATE(mas, &vma->vm_mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_start); - if (mas_preallocate(&mas, GFP_KERNEL)) { + if (vma_iter_prealloc(&vmi)) { pr_warn("Allocation of vma tree for process %d failed\n", current->pid); return -ENOMEM; @@ -641,10 +629,9 @@ static int delete_vma_from_mm(struct vm_area_struct *vma) cleanup_vma_from_mm(vma); /* remove from the MM's tree and list */ - vma_mas_remove(vma, &mas); + vma_iter_clear(&vmi, vma->vm_start, vma->vm_end); return 0; } - /* * destroy a VMA record */ @@ -675,9 +662,9 @@ EXPORT_SYMBOL(find_vma_intersection); */ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) { - MA_STATE(mas, &mm->mm_mt, addr, addr); + VMA_ITERATOR(vmi, mm, addr); - return mas_walk(&mas); + return vma_iter_load(&vmi); } EXPORT_SYMBOL(find_vma); @@ -709,9 +696,9 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, { struct vm_area_struct *vma; unsigned long end = addr + len; - MA_STATE(mas, &mm->mm_mt, addr, addr); + VMA_ITERATOR(vmi, mm, addr); - vma = mas_walk(&mas); + vma = vma_iter_load(&vmi); if (!vma) return NULL; if (vma->vm_start != addr) @@ -1062,7 +1049,7 @@ unsigned long do_mmap(struct file *file, vm_flags_t vm_flags; unsigned long capabilities, result; int ret; - MA_STATE(mas, ¤t->mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, current->mm, 0); *populate = 0; @@ -1091,8 +1078,8 @@ unsigned long do_mmap(struct file *file, if (!vma) goto error_getting_vma; - if (mas_preallocate(&mas, GFP_KERNEL)) - goto error_maple_preallocate; + if (vma_iter_prealloc(&vmi)) + goto error_vma_iter_prealloc; region->vm_usage = 1; region->vm_flags = vm_flags; @@ -1234,7 +1221,7 @@ unsigned long do_mmap(struct file *file, current->mm->total_vm += len >> PAGE_SHIFT; share: - mas_add_vma_to_mm(&mas, current->mm, vma); + vmi_add_vma_to_mm(&vmi, current->mm, vma); /* we flush the region from the icache only when the first executable * mapping of it is made */ @@ -1250,7 +1237,7 @@ unsigned long do_mmap(struct file *file, error_just_free: up_write(&nommu_region_sem); error: - mas_destroy(&mas); + vma_iter_free(&vmi); if (region->vm_file) fput(region->vm_file); kmem_cache_free(vm_region_jar, region); @@ -1278,7 +1265,7 @@ unsigned long do_mmap(struct file *file, show_free_areas(0, NULL); return -ENOMEM; -error_maple_preallocate: +error_vma_iter_prealloc: kmem_cache_free(vm_region_jar, region); vm_area_free(vma); pr_warn("Allocation of vma tree for process %d failed\n", current->pid); @@ -1344,20 +1331,18 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) * split a vma into two pieces at address 'addr', a new vma is allocated either * for the first part or the tail. */ -int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, int new_below) +int vmi_split_vma(struct vma_iterator *vmi, struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long addr, int new_below) { struct vm_area_struct *new; struct vm_region *region; unsigned long npages; - MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end); /* we're only permitted to split anonymous regions (these should have * only a single usage on the region) */ if (vma->vm_file) return -ENOMEM; - mm = vma->vm_mm; if (mm->map_count >= sysctl_max_map_count) return -ENOMEM; @@ -1369,10 +1354,10 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, if (!new) goto err_vma_dup; - if (mas_preallocate(&mas, GFP_KERNEL)) { + if (vma_iter_prealloc(vmi)) { pr_warn("Allocation of vma tree for process %d failed\n", current->pid); - goto err_mas_preallocate; + goto err_vmi_preallocate; } /* most fields are the same, copy all, and then fixup */ @@ -1406,13 +1391,11 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, setup_vma_to_mm(vma, mm); setup_vma_to_mm(new, mm); - mas_set_range(&mas, vma->vm_start, vma->vm_end - 1); - mas_store(&mas, vma); - vma_mas_store(new, &mas); + vma_iter_store(vmi, new); mm->map_count++; return 0; -err_mas_preallocate: +err_vmi_preallocate: vm_area_free(new); err_vma_dup: kmem_cache_free(vm_region_jar, region); @@ -1466,7 +1449,7 @@ static int shrink_vma(struct mm_struct *mm, */ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf) { - MA_STATE(mas, &mm->mm_mt, start, start); + VMA_ITERATOR(vmi, mm, start); struct vm_area_struct *vma; unsigned long end; int ret = 0; @@ -1478,7 +1461,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list end = start + len; /* find the first potentially overlapping VMA */ - vma = mas_find(&mas, end - 1); + vma = vma_find(&vmi, end); if (!vma) { static int limit; if (limit < 5) { @@ -1497,7 +1480,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list return -EINVAL; if (end == vma->vm_end) goto erase_whole_vma; - vma = mas_next(&mas, end - 1); + vma = vma_find(&vmi, end); } while (vma); return -EINVAL; } else { @@ -1511,7 +1494,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list if (end != vma->vm_end && offset_in_page(end)) return -EINVAL; if (start != vma->vm_start && end != vma->vm_end) { - ret = split_vma(mm, vma, start, 1); + ret = vmi_split_vma(&vmi, mm, vma, start, 1); if (ret < 0) return ret; } From 66615a08374d548b61fc2da1249bdce28ae69c6e Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:29 -0500 Subject: [PATCH 054/143] nommu: pass through vma iterator to shrink_vma() Rename the function to vmi_shrink_vma() indicate it takes the vma iterator. Use the iterator to preallocate and drop the delete function. The maple tree is able to do the modification easier than the linked list and rbtree, so just clear the necessary area in the tree. add_vma_to_mm() is no longer used, so drop this function. vmi_add_vma_to_mm() is now only used once, so inline this function into do_mmap(). Link: https://lkml.kernel.org/r/20230120162650.984577-29-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/nommu.c | 63 +++++++++++++++--------------------------------------- 1 file changed, 17 insertions(+), 46 deletions(-) diff --git a/mm/nommu.c b/mm/nommu.c index 7a52a7c370099..9ddeb92600d6d 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -560,44 +560,6 @@ static void setup_vma_to_mm(struct vm_area_struct *vma, struct mm_struct *mm) } } -/* - * vmi_add_vma_to_mm() - VMA Iterator variant of add_vmi_to_mm(). - * @vmi: The VMA iterator - * @mm: The mm_struct - * @vma: The vma to add - * - */ -static void vmi_add_vma_to_mm(struct vma_iterator *vmi, struct mm_struct *mm, - struct vm_area_struct *vma) -{ - BUG_ON(!vma->vm_region); - - setup_vma_to_mm(vma, mm); - mm->map_count++; - - /* add the VMA to the tree */ - vma_iter_store(vmi, vma); -} - -/* - * add a VMA into a process's mm_struct in the appropriate place in the list - * and tree and add to the address space's page tree also if not an anonymous - * page - * - should be called with mm->mmap_lock held writelocked - */ -static int add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) -{ - VMA_ITERATOR(vmi, mm, vma->vm_start); - - if (vma_iter_prealloc(&vmi)) { - pr_warn("Allocation of vma tree for process %d failed\n", - current->pid); - return -ENOMEM; - } - vmi_add_vma_to_mm(&vmi, mm, vma); - return 0; -} - static void cleanup_vma_from_mm(struct vm_area_struct *vma) { vma->vm_mm->map_count--; @@ -1221,7 +1183,11 @@ unsigned long do_mmap(struct file *file, current->mm->total_vm += len >> PAGE_SHIFT; share: - vmi_add_vma_to_mm(&vmi, current->mm, vma); + BUG_ON(!vma->vm_region); + setup_vma_to_mm(vma, current->mm); + current->mm->map_count++; + /* add the VMA to the tree */ + vma_iter_store(&vmi, vma); /* we flush the region from the icache only when the first executable * mapping of it is made */ @@ -1406,7 +1372,7 @@ int vmi_split_vma(struct vma_iterator *vmi, struct mm_struct *mm, * shrink a VMA by removing the specified chunk from either the beginning or * the end */ -static int shrink_vma(struct mm_struct *mm, +static int vmi_shrink_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long from, unsigned long to) { @@ -1414,14 +1380,19 @@ static int shrink_vma(struct mm_struct *mm, /* adjust the VMA's pointers, which may reposition it in the MM's tree * and list */ - if (delete_vma_from_mm(vma)) + if (vma_iter_prealloc(vmi)) { + pr_warn("Allocation of vma tree for process %d failed\n", + current->pid); return -ENOMEM; - if (from > vma->vm_start) + } + + if (from > vma->vm_start) { + vma_iter_clear(vmi, from, vma->vm_end); vma->vm_end = from; - else + } else { + vma_iter_clear(vmi, vma->vm_start, to); vma->vm_start = to; - if (add_vma_to_mm(mm, vma)) - return -ENOMEM; + } /* cut the backing region down to size */ region = vma->vm_region; @@ -1498,7 +1469,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list if (ret < 0) return ret; } - return shrink_vma(mm, vma, start, end); + return vmi_shrink_vma(&vmi, vma, start, end); } erase_whole_vma: From 593c82c7ae1cf6db8d39c4940872b1e30e220d3e Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:30 -0500 Subject: [PATCH 055/143] mm: switch vma_merge(), split_vma(), and __split_vma to vma iterator Drop the vmi_* functions and transition all users to use the vma iterator directly. Link: https://lkml.kernel.org/r/20230120162650.984577-30-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 14 ++++---- include/linux/mm.h | 18 +++-------- mm/madvise.c | 6 ++-- mm/mempolicy.c | 6 ++-- mm/mlock.c | 6 ++-- mm/mmap.c | 79 +++++++++++++--------------------------------- mm/mprotect.c | 6 ++-- mm/mremap.c | 10 +++--- mm/nommu.c | 8 +++-- 9 files changed, 55 insertions(+), 98 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 4334bd35984d6..f3c75c6222de6 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -909,7 +909,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) continue; } new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; - prev = vmi_vma_merge(&vmi, mm, prev, vma->vm_start, vma->vm_end, + prev = vma_merge(&vmi, mm, prev, vma->vm_start, vma->vm_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), @@ -1452,7 +1452,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, vma_end = min(end, vma->vm_end); new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags; - prev = vmi_vma_merge(&vmi, mm, prev, start, vma_end, new_flags, + prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), ((struct vm_userfaultfd_ctx){ ctx }), @@ -1463,12 +1463,12 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, goto next; } if (vma->vm_start < start) { - ret = vmi_split_vma(&vmi, mm, vma, start, 1); + ret = split_vma(&vmi, vma, start, 1); if (ret) break; } if (vma->vm_end > end) { - ret = vmi_split_vma(&vmi, mm, vma, end, 0); + ret = split_vma(&vmi, vma, end, 0); if (ret) break; } @@ -1632,7 +1632,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, uffd_wp_range(mm, vma, start, vma_end - start, false); new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; - prev = vmi_vma_merge(&vmi, mm, prev, start, vma_end, new_flags, + prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), NULL_VM_UFFD_CTX, anon_vma_name(vma)); @@ -1641,12 +1641,12 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, goto next; } if (vma->vm_start < start) { - ret = vmi_split_vma(&vmi, mm, vma, start, 1); + ret = split_vma(&vmi, vma, start, 1); if (ret) break; } if (vma->vm_end > end) { - ret = vmi_split_vma(&vmi, mm, vma, end, 0); + ret = split_vma(&vmi, vma, end, 0); if (ret) break; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 843e8ada86efb..4cf18a1b2450f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2845,24 +2845,16 @@ static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start, { return __vma_adjust(vma, start, end, pgoff, insert, NULL); } -extern struct vm_area_struct *vma_merge(struct mm_struct *, - struct vm_area_struct *prev, unsigned long addr, unsigned long end, - unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, - struct mempolicy *, struct vm_userfaultfd_ctx, struct anon_vma_name *); -extern struct vm_area_struct *vmi_vma_merge(struct vma_iterator *vmi, +extern struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, struct mempolicy *, struct vm_userfaultfd_ctx, struct anon_vma_name *); extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); -extern int __split_vma(struct mm_struct *, struct vm_area_struct *, - unsigned long addr, int new_below); -extern int vmi__split_vma(struct vma_iterator *vmi, struct mm_struct *, - struct vm_area_struct *, unsigned long addr, int new_below); -extern int split_vma(struct mm_struct *, struct vm_area_struct *, - unsigned long addr, int new_below); -extern int vmi_split_vma(struct vma_iterator *vmi, struct mm_struct *, - struct vm_area_struct *, unsigned long addr, int new_below); +extern int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *, + unsigned long addr, int new_below); +extern int split_vma(struct vma_iterator *vmi, struct vm_area_struct *, + unsigned long addr, int new_below); extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); extern void unlink_file_vma(struct vm_area_struct *); extern struct vm_area_struct *copy_vma(struct vm_area_struct **, diff --git a/mm/madvise.c b/mm/madvise.c index 4d4471916465f..02b317726c9aa 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -150,7 +150,7 @@ static int madvise_update_vma(struct vm_area_struct *vma, } pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - *prev = vmi_vma_merge(&vmi, mm, *prev, start, end, new_flags, + *prev = vma_merge(&vmi, mm, *prev, start, end, new_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_name); if (*prev) { @@ -163,7 +163,7 @@ static int madvise_update_vma(struct vm_area_struct *vma, if (start != vma->vm_start) { if (unlikely(mm->map_count >= sysctl_max_map_count)) return -ENOMEM; - error = vmi__split_vma(&vmi, mm, vma, start, 1); + error = __split_vma(&vmi, vma, start, 1); if (error) return error; } @@ -171,7 +171,7 @@ static int madvise_update_vma(struct vm_area_struct *vma, if (end != vma->vm_end) { if (unlikely(mm->map_count >= sysctl_max_map_count)) return -ENOMEM; - error = vmi__split_vma(&vmi, mm, vma, end, 0); + error = __split_vma(&vmi, vma, end, 0); if (error) return error; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4f627b1bf0a1b..7686f40c97500 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -810,7 +810,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT); - prev = vmi_vma_merge(&vmi, mm, prev, vmstart, vmend, vma->vm_flags, + prev = vma_merge(&vmi, mm, prev, vmstart, vmend, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, new_pol, vma->vm_userfaultfd_ctx, anon_vma_name(vma)); @@ -819,12 +819,12 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, goto replace; } if (vma->vm_start != vmstart) { - err = vmi_split_vma(&vmi, vma->vm_mm, vma, vmstart, 1); + err = split_vma(&vmi, vma, vmstart, 1); if (err) goto out; } if (vma->vm_end != vmend) { - err = vmi_split_vma(&vmi, vma->vm_mm, vma, vmend, 0); + err = split_vma(&vmi, vma, vmend, 0); if (err) goto out; } diff --git a/mm/mlock.c b/mm/mlock.c index 0d09b9070071c..0336f52e03d75 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -418,7 +418,7 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, goto out; pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - *prev = vmi_vma_merge(vmi, mm, *prev, start, end, newflags, + *prev = vma_merge(vmi, mm, *prev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (*prev) { @@ -427,13 +427,13 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, } if (start != vma->vm_start) { - ret = vmi_split_vma(vmi, mm, vma, start, 1); + ret = split_vma(vmi, vma, start, 1); if (ret) goto out; } if (end != vma->vm_end) { - ret = vmi_split_vma(vmi, mm, vma, end, 0); + ret = split_vma(vmi, vma, end, 0); if (ret) goto out; } diff --git a/mm/mmap.c b/mm/mmap.c index 8806bfbaa5055..afc65f122f7da 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1010,7 +1010,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, * parameter) may establish ptes with the wrong permissions of NNNN * instead of the right permissions of XXXX. */ -struct vm_area_struct *vma_merge(struct mm_struct *mm, +struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, @@ -1019,7 +1019,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, struct anon_vma_name *anon_name) { pgoff_t pglen = (end - addr) >> PAGE_SHIFT; - struct vm_area_struct *mid, *next, *res; + struct vm_area_struct *mid, *next, *res = NULL; int err = -1; bool merge_prev = false; bool merge_next = false; @@ -1085,26 +1085,11 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, if (err) return NULL; khugepaged_enter_vma(res, vm_flags); - return res; -} -struct vm_area_struct *vmi_vma_merge(struct vma_iterator *vmi, - struct mm_struct *mm, - struct vm_area_struct *prev, unsigned long addr, - unsigned long end, unsigned long vm_flags, - struct anon_vma *anon_vma, struct file *file, - pgoff_t pgoff, struct mempolicy *policy, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - struct anon_vma_name *anon_name) -{ - struct vm_area_struct *tmp; - - tmp = vma_merge(mm, prev, addr, end, vm_flags, anon_vma, file, pgoff, - policy, vm_userfaultfd_ctx, anon_name); - if (tmp) + if (res) vma_iter_set(vmi, end); - return tmp; + return res; } /* @@ -2228,12 +2213,14 @@ static void unmap_region(struct mm_struct *mm, struct maple_tree *mt, * __split_vma() bypasses sysctl_max_map_count checking. We use this where it * has already been checked or doesn't make sense to fail. */ -int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, +int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, int new_below) { struct vm_area_struct *new; int err; - validate_mm_mt(mm); + unsigned long end = vma->vm_end; + + validate_mm_mt(vma->vm_mm); if (vma->vm_ops && vma->vm_ops->may_split) { err = vma->vm_ops->may_split(vma, addr); @@ -2273,8 +2260,10 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); /* Success. */ - if (!err) + if (!err) { + vma_iter_set(vmi, end); return 0; + } /* Avoid vm accounting in close() operation */ new->vm_start = new->vm_end; @@ -2289,46 +2278,21 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, mpol_put(vma_policy(new)); out_free_vma: vm_area_free(new); - validate_mm_mt(mm); + validate_mm_mt(vma->vm_mm); return err; } -int vmi__split_vma(struct vma_iterator *vmi, struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long addr, int new_below) -{ - int ret; - unsigned long end = vma->vm_end; - - ret = __split_vma(mm, vma, addr, new_below); - if (!ret) - vma_iter_set(vmi, end); - - return ret; -} /* * Split a vma into two pieces at address 'addr', a new vma is allocated * either for the first part or the tail. */ -int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, +int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, int new_below) { - if (mm->map_count >= sysctl_max_map_count) + if (vma->vm_mm->map_count >= sysctl_max_map_count) return -ENOMEM; - return __split_vma(mm, vma, addr, new_below); -} - -int vmi_split_vma(struct vma_iterator *vmi, struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long addr, int new_below) -{ - int ret; - unsigned long end = vma->vm_end; - - ret = split_vma(mm, vma, addr, new_below); - if (!ret) - vma_iter_set(vmi, end); - - return ret; + return __split_vma(vmi, vma, addr, new_below); } static inline int munmap_sidetree(struct vm_area_struct *vma, @@ -2388,7 +2352,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) goto map_count_exceeded; - error = vmi__split_vma(vmi, mm, vma, start, 0); + error = __split_vma(vmi, vma, start, 0); if (error) goto start_split_failed; @@ -2409,7 +2373,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, if (next->vm_end > end) { struct vm_area_struct *split; - error = vmi__split_vma(vmi, mm, next, end, 1); + error = __split_vma(vmi, next, end, 1); if (error) goto end_split_failed; @@ -2690,9 +2654,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr, * vma again as we may succeed this time. */ if (unlikely(vm_flags != vma->vm_flags && prev)) { - merge = vmi_vma_merge(&vmi, mm, prev, vma->vm_start, - vma->vm_end, vma->vm_flags, NULL, vma->vm_file, - vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL); + merge = vma_merge(&vmi, mm, prev, vma->vm_start, + vma->vm_end, vma->vm_flags, NULL, + vma->vm_file, vma->vm_pgoff, NULL, + NULL_VM_UFFD_CTX, NULL); if (merge) { /* * ->mmap() can change vma->vm_file and fput @@ -3249,7 +3214,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, if (new_vma && new_vma->vm_start < addr + len) return NULL; /* should never get here */ - new_vma = vmi_vma_merge(&vmi, mm, prev, addr, addr + len, vma->vm_flags, + new_vma = vma_merge(&vmi, mm, prev, addr, addr + len, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (new_vma) { diff --git a/mm/mprotect.c b/mm/mprotect.c index 39b6335b8813b..cce6a0e58fb50 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -642,7 +642,7 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, * First try to merge with previous and/or next vma. */ pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - *pprev = vmi_vma_merge(vmi, mm, *pprev, start, end, newflags, + *pprev = vma_merge(vmi, mm, *pprev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (*pprev) { @@ -654,13 +654,13 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, *pprev = vma; if (start != vma->vm_start) { - error = vmi_split_vma(vmi, mm, vma, start, 1); + error = split_vma(vmi, vma, start, 1); if (error) goto fail; } if (end != vma->vm_end) { - error = vmi_split_vma(vmi, mm, vma, end, 0); + error = split_vma(vmi, vma, end, 0); if (error) goto fail; } diff --git a/mm/mremap.c b/mm/mremap.c index f161516ab3c1e..71ba8eddd836b 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -1043,12 +1043,10 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, * when a vma would be actually removed due to a merge. */ if (!vma->vm_ops || !vma->vm_ops->close) { - vma = vmi_vma_merge(&vmi, mm, vma, - extension_start, extension_end, - vma->vm_flags, vma->anon_vma, - vma->vm_file, extension_pgoff, - vma_policy(vma), vma->vm_userfaultfd_ctx, - anon_vma_name(vma)); + vma = vma_merge(&vmi, mm, vma, extension_start, + extension_end, vma->vm_flags, vma->anon_vma, + vma->vm_file, extension_pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); } else if (vma_adjust(vma, vma->vm_start, addr + new_len, vma->vm_pgoff, NULL)) { vma = NULL; diff --git a/mm/nommu.c b/mm/nommu.c index 9ddeb92600d6d..9a166738909ea 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1297,18 +1297,20 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) * split a vma into two pieces at address 'addr', a new vma is allocated either * for the first part or the tail. */ -int vmi_split_vma(struct vma_iterator *vmi, struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long addr, int new_below) +int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long addr, int new_below) { struct vm_area_struct *new; struct vm_region *region; unsigned long npages; + struct mm_struct *mm; /* we're only permitted to split anonymous regions (these should have * only a single usage on the region) */ if (vma->vm_file) return -ENOMEM; + mm = vma->vm_mm; if (mm->map_count >= sysctl_max_map_count) return -ENOMEM; @@ -1465,7 +1467,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list if (end != vma->vm_end && offset_in_page(end)) return -EINVAL; if (start != vma->vm_start && end != vma->vm_end) { - ret = vmi_split_vma(&vmi, mm, vma, start, 1); + ret = split_vma(&vmi, vma, start, 1); if (ret < 0) return ret; } From d79ac3ffed1a348955dc8edb402e3e8f9a6f7037 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:31 -0500 Subject: [PATCH 056/143] mm/damon/vaddr-test.h: stop using vma_mas_store() for maple tree store Prepare for the removal of the vma_mas_store() function by open coding the maple tree store in this test code. Set the range of the maple state and call the store function directly. Link: https://lkml.kernel.org/r/20230120162650.984577-31-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Reported-by: kernel test robot Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/vaddr-test.h | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h index bce37c4875402..c4b455b5ee30b 100644 --- a/mm/damon/vaddr-test.h +++ b/mm/damon/vaddr-test.h @@ -14,19 +14,26 @@ #include -static void __link_vmas(struct maple_tree *mt, struct vm_area_struct *vmas, +static int __link_vmas(struct maple_tree *mt, struct vm_area_struct *vmas, ssize_t nr_vmas) { - int i; + int i, ret = -ENOMEM; MA_STATE(mas, mt, 0, 0); if (!nr_vmas) - return; + return 0; mas_lock(&mas); - for (i = 0; i < nr_vmas; i++) - vma_mas_store(&vmas[i], &mas); + for (i = 0; i < nr_vmas; i++) { + mas_set_range(&mas, vmas[i].vm_start, vmas[i].vm_end - 1); + if (mas_store_gfp(&mas, &vmas[i], GFP_KERNEL)) + goto failed; + } + + ret = 0; +failed: mas_unlock(&mas); + return ret; } /* @@ -71,7 +78,8 @@ static void damon_test_three_regions_in_vmas(struct kunit *test) }; mt_init_flags(&mm.mm_mt, MM_MT_FLAGS); - __link_vmas(&mm.mm_mt, vmas, ARRAY_SIZE(vmas)); + if (__link_vmas(&mm.mm_mt, vmas, ARRAY_SIZE(vmas))) + kunit_skip(test, "Failed to create VMA tree"); __damon_va_three_regions(&mm, regions); From 5afa5bf372649fc92c71793d5f1923b6c59e5bb4 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:32 -0500 Subject: [PATCH 057/143] mmap: convert __vma_adjust() to use vma iterator Use the vma iterator internally for __vma_adjust(). Avoid using the maple tree interface directly for type safety. Link: https://lkml.kernel.org/r/20230120162650.984577-32-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- include/linux/mm.h | 3 -- mm/mmap.c | 75 ++++++++-------------------------------------- 2 files changed, 13 insertions(+), 65 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 4cf18a1b2450f..605d2c40e921c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2862,9 +2862,6 @@ extern struct vm_area_struct *copy_vma(struct vm_area_struct **, bool *need_rmap_locks); extern void exit_mmap(struct mm_struct *); -void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas); -void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas); - static inline int check_data_rlimit(unsigned long rlim, unsigned long new, unsigned long start, diff --git a/mm/mmap.c b/mm/mmap.c index afc65f122f7da..07ba54c34bd04 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -432,56 +432,6 @@ static void __vma_link_file(struct vm_area_struct *vma, flush_dcache_mmap_unlock(mapping); } -/* - * vma_mas_store() - Store a VMA in the maple tree. - * @vma: The vm_area_struct - * @mas: The maple state - * - * Efficient way to store a VMA in the maple tree when the @mas has already - * walked to the correct location. - * - * Note: the end address is inclusive in the maple tree. - */ -void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas) -{ - trace_vma_store(mas->tree, vma); - mas_set_range(mas, vma->vm_start, vma->vm_end - 1); - mas_store_prealloc(mas, vma); -} - -/* - * vma_mas_remove() - Remove a VMA from the maple tree. - * @vma: The vm_area_struct - * @mas: The maple state - * - * Efficient way to remove a VMA from the maple tree when the @mas has already - * been established and points to the correct location. - * Note: the end address is inclusive in the maple tree. - */ -void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas) -{ - trace_vma_mas_szero(mas->tree, vma->vm_start, vma->vm_end - 1); - mas->index = vma->vm_start; - mas->last = vma->vm_end - 1; - mas_store_prealloc(mas, NULL); -} - -/* - * vma_mas_szero() - Set a given range to zero. Used when modifying a - * vm_area_struct start or end. - * - * @mas: The maple tree ma_state - * @start: The start address to zero - * @end: The end address to zero. - */ -static inline void vma_mas_szero(struct ma_state *mas, unsigned long start, - unsigned long end) -{ - trace_vma_mas_szero(mas->tree, start, end - 1); - mas_set_range(mas, start, end - 1); - mas_store_prealloc(mas, NULL); -} - static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) { VMA_ITERATOR(vmi, mm, 0); @@ -641,7 +591,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, bool vma_changed = false; long adjust_next = 0; int remove_next = 0; - MA_STATE(mas, &mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *exporter = NULL, *importer = NULL; if (next && !insert) { @@ -726,7 +676,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, } } - if (mas_preallocate(&mas, GFP_KERNEL)) + if (vma_iter_prealloc(&vmi)) return -ENOMEM; vma_adjust_trans_huge(orig_vma, start, end, adjust_next); @@ -772,7 +722,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, if (start != vma->vm_start) { if ((vma->vm_start < start) && (!insert || (insert->vm_end != start))) { - vma_mas_szero(&mas, vma->vm_start, start); + vma_iter_clear(&vmi, vma->vm_start, start); VM_WARN_ON(insert && insert->vm_start > vma->vm_start); } else { vma_changed = true; @@ -782,8 +732,8 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, if (end != vma->vm_end) { if (vma->vm_end > end) { if (!insert || (insert->vm_start != end)) { - vma_mas_szero(&mas, end, vma->vm_end); - mas_reset(&mas); + vma_iter_clear(&vmi, end, vma->vm_end); + vma_iter_set(&vmi, vma->vm_end); VM_WARN_ON(insert && insert->vm_end < vma->vm_end); } @@ -794,13 +744,13 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, } if (vma_changed) - vma_mas_store(vma, &mas); + vma_iter_store(&vmi, vma); vma->vm_pgoff = pgoff; if (adjust_next) { next->vm_start += adjust_next; next->vm_pgoff += adjust_next >> PAGE_SHIFT; - vma_mas_store(next, &mas); + vma_iter_store(&vmi, next); } if (file) { @@ -820,8 +770,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, * us to insert it before dropping the locks * (it may either follow vma or precede it). */ - mas_reset(&mas); - vma_mas_store(insert, &mas); + vma_iter_store(&vmi, insert); mm->map_count++; } @@ -867,7 +816,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, if (insert && file) uprobe_mmap(insert); - mas_destroy(&mas); + vma_iter_free(&vmi); validate_mm(mm); return 0; @@ -1999,7 +1948,8 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) anon_vma_interval_tree_pre_update_vma(vma); vma->vm_end = address; /* Overwrite old entry in mtree. */ - vma_mas_store(vma, &mas); + mas_set_range(&mas, vma->vm_start, address - 1); + mas_store_prealloc(&mas, vma); anon_vma_interval_tree_post_update_vma(vma); spin_unlock(&mm->page_table_lock); @@ -2081,7 +2031,8 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) vma->vm_start = address; vma->vm_pgoff -= grow; /* Overwrite old entry in mtree. */ - vma_mas_store(vma, &mas); + mas_set_range(&mas, address, vma->vm_end - 1); + mas_store_prealloc(&mas, vma); anon_vma_interval_tree_post_update_vma(vma); spin_unlock(&mm->page_table_lock); From e4a524ae39d2d0f1ed4031f023d660f0e02c0af5 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:33 -0500 Subject: [PATCH 058/143] mm: pass through vma iterator to __vma_adjust() Pass the vma iterator through to __vma_adjust() so the state can be updated. Link: https://lkml.kernel.org/r/20230120162650.984577-33-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 ++++-- mm/mmap.c | 31 +++++++++++++++---------------- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 605d2c40e921c..6681dd54e9d3d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2837,13 +2837,15 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node); /* mmap.c */ extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); -extern int __vma_adjust(struct vm_area_struct *vma, unsigned long start, +extern int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert, struct vm_area_struct *expand); static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) { - return __vma_adjust(vma, start, end, pgoff, insert, NULL); + VMA_ITERATOR(vmi, vma->vm_mm, start); + + return __vma_adjust(&vmi, vma, start, end, pgoff, insert, NULL); } extern struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, diff --git a/mm/mmap.c b/mm/mmap.c index 07ba54c34bd04..330de1ab6a8d8 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -576,9 +576,9 @@ inline int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, * are necessary. The "insert" vma (if any) is to be inserted * before we drop the necessary locks. */ -int __vma_adjust(struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert, - struct vm_area_struct *expand) +int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long start, unsigned long end, pgoff_t pgoff, + struct vm_area_struct *insert, struct vm_area_struct *expand) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *next_next = NULL; /* uninit var warning */ @@ -591,7 +591,6 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, bool vma_changed = false; long adjust_next = 0; int remove_next = 0; - VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *exporter = NULL, *importer = NULL; if (next && !insert) { @@ -676,7 +675,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, } } - if (vma_iter_prealloc(&vmi)) + if (vma_iter_prealloc(vmi)) return -ENOMEM; vma_adjust_trans_huge(orig_vma, start, end, adjust_next); @@ -722,7 +721,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, if (start != vma->vm_start) { if ((vma->vm_start < start) && (!insert || (insert->vm_end != start))) { - vma_iter_clear(&vmi, vma->vm_start, start); + vma_iter_clear(vmi, vma->vm_start, start); VM_WARN_ON(insert && insert->vm_start > vma->vm_start); } else { vma_changed = true; @@ -732,8 +731,8 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, if (end != vma->vm_end) { if (vma->vm_end > end) { if (!insert || (insert->vm_start != end)) { - vma_iter_clear(&vmi, end, vma->vm_end); - vma_iter_set(&vmi, vma->vm_end); + vma_iter_clear(vmi, end, vma->vm_end); + vma_iter_set(vmi, vma->vm_end); VM_WARN_ON(insert && insert->vm_end < vma->vm_end); } @@ -744,13 +743,13 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, } if (vma_changed) - vma_iter_store(&vmi, vma); + vma_iter_store(vmi, vma); vma->vm_pgoff = pgoff; if (adjust_next) { next->vm_start += adjust_next; next->vm_pgoff += adjust_next >> PAGE_SHIFT; - vma_iter_store(&vmi, next); + vma_iter_store(vmi, next); } if (file) { @@ -770,7 +769,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, * us to insert it before dropping the locks * (it may either follow vma or precede it). */ - vma_iter_store(&vmi, insert); + vma_iter_store(vmi, insert); mm->map_count++; } @@ -816,7 +815,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, if (insert && file) uprobe_mmap(insert); - vma_iter_free(&vmi); + vma_iter_free(vmi); validate_mm(mm); return 0; @@ -1010,20 +1009,20 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, if (merge_prev && merge_next && is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { /* cases 1, 6 */ - err = __vma_adjust(prev, prev->vm_start, + err = __vma_adjust(vmi, prev, prev->vm_start, next->vm_end, prev->vm_pgoff, NULL, prev); res = prev; } else if (merge_prev) { /* cases 2, 5, 7 */ - err = __vma_adjust(prev, prev->vm_start, + err = __vma_adjust(vmi, prev, prev->vm_start, end, prev->vm_pgoff, NULL, prev); res = prev; } else if (merge_next) { if (prev && addr < prev->vm_end) /* case 4 */ - err = __vma_adjust(prev, prev->vm_start, + err = __vma_adjust(vmi, prev, prev->vm_start, addr, prev->vm_pgoff, NULL, next); else /* cases 3, 8 */ - err = __vma_adjust(mid, addr, next->vm_end, + err = __vma_adjust(vmi, mid, addr, next->vm_end, next->vm_pgoff - pglen, NULL, next); res = next; } From ac452cb7adab801c3fb48d725d2ee6d68e4195f2 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:34 -0500 Subject: [PATCH 059/143] madvise: use split_vma() instead of __split_vma() The split_vma() wrapper is specifically for this use case, so use it. Link: https://lkml.kernel.org/r/20230120162650.984577-34-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/madvise.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 02b317726c9aa..7db6622f8293e 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -161,17 +161,13 @@ static int madvise_update_vma(struct vm_area_struct *vma, *prev = vma; if (start != vma->vm_start) { - if (unlikely(mm->map_count >= sysctl_max_map_count)) - return -ENOMEM; - error = __split_vma(&vmi, vma, start, 1); + error = split_vma(&vmi, vma, start, 1); if (error) return error; } if (end != vma->vm_end) { - if (unlikely(mm->map_count >= sysctl_max_map_count)) - return -ENOMEM; - error = __split_vma(&vmi, vma, end, 0); + error = split_vma(&vmi, vma, end, 0); if (error) return error; } From 0eb1aa7f834a1fa3d042e508e29ef446727e9b74 Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Wed, 25 Jan 2023 08:58:09 -0500 Subject: [PATCH 060/143] mm/madvise: fix VMA_ITERATOR start position The WARN_ON() in vma_iter_store() detected an invalid VMA iterator state. Inspecting the code stack from the report shows that the VMA iterator is never set to the correct start position. Setting the initialization of the VMA iterator to use the address 'start' fixes this issue. Link: https://lkml.kernel.org/r/20230125135809.85262-1-Liam.Howlett@oracle.com Signed-off-by: Liam Howlett Reported-by: Signed-off-by: Andrew Morton --- mm/madvise.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/madvise.c b/mm/madvise.c index 7db6622f8293e..ca672e37b38c1 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -142,7 +142,7 @@ static int madvise_update_vma(struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; int error; pgoff_t pgoff; - VMA_ITERATOR(vmi, mm, 0); + VMA_ITERATOR(vmi, mm, start); if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) { *prev = vma; From c5a0714a3818db052481245aa7b3535131db52c0 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:35 -0500 Subject: [PATCH 061/143] mm: remove unnecessary write to vma iterator in __vma_adjust() If the vma start address is going to change due to an insert, then it is safe to not write the vma to the tree. The write of the insert vma will alter the tree as necessary. Link: https://lkml.kernel.org/r/20230120162650.984577-35-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 330de1ab6a8d8..bd3ccfb4d9e0e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -719,10 +719,12 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, } if (start != vma->vm_start) { - if ((vma->vm_start < start) && - (!insert || (insert->vm_end != start))) { - vma_iter_clear(vmi, vma->vm_start, start); - VM_WARN_ON(insert && insert->vm_start > vma->vm_start); + if (vma->vm_start < start) { + if (!insert || (insert->vm_end != start)) { + vma_iter_clear(vmi, vma->vm_start, start); + vma_iter_set(vmi, start); + VM_WARN_ON(insert && insert->vm_start > vma->vm_start); + } } else { vma_changed = true; } From 0fa010acf573bd94906e357b199cd33106966fd4 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:36 -0500 Subject: [PATCH 062/143] mm: pass vma iterator through to __vma_adjust() Pass the iterator through to be used in __vma_adjust(). The state of the iterator needs to be correct for the operation that will occur so make the adjustments. Link: https://lkml.kernel.org/r/20230120162650.984577-36-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index bd3ccfb4d9e0e..1bdf66b3b96ec 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -525,6 +525,10 @@ inline int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, vma_interval_tree_remove(vma, root); } + /* VMA iterator points to previous, so set to start if necessary */ + if (vma_iter_addr(vmi) != start) + vma_iter_set(vmi, start); + vma->vm_start = start; vma->vm_end = end; vma->vm_pgoff = pgoff; @@ -2164,13 +2168,13 @@ static void unmap_region(struct mm_struct *mm, struct maple_tree *mt, /* * __split_vma() bypasses sysctl_max_map_count checking. We use this where it * has already been checked or doesn't make sense to fail. + * VMA Iterator will point to the end VMA. */ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, int new_below) { struct vm_area_struct *new; int err; - unsigned long end = vma->vm_end; validate_mm_mt(vma->vm_mm); @@ -2206,14 +2210,17 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, new->vm_ops->open(new); if (new_below) - err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + - ((addr - new->vm_start) >> PAGE_SHIFT), new); + err = __vma_adjust(vmi, vma, addr, vma->vm_end, + vma->vm_pgoff + ((addr - new->vm_start) >> PAGE_SHIFT), + new, NULL); else - err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); + err = __vma_adjust(vmi, vma, vma->vm_start, addr, vma->vm_pgoff, + new, NULL); /* Success. */ if (!err) { - vma_iter_set(vmi, end); + if (new_below) + vma_next(vmi); return 0; } @@ -2308,8 +2315,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, if (error) goto start_split_failed; - vma_iter_set(vmi, start); - vma = vma_find(vmi, end); + vma = vma_iter_load(vmi); } prev = vma_prev(vmi); @@ -2329,7 +2335,6 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, if (error) goto end_split_failed; - vma_iter_set(vmi, end); split = vma_prev(vmi); error = munmap_sidetree(split, &mas_detach); if (error) @@ -2573,6 +2578,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, goto unacct_error; } + vma_iter_set(&vmi, addr); vma->vm_start = addr; vma->vm_end = end; vma->vm_flags = vm_flags; From e836f7bd1a8539221582c9de3e99e4cd4e371721 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:37 -0500 Subject: [PATCH 063/143] mm: add vma iterator to vma_adjust() arguments Change the vma_adjust() function definition to accept the vma iterator and pass it through to __vma_adjust(). Update fs/exec to use the new vma_adjust() function parameters. Update mm/mremap to use the new vma_adjust() function parameters. Revert the __split_vma() calls back from __vma_adjust() to vma_adjust() and pass through the vma iterator. Link: https://lkml.kernel.org/r/20230120162650.984577-37-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- fs/exec.c | 11 ++++------- include/linux/mm.h | 9 ++++----- mm/mmap.c | 10 +++++----- mm/mremap.c | 4 ++-- 4 files changed, 15 insertions(+), 19 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index b98647eeae9fe..76ee62e1d3f1b 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -699,7 +699,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) /* * cover the whole range: [new_start, old_end) */ - if (vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL)) + if (vma_adjust(&vmi, vma, new_start, old_end, vma->vm_pgoff, NULL)) return -ENOMEM; /* @@ -731,12 +731,9 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) } tlb_finish_mmu(&tlb); - /* - * Shrink the vma to just the new range. Always succeeds. - */ - vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL); - - return 0; + vma_prev(&vmi); + /* Shrink the vma to just the new range */ + return vma_adjust(&vmi, vma, new_start, new_end, vma->vm_pgoff, NULL); } /* diff --git a/include/linux/mm.h b/include/linux/mm.h index 6681dd54e9d3d..7d9d0d89c34df 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2840,12 +2840,11 @@ extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admi extern int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert, struct vm_area_struct *expand); -static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) +static inline int vma_adjust(struct vma_iterator *vmi, + struct vm_area_struct *vma, unsigned long start, unsigned long end, + pgoff_t pgoff, struct vm_area_struct *insert) { - VMA_ITERATOR(vmi, vma->vm_mm, start); - - return __vma_adjust(&vmi, vma, start, end, pgoff, insert, NULL); + return __vma_adjust(vmi, vma, start, end, pgoff, insert, NULL); } extern struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, diff --git a/mm/mmap.c b/mm/mmap.c index 1bdf66b3b96ec..f61e45caa32cf 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2210,12 +2210,12 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, new->vm_ops->open(new); if (new_below) - err = __vma_adjust(vmi, vma, addr, vma->vm_end, - vma->vm_pgoff + ((addr - new->vm_start) >> PAGE_SHIFT), - new, NULL); + err = vma_adjust(vmi, vma, addr, vma->vm_end, + vma->vm_pgoff + ((addr - new->vm_start) >> PAGE_SHIFT), + new); else - err = __vma_adjust(vmi, vma, vma->vm_start, addr, vma->vm_pgoff, - new, NULL); + err = vma_adjust(vmi, vma, vma->vm_start, addr, vma->vm_pgoff, + new); /* Success. */ if (!err) { diff --git a/mm/mremap.c b/mm/mremap.c index 71ba8eddd836b..2176f0cc7f9a1 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -1047,8 +1047,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, extension_end, vma->vm_flags, vma->anon_vma, vma->vm_file, extension_pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); - } else if (vma_adjust(vma, vma->vm_start, addr + new_len, - vma->vm_pgoff, NULL)) { + } else if (vma_adjust(&vmi, vma, vma->vm_start, + addr + new_len, vma->vm_pgoff, NULL)) { vma = NULL; } if (!vma) { From 3abcf6f8abe738cc07b609c7eec94068328d83a4 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:38 -0500 Subject: [PATCH 064/143] mmap: clean up mmap_region() unrolling Move logic of unrolling to the error path as apposed to duplicating it within the function body. This reduces the potential of missing an update to one path when making changes. Link: https://lkml.kernel.org/r/20230120162650.984577-38-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Li Zetao Signed-off-by: Andrew Morton --- mm/mmap.c | 45 ++++++++++++++++++--------------------------- 1 file changed, 18 insertions(+), 27 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index f61e45caa32cf..95ea613a43787 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2601,12 +2601,11 @@ unsigned long mmap_region(struct file *file, unsigned long addr, * Expansion is handled above, merging is handled below. * Drivers should not alter the address of the VMA. */ - if (WARN_ON((addr != vma->vm_start))) { - error = -EINVAL; + error = -EINVAL; + if (WARN_ON((addr != vma->vm_start))) goto close_and_free_vma; - } - vma_iter_set(&vmi, addr); + vma_iter_set(&vmi, addr); /* * If vm_flags changed after call_mmap(), we should try merge * vma again as we may succeed this time. @@ -2653,25 +2652,13 @@ unsigned long mmap_region(struct file *file, unsigned long addr, } /* Allow architectures to sanity-check the vm_flags */ - if (!arch_validate_flags(vma->vm_flags)) { - error = -EINVAL; - if (file) - goto close_and_free_vma; - else if (vma->vm_file) - goto unmap_and_free_vma; - else - goto free_vma; - } + error = -EINVAL; + if (!arch_validate_flags(vma->vm_flags)) + goto close_and_free_vma; - if (vma_iter_prealloc(&vmi)) { - error = -ENOMEM; - if (file) - goto close_and_free_vma; - else if (vma->vm_file) - goto unmap_and_free_vma; - else - goto free_vma; - } + error = -ENOMEM; + if (vma_iter_prealloc(&vmi)) + goto close_and_free_vma; if (vma->vm_file) i_mmap_lock_write(vma->vm_file->f_mapping); @@ -2730,14 +2717,18 @@ unsigned long mmap_region(struct file *file, unsigned long addr, return addr; close_and_free_vma: - if (vma->vm_ops && vma->vm_ops->close) + if (file && vma->vm_ops && vma->vm_ops->close) vma->vm_ops->close(vma); + + if (file || vma->vm_file) { unmap_and_free_vma: - fput(vma->vm_file); - vma->vm_file = NULL; + fput(vma->vm_file); + vma->vm_file = NULL; - /* Undo any partial mapping done by a device driver. */ - unmap_region(mm, &mm->mm_mt, vma, prev, next, vma->vm_start, vma->vm_end); + /* Undo any partial mapping done by a device driver. */ + unmap_region(mm, &mm->mm_mt, vma, prev, next, vma->vm_start, + vma->vm_end); + } if (file && (vm_flags & VM_SHARED)) mapping_unmap_writable(file->f_mapping); free_vma: From 841d184d65276bb04bee9549a132b1fb8462db71 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:39 -0500 Subject: [PATCH 065/143] mm: change munmap splitting order and move_vma() Splitting can be more efficient when the order is not of concern. Change do_vmi_align_munmap() to reduce walking of the tree during split operations. move_vma() must also be altered to remove the dependency of keeping the original VMA as the active part of the split. Transition to using vma iterator to look up the prev and/or next vma after munmap. Link: https://lkml.kernel.org/r/20230120162650.984577-39-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 18 ++---------------- mm/mremap.c | 27 ++++++++++++++++----------- 2 files changed, 18 insertions(+), 27 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 95ea613a43787..29ffd58d4091f 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2329,21 +2329,9 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, for_each_vma_range(*vmi, next, end) { /* Does it split the end? */ if (next->vm_end > end) { - struct vm_area_struct *split; - - error = __split_vma(vmi, next, end, 1); + error = __split_vma(vmi, next, end, 0); if (error) goto end_split_failed; - - split = vma_prev(vmi); - error = munmap_sidetree(split, &mas_detach); - if (error) - goto munmap_sidetree_failed; - - count++; - if (vma == next) - vma = split; - break; } error = munmap_sidetree(next, &mas_detach); if (error) @@ -2356,9 +2344,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, #endif } - if (!next) - next = vma_next(vmi); - + next = vma_next(vmi); if (unlikely(uf)) { /* * If userfaultfd_unmap_prep returns an error the vmas diff --git a/mm/mremap.c b/mm/mremap.c index 2176f0cc7f9a1..1bc81afd90de8 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -580,11 +580,12 @@ static unsigned long move_vma(struct vm_area_struct *vma, unsigned long vm_flags = vma->vm_flags; unsigned long new_pgoff; unsigned long moved_len; - unsigned long excess = 0; + unsigned long account_start = 0; + unsigned long account_end = 0; unsigned long hiwater_vm; - int split = 0; int err = 0; bool need_rmap_locks; + VMA_ITERATOR(vmi, mm, old_addr); /* * We'd prefer to avoid failure later on in do_munmap: @@ -662,10 +663,10 @@ static unsigned long move_vma(struct vm_area_struct *vma, /* Conceal VM_ACCOUNT so old reservation is not undone */ if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) { vma->vm_flags &= ~VM_ACCOUNT; - excess = vma->vm_end - vma->vm_start - old_len; - if (old_addr > vma->vm_start && - old_addr + old_len < vma->vm_end) - split = 1; + if (vma->vm_start < old_addr) + account_start = vma->vm_start; + if (vma->vm_end > old_addr + old_len) + account_end = vma->vm_end; } /* @@ -700,11 +701,11 @@ static unsigned long move_vma(struct vm_area_struct *vma, return new_addr; } - if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) { + if (do_vmi_munmap(&vmi, mm, old_addr, old_len, uf_unmap, false) < 0) { /* OOM: unable to split vma, just get accounts right */ if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) vm_acct_memory(old_len >> PAGE_SHIFT); - excess = 0; + account_start = account_end = 0; } if (vm_flags & VM_LOCKED) { @@ -715,10 +716,14 @@ static unsigned long move_vma(struct vm_area_struct *vma, mm->hiwater_vm = hiwater_vm; /* Restore VM_ACCOUNT if one or two pieces of vma left */ - if (excess) { + if (account_start) { + vma = vma_prev(&vmi); + vma->vm_flags |= VM_ACCOUNT; + } + + if (account_end) { + vma = vma_next(&vmi); vma->vm_flags |= VM_ACCOUNT; - if (split) - find_vma(mm, vma->vm_end)->vm_flags |= VM_ACCOUNT; } return new_addr; From 9212e8478f1956b3ad0fb52bf42dd0fffeaf4e6b Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Thu, 26 Jan 2023 16:20:11 -0500 Subject: [PATCH 066/143] mm/mremap: fix vma iterator initialization The vma iterator location is incorrect when there is a failure in the move_vma() function which alters the address being modified. Delay the initialization of the vma iterator until the address is stable. Link: https://lkml.kernel.org/r/20230126212011.980350-1-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Reported-by: Sanan Hasanov Link: https://lore.kernel.org/linux-mm/IA1PR07MB98306BC0F55667A760EABE91ABCE9@IA1PR07MB9830.namprd07.prod.outlook.com/ Signed-off-by: Andrew Morton --- mm/mremap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/mremap.c b/mm/mremap.c index 1bc81afd90de8..6c7f49ab7d197 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -585,7 +585,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, unsigned long hiwater_vm; int err = 0; bool need_rmap_locks; - VMA_ITERATOR(vmi, mm, old_addr); + struct vma_iterator vmi; /* * We'd prefer to avoid failure later on in do_munmap: @@ -701,6 +701,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, return new_addr; } + vma_iter_init(&vmi, mm, old_addr); if (do_vmi_munmap(&vmi, mm, old_addr, old_len, uf_unmap, false) < 0) { /* OOM: unable to split vma, just get accounts right */ if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) From ca8a3dd26ba3b1e649c442ad9c0836920aab1e6a Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:40 -0500 Subject: [PATCH 067/143] mm/mmap: move anon_vma setting in __vma_adjust() Move the anon_vma setting & warn_no up the function. This is done to clear up the locking later. Link: https://lkml.kernel.org/r/20230120162650.984577-40-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 29ffd58d4091f..12545ec9cdeb1 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -682,6 +682,14 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, if (vma_iter_prealloc(vmi)) return -ENOMEM; + anon_vma = vma->anon_vma; + if (!anon_vma && adjust_next) + anon_vma = next->anon_vma; + + if (anon_vma) + VM_WARN_ON(adjust_next && next->anon_vma && + anon_vma != next->anon_vma); + vma_adjust_trans_huge(orig_vma, start, end, adjust_next); if (file) { mapping = file->f_mapping; @@ -703,12 +711,7 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, } } - anon_vma = vma->anon_vma; - if (!anon_vma && adjust_next) - anon_vma = next->anon_vma; if (anon_vma) { - VM_WARN_ON(adjust_next && next->anon_vma && - anon_vma != next->anon_vma); anon_vma_lock_write(anon_vma); anon_vma_interval_tree_pre_update_vma(vma); if (adjust_next) From a8ce32c63baec17ff2968e8e34d583313d5c4d75 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:41 -0500 Subject: [PATCH 068/143] mm/mmap: refactor locking out of __vma_adjust() Move the locking into vma_prepare() and vma_complete() for use elsewhere Link: https://lkml.kernel.org/r/20230120162650.984577-41-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/internal.h | 14 +++ mm/mmap.c | 231 +++++++++++++++++++++++++++++--------------------- 2 files changed, 150 insertions(+), 95 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index ffd65248f266c..90bb2078444c1 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -941,4 +941,18 @@ static inline int vma_iter_store_gfp(struct vma_iterator *vmi, return 0; } + +/* + * VMA lock generalization + */ +struct vma_prepare { + struct vm_area_struct *vma; + struct vm_area_struct *adj_next; + struct file *file; + struct address_space *mapping; + struct anon_vma *anon_vma; + struct vm_area_struct *insert; + struct vm_area_struct *remove; + struct vm_area_struct *remove2; +}; #endif /* __MM_INTERNAL_H */ diff --git a/mm/mmap.c b/mm/mmap.c index 12545ec9cdeb1..1ef284b7e6fbe 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -573,6 +573,127 @@ inline int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, return -ENOMEM; } +/* + * vma_prepare() - Helper function for handling locking VMAs prior to altering + * @vp: The initialized vma_prepare struct + */ +static inline void vma_prepare(struct vma_prepare *vp) +{ + if (vp->file) { + uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end); + + if (vp->adj_next) + uprobe_munmap(vp->adj_next, vp->adj_next->vm_start, + vp->adj_next->vm_end); + + i_mmap_lock_write(vp->mapping); + if (vp->insert && vp->insert->vm_file) { + /* + * Put into interval tree now, so instantiated pages + * are visible to arm/parisc __flush_dcache_page + * throughout; but we cannot insert into address + * space until vma start or end is updated. + */ + __vma_link_file(vp->insert, + vp->insert->vm_file->f_mapping); + } + } + + if (vp->anon_vma) { + anon_vma_lock_write(vp->anon_vma); + anon_vma_interval_tree_pre_update_vma(vp->vma); + if (vp->adj_next) + anon_vma_interval_tree_pre_update_vma(vp->adj_next); + } + + if (vp->file) { + flush_dcache_mmap_lock(vp->mapping); + vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap); + if (vp->adj_next) + vma_interval_tree_remove(vp->adj_next, + &vp->mapping->i_mmap); + } + +} + +/* + * vma_complete- Helper function for handling the unlocking after altering VMAs, + * or for inserting a VMA. + * + * @vp: The vma_prepare struct + * @vmi: The vma iterator + * @mm: The mm_struct + */ +static inline void vma_complete(struct vma_prepare *vp, + struct vma_iterator *vmi, struct mm_struct *mm) +{ + if (vp->file) { + if (vp->adj_next) + vma_interval_tree_insert(vp->adj_next, + &vp->mapping->i_mmap); + vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap); + flush_dcache_mmap_unlock(vp->mapping); + } + + if (vp->remove && vp->file) { + __remove_shared_vm_struct(vp->remove, vp->file, vp->mapping); + if (vp->remove2) + __remove_shared_vm_struct(vp->remove2, vp->file, + vp->mapping); + } else if (vp->insert) { + /* + * split_vma has split insert from vma, and needs + * us to insert it before dropping the locks + * (it may either follow vma or precede it). + */ + vma_iter_store(vmi, vp->insert); + mm->map_count++; + } + + if (vp->anon_vma) { + anon_vma_interval_tree_post_update_vma(vp->vma); + if (vp->adj_next) + anon_vma_interval_tree_post_update_vma(vp->adj_next); + anon_vma_unlock_write(vp->anon_vma); + } + + if (vp->file) { + i_mmap_unlock_write(vp->mapping); + uprobe_mmap(vp->vma); + + if (vp->adj_next) + uprobe_mmap(vp->adj_next); + } + + if (vp->remove) { +again: + if (vp->file) { + uprobe_munmap(vp->remove, vp->remove->vm_start, + vp->remove->vm_end); + fput(vp->file); + } + if (vp->remove->anon_vma) + anon_vma_merge(vp->vma, vp->remove); + mm->map_count--; + mpol_put(vma_policy(vp->remove)); + if (!vp->remove2) + WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end); + vm_area_free(vp->remove); + + /* + * In mprotect's case 6 (see comments on vma_merge), + * we must remove next_next too. + */ + if (vp->remove2) { + vp->remove = vp->remove2; + vp->remove2 = NULL; + goto again; + } + } + if (vp->insert && vp->file) + uprobe_mmap(vp->insert); +} + /* * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that * is already present in an i_mmap tree without adjusting the tree. @@ -588,14 +709,13 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct *next_next = NULL; /* uninit var warning */ struct vm_area_struct *next = find_vma(mm, vma->vm_end); struct vm_area_struct *orig_vma = vma; - struct address_space *mapping = NULL; - struct rb_root_cached *root = NULL; struct anon_vma *anon_vma = NULL; struct file *file = vma->vm_file; bool vma_changed = false; long adjust_next = 0; int remove_next = 0; struct vm_area_struct *exporter = NULL, *importer = NULL; + struct vma_prepare vma_prep; if (next && !insert) { if (end >= next->vm_end) { @@ -691,39 +811,22 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, anon_vma != next->anon_vma); vma_adjust_trans_huge(orig_vma, start, end, adjust_next); - if (file) { - mapping = file->f_mapping; - root = &mapping->i_mmap; - uprobe_munmap(vma, vma->vm_start, vma->vm_end); - - if (adjust_next) - uprobe_munmap(next, next->vm_start, next->vm_end); - - i_mmap_lock_write(mapping); - if (insert && insert->vm_file) { - /* - * Put into interval tree now, so instantiated pages - * are visible to arm/parisc __flush_dcache_page - * throughout; but we cannot insert into address - * space until vma start or end is updated. - */ - __vma_link_file(insert, insert->vm_file->f_mapping); - } - } - if (anon_vma) { - anon_vma_lock_write(anon_vma); - anon_vma_interval_tree_pre_update_vma(vma); - if (adjust_next) - anon_vma_interval_tree_pre_update_vma(next); + memset(&vma_prep, 0, sizeof(vma_prep)); + vma_prep.vma = vma; + vma_prep.anon_vma = anon_vma; + vma_prep.file = file; + if (adjust_next) + vma_prep.adj_next = next; + if (file) + vma_prep.mapping = file->f_mapping; + vma_prep.insert = insert; + if (remove_next) { + vma_prep.remove = next; + vma_prep.remove2 = next_next; } - if (file) { - flush_dcache_mmap_lock(mapping); - vma_interval_tree_remove(vma, root); - if (adjust_next) - vma_interval_tree_remove(next, root); - } + vma_prepare(&vma_prep); if (start != vma->vm_start) { if (vma->vm_start < start) { @@ -761,69 +864,7 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, vma_iter_store(vmi, next); } - if (file) { - if (adjust_next) - vma_interval_tree_insert(next, root); - vma_interval_tree_insert(vma, root); - flush_dcache_mmap_unlock(mapping); - } - - if (remove_next && file) { - __remove_shared_vm_struct(next, file, mapping); - if (remove_next == 2) - __remove_shared_vm_struct(next_next, file, mapping); - } else if (insert) { - /* - * split_vma has split insert from vma, and needs - * us to insert it before dropping the locks - * (it may either follow vma or precede it). - */ - vma_iter_store(vmi, insert); - mm->map_count++; - } - - if (anon_vma) { - anon_vma_interval_tree_post_update_vma(vma); - if (adjust_next) - anon_vma_interval_tree_post_update_vma(next); - anon_vma_unlock_write(anon_vma); - } - - if (file) { - i_mmap_unlock_write(mapping); - uprobe_mmap(vma); - - if (adjust_next) - uprobe_mmap(next); - } - - if (remove_next) { -again: - if (file) { - uprobe_munmap(next, next->vm_start, next->vm_end); - fput(file); - } - if (next->anon_vma) - anon_vma_merge(vma, next); - mm->map_count--; - mpol_put(vma_policy(next)); - if (remove_next != 2) - BUG_ON(vma->vm_end < next->vm_end); - vm_area_free(next); - - /* - * In mprotect's case 6 (see comments on vma_merge), - * we must remove next_next too. - */ - if (remove_next == 2) { - remove_next = 1; - next = next_next; - goto again; - } - } - if (insert && file) - uprobe_mmap(insert); - + vma_complete(&vma_prep, vmi, mm); vma_iter_free(vmi); validate_mm(mm); From 91ea022153ea2c29b2a30da700362c641be8a5b9 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:42 -0500 Subject: [PATCH 069/143] mm/mmap: use vma_prepare() and vma_complete() in vma_expand() Use the new locking functions for vma_expand(). This reduces code duplication. At the same time change VM_BUG_ON() to VM_WARN_ON() Link: https://lkml.kernel.org/r/20230120162650.984577-42-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 188 +++++++++++++++++++++--------------------------------- 1 file changed, 72 insertions(+), 116 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 1ef284b7e6fbe..b5bf70db37779 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -457,122 +457,6 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) return 0; } -/* - * vma_expand - Expand an existing VMA - * - * @mas: The maple state - * @vma: The vma to expand - * @start: The start of the vma - * @end: The exclusive end of the vma - * @pgoff: The page offset of vma - * @next: The current of next vma. - * - * Expand @vma to @start and @end. Can expand off the start and end. Will - * expand over @next if it's different from @vma and @end == @next->vm_end. - * Checking if the @vma can expand and merge with @next needs to be handled by - * the caller. - * - * Returns: 0 on success - */ -inline int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long start, unsigned long end, pgoff_t pgoff, - struct vm_area_struct *next) -{ - struct mm_struct *mm = vma->vm_mm; - struct address_space *mapping = NULL; - struct rb_root_cached *root = NULL; - struct anon_vma *anon_vma = vma->anon_vma; - struct file *file = vma->vm_file; - bool remove_next = false; - - if (next && (vma != next) && (end == next->vm_end)) { - remove_next = true; - if (next->anon_vma && !vma->anon_vma) { - int error; - - anon_vma = next->anon_vma; - vma->anon_vma = anon_vma; - error = anon_vma_clone(vma, next); - if (error) - return error; - } - } - - /* Not merging but overwriting any part of next is not handled. */ - VM_BUG_ON(next && !remove_next && next != vma && end > next->vm_start); - /* Only handles expanding */ - VM_BUG_ON(vma->vm_start < start || vma->vm_end > end); - - if (vma_iter_prealloc(vmi)) - goto nomem; - - vma_adjust_trans_huge(vma, start, end, 0); - - if (file) { - mapping = file->f_mapping; - root = &mapping->i_mmap; - uprobe_munmap(vma, vma->vm_start, vma->vm_end); - i_mmap_lock_write(mapping); - } - - if (anon_vma) { - anon_vma_lock_write(anon_vma); - anon_vma_interval_tree_pre_update_vma(vma); - } - - if (file) { - flush_dcache_mmap_lock(mapping); - vma_interval_tree_remove(vma, root); - } - - /* VMA iterator points to previous, so set to start if necessary */ - if (vma_iter_addr(vmi) != start) - vma_iter_set(vmi, start); - - vma->vm_start = start; - vma->vm_end = end; - vma->vm_pgoff = pgoff; - vma_iter_store(vmi, vma); - - if (file) { - vma_interval_tree_insert(vma, root); - flush_dcache_mmap_unlock(mapping); - } - - /* Expanding over the next vma */ - if (remove_next && file) { - __remove_shared_vm_struct(next, file, mapping); - } - - if (anon_vma) { - anon_vma_interval_tree_post_update_vma(vma); - anon_vma_unlock_write(anon_vma); - } - - if (file) { - i_mmap_unlock_write(mapping); - uprobe_mmap(vma); - } - - if (remove_next) { - if (file) { - uprobe_munmap(next, next->vm_start, next->vm_end); - fput(file); - } - if (next->anon_vma) - anon_vma_merge(vma, next); - mm->map_count--; - mpol_put(vma_policy(next)); - vm_area_free(next); - } - - validate_mm(mm); - return 0; - -nomem: - return -ENOMEM; -} - /* * vma_prepare() - Helper function for handling locking VMAs prior to altering * @vp: The initialized vma_prepare struct @@ -694,6 +578,78 @@ static inline void vma_complete(struct vma_prepare *vp, uprobe_mmap(vp->insert); } +/* + * vma_expand - Expand an existing VMA + * + * @vmi: The vma iterator + * @vma: The vma to expand + * @start: The start of the vma + * @end: The exclusive end of the vma + * @pgoff: The page offset of vma + * @next: The current of next vma. + * + * Expand @vma to @start and @end. Can expand off the start and end. Will + * expand over @next if it's different from @vma and @end == @next->vm_end. + * Checking if the @vma can expand and merge with @next needs to be handled by + * the caller. + * + * Returns: 0 on success + */ +inline int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long start, unsigned long end, pgoff_t pgoff, + struct vm_area_struct *next) +{ + struct vma_prepare vp; + + memset(&vp, 0, sizeof(vp)); + vp.vma = vma; + vp.anon_vma = vma->anon_vma; + if (next && (vma != next) && (end == next->vm_end)) { + vp.remove = next; + if (next->anon_vma && !vma->anon_vma) { + int error; + + vp.anon_vma = next->anon_vma; + vma->anon_vma = next->anon_vma; + error = anon_vma_clone(vma, next); + if (error) + return error; + } + } + + /* Not merging but overwriting any part of next is not handled. */ + VM_WARN_ON(next && !vp.remove && + next != vma && end > next->vm_start); + /* Only handles expanding */ + VM_WARN_ON(vma->vm_start < start || vma->vm_end > end); + + if (vma_iter_prealloc(vmi)) + goto nomem; + + vma_adjust_trans_huge(vma, start, end, 0); + + vp.file = vma->vm_file; + if (vp.file) + vp.mapping = vp.file->f_mapping; + + /* VMA iterator points to previous, so set to start if necessary */ + if (vma_iter_addr(vmi) != start) + vma_iter_set(vmi, start); + + vma_prepare(&vp); + vma->vm_start = start; + vma->vm_end = end; + vma->vm_pgoff = pgoff; + /* Note: mas must be pointing to the expanding VMA */ + vma_iter_store(vmi, vma); + + vma_complete(&vp, vmi, vma->vm_mm); + validate_mm(vma->vm_mm); + return 0; + +nomem: + return -ENOMEM; +} /* * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that * is already present in an i_mmap tree without adjusting the tree. From f4dcb8936c49accf97a0ba7f96ed6ef056530543 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:43 -0500 Subject: [PATCH 070/143] mm/mmap: introduce init_vma_prep() and init_multi_vma_prep() Add init_vma_prep() and init_multi_vma_prep() to set up the struct vma_prepare. This is to abstract the locking when adjusting the VMAs. Also change __vma_adjust() variable remove_next int in favour of a pointer to the VMA to remove. Rename next_next to remove2 since this better reflects its use. Link: https://lkml.kernel.org/r/20230120162650.984577-43-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 108 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 61 insertions(+), 47 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index b5bf70db37779..e259b33fcb525 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -457,6 +457,45 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) return 0; } +/* + * init_multi_vma_prep() - Initializer for struct vma_prepare + * @vp: The vma_prepare struct + * @vma: The vma that will be altered once locked + * @next: The next vma if it is to be adjusted + * @remove: The first vma to be removed + * @remove2: The second vma to be removed + */ +static inline void init_multi_vma_prep(struct vma_prepare *vp, + struct vm_area_struct *vma, struct vm_area_struct *next, + struct vm_area_struct *remove, struct vm_area_struct *remove2) +{ + memset(vp, 0, sizeof(struct vma_prepare)); + vp->vma = vma; + vp->anon_vma = vma->anon_vma; + vp->remove = remove; + vp->remove2 = remove2; + vp->adj_next = next; + if (!vp->anon_vma && next) + vp->anon_vma = next->anon_vma; + + vp->file = vma->vm_file; + if (vp->file) + vp->mapping = vma->vm_file->f_mapping; + +} + +/* + * init_vma_prep() - Initializer wrapper for vma_prepare struct + * @vp: The vma_prepare struct + * @vma: The vma that will be altered once locked + */ +static inline void init_vma_prep(struct vma_prepare *vp, + struct vm_area_struct *vma) +{ + init_multi_vma_prep(vp, vma, NULL, NULL, NULL); +} + + /* * vma_prepare() - Helper function for handling locking VMAs prior to altering * @vp: The initialized vma_prepare struct @@ -566,7 +605,7 @@ static inline void vma_complete(struct vma_prepare *vp, /* * In mprotect's case 6 (see comments on vma_merge), - * we must remove next_next too. + * we must remove the one after next as well. */ if (vp->remove2) { vp->remove = vp->remove2; @@ -599,17 +638,14 @@ inline int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff, struct vm_area_struct *next) { + bool remove_next = false; struct vma_prepare vp; - memset(&vp, 0, sizeof(vp)); - vp.vma = vma; - vp.anon_vma = vma->anon_vma; if (next && (vma != next) && (end == next->vm_end)) { - vp.remove = next; + remove_next = true; if (next->anon_vma && !vma->anon_vma) { int error; - vp.anon_vma = next->anon_vma; vma->anon_vma = next->anon_vma; error = anon_vma_clone(vma, next); if (error) @@ -617,6 +653,7 @@ inline int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, } } + init_multi_vma_prep(&vp, vma, NULL, remove_next ? next : NULL, NULL); /* Not merging but overwriting any part of next is not handled. */ VM_WARN_ON(next && !vp.remove && next != vma && end > next->vm_start); @@ -627,11 +664,6 @@ inline int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, goto nomem; vma_adjust_trans_huge(vma, start, end, 0); - - vp.file = vma->vm_file; - if (vp.file) - vp.mapping = vp.file->f_mapping; - /* VMA iterator points to previous, so set to start if necessary */ if (vma_iter_addr(vmi) != start) vma_iter_set(vmi, start); @@ -662,14 +694,13 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct *insert, struct vm_area_struct *expand) { struct mm_struct *mm = vma->vm_mm; - struct vm_area_struct *next_next = NULL; /* uninit var warning */ + struct vm_area_struct *remove2 = NULL; + struct vm_area_struct *remove = NULL; struct vm_area_struct *next = find_vma(mm, vma->vm_end); struct vm_area_struct *orig_vma = vma; - struct anon_vma *anon_vma = NULL; struct file *file = vma->vm_file; bool vma_changed = false; long adjust_next = 0; - int remove_next = 0; struct vm_area_struct *exporter = NULL, *importer = NULL; struct vma_prepare vma_prep; @@ -688,25 +719,24 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, */ VM_WARN_ON(end != next->vm_end); /* - * remove_next == 3 means we're - * removing "vma" and that to do so we + * we're removing "vma" and that to do so we * swapped "vma" and "next". */ - remove_next = 3; VM_WARN_ON(file != next->vm_file); swap(vma, next); + remove = next; } else { VM_WARN_ON(expand != vma); /* - * case 1, 6, 7, remove_next == 2 is case 6, - * remove_next == 1 is case 1 or 7. + * case 1, 6, 7, remove next. + * case 6 also removes the one beyond next */ - remove_next = 1 + (end > next->vm_end); - if (remove_next == 2) - next_next = find_vma(mm, next->vm_end); + remove = next; + if (end > next->vm_end) + remove2 = find_vma(mm, next->vm_end); - VM_WARN_ON(remove_next == 2 && - end != next_next->vm_end); + VM_WARN_ON(remove2 != NULL && + end != remove2->vm_end); } exporter = next; @@ -716,8 +746,8 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, * If next doesn't have anon_vma, import from vma after * next, if the vma overlaps with it. */ - if (remove_next == 2 && !next->anon_vma) - exporter = next_next; + if (remove2 != NULL && !next->anon_vma) + exporter = remove2; } else if (end > next->vm_start) { /* @@ -758,30 +788,14 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, if (vma_iter_prealloc(vmi)) return -ENOMEM; - anon_vma = vma->anon_vma; - if (!anon_vma && adjust_next) - anon_vma = next->anon_vma; - - if (anon_vma) - VM_WARN_ON(adjust_next && next->anon_vma && - anon_vma != next->anon_vma); - vma_adjust_trans_huge(orig_vma, start, end, adjust_next); - memset(&vma_prep, 0, sizeof(vma_prep)); - vma_prep.vma = vma; - vma_prep.anon_vma = anon_vma; - vma_prep.file = file; - if (adjust_next) - vma_prep.adj_next = next; - if (file) - vma_prep.mapping = file->f_mapping; - vma_prep.insert = insert; - if (remove_next) { - vma_prep.remove = next; - vma_prep.remove2 = next_next; - } + init_multi_vma_prep(&vma_prep, vma, adjust_next ? next : NULL, remove, + remove2); + VM_WARN_ON(vma_prep.anon_vma && adjust_next && next->anon_vma && + vma_prep.anon_vma != next->anon_vma); + vma_prep.insert = insert; vma_prepare(&vma_prep); if (start != vma->vm_start) { From 2422c2a42d83d60fbc99fb62a45a2cf52d74e947 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:44 -0500 Subject: [PATCH 071/143] mm: don't use __vma_adjust() in __split_vma() Use the abstracted locking and maple tree operations. Since __split_vma() is the only user of the __vma_adjust() function to use the insert argument, drop that argument. Remove the NULL passed through from fs/exec's shift_arg_pages() and mremap() at the same time. Link: https://lkml.kernel.org/r/20230120162650.984577-44-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- fs/exec.c | 4 +- include/linux/mm.h | 7 ++- mm/mmap.c | 118 +++++++++++++++++++++------------------------ mm/mremap.c | 2 +- 4 files changed, 61 insertions(+), 70 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 76ee62e1d3f1b..d52fca2dd30b4 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -699,7 +699,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) /* * cover the whole range: [new_start, old_end) */ - if (vma_adjust(&vmi, vma, new_start, old_end, vma->vm_pgoff, NULL)) + if (vma_adjust(&vmi, vma, new_start, old_end, vma->vm_pgoff)) return -ENOMEM; /* @@ -733,7 +733,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) vma_prev(&vmi); /* Shrink the vma to just the new range */ - return vma_adjust(&vmi, vma, new_start, new_end, vma->vm_pgoff, NULL); + return vma_adjust(&vmi, vma, new_start, new_end, vma->vm_pgoff); } /* diff --git a/include/linux/mm.h b/include/linux/mm.h index 7d9d0d89c34df..f41ff4e7daaca 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2838,13 +2838,12 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node); /* mmap.c */ extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); extern int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert, - struct vm_area_struct *expand); + unsigned long end, pgoff_t pgoff, struct vm_area_struct *expand); static inline int vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, - pgoff_t pgoff, struct vm_area_struct *insert) + pgoff_t pgoff) { - return __vma_adjust(vmi, vma, start, end, pgoff, insert, NULL); + return __vma_adjust(vmi, vma, start, end, pgoff, NULL); } extern struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, diff --git a/mm/mmap.c b/mm/mmap.c index e259b33fcb525..3d08d7d243f05 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -691,7 +691,7 @@ inline int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, */ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff, - struct vm_area_struct *insert, struct vm_area_struct *expand) + struct vm_area_struct *expand) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *remove2 = NULL; @@ -704,7 +704,7 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct *exporter = NULL, *importer = NULL; struct vma_prepare vma_prep; - if (next && !insert) { + if (next) { if (end >= next->vm_end) { /* * vma expands, overlapping all the next, and @@ -795,39 +795,25 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, VM_WARN_ON(vma_prep.anon_vma && adjust_next && next->anon_vma && vma_prep.anon_vma != next->anon_vma); - vma_prep.insert = insert; vma_prepare(&vma_prep); - if (start != vma->vm_start) { - if (vma->vm_start < start) { - if (!insert || (insert->vm_end != start)) { - vma_iter_clear(vmi, vma->vm_start, start); - vma_iter_set(vmi, start); - VM_WARN_ON(insert && insert->vm_start > vma->vm_start); - } - } else { - vma_changed = true; - } - vma->vm_start = start; - } - if (end != vma->vm_end) { - if (vma->vm_end > end) { - if (!insert || (insert->vm_start != end)) { - vma_iter_clear(vmi, end, vma->vm_end); - vma_iter_set(vmi, vma->vm_end); - VM_WARN_ON(insert && - insert->vm_end < vma->vm_end); - } - } else { - vma_changed = true; - } - vma->vm_end = end; - } + if (vma->vm_start < start) + vma_iter_clear(vmi, vma->vm_start, start); + else if (start != vma->vm_start) + vma_changed = true; + + if (vma->vm_end > end) + vma_iter_clear(vmi, end, vma->vm_end); + else if (end != vma->vm_end) + vma_changed = true; + + vma->vm_start = start; + vma->vm_end = end; + vma->vm_pgoff = pgoff; if (vma_changed) vma_iter_store(vmi, vma); - vma->vm_pgoff = pgoff; if (adjust_next) { next->vm_start += adjust_next; next->vm_pgoff += adjust_next >> PAGE_SHIFT; @@ -846,9 +832,9 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, * per-vma resources, so we don't attempt to merge those. */ static inline int is_mergeable_vma(struct vm_area_struct *vma, - struct file *file, unsigned long vm_flags, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - struct anon_vma_name *anon_name) + struct file *file, unsigned long vm_flags, + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + struct anon_vma_name *anon_name) { /* * VM_SOFTDIRTY should not prevent from VMA merging, if we @@ -1030,20 +1016,19 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { /* cases 1, 6 */ err = __vma_adjust(vmi, prev, prev->vm_start, - next->vm_end, prev->vm_pgoff, NULL, - prev); + next->vm_end, prev->vm_pgoff, prev); res = prev; } else if (merge_prev) { /* cases 2, 5, 7 */ err = __vma_adjust(vmi, prev, prev->vm_start, - end, prev->vm_pgoff, NULL, prev); + end, prev->vm_pgoff, prev); res = prev; } else if (merge_next) { if (prev && addr < prev->vm_end) /* case 4 */ err = __vma_adjust(vmi, prev, prev->vm_start, - addr, prev->vm_pgoff, NULL, next); + addr, prev->vm_pgoff, next); else /* cases 3, 8 */ err = __vma_adjust(vmi, mid, addr, next->vm_end, - next->vm_pgoff - pglen, NULL, next); + next->vm_pgoff - pglen, next); res = next; } @@ -2187,11 +2172,15 @@ static void unmap_region(struct mm_struct *mm, struct maple_tree *mt, int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, int new_below) { + struct vma_prepare vp; struct vm_area_struct *new; int err; validate_mm_mt(vma->vm_mm); + WARN_ON(vma->vm_start >= addr); + WARN_ON(vma->vm_end <= addr); + if (vma->vm_ops && vma->vm_ops->may_split) { err = vma->vm_ops->may_split(vma, addr); if (err) @@ -2202,16 +2191,20 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, if (!new) return -ENOMEM; - if (new_below) + err = -ENOMEM; + if (vma_iter_prealloc(vmi)) + goto out_free_vma; + + if (new_below) { new->vm_end = addr; - else { + } else { new->vm_start = addr; new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); } err = vma_dup_policy(vma, new); if (err) - goto out_free_vma; + goto out_free_vmi; err = anon_vma_clone(new, vma); if (err) @@ -2223,33 +2216,32 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, if (new->vm_ops && new->vm_ops->open) new->vm_ops->open(new); - if (new_below) - err = vma_adjust(vmi, vma, addr, vma->vm_end, - vma->vm_pgoff + ((addr - new->vm_start) >> PAGE_SHIFT), - new); - else - err = vma_adjust(vmi, vma, vma->vm_start, addr, vma->vm_pgoff, - new); + vma_adjust_trans_huge(vma, vma->vm_start, addr, 0); + init_vma_prep(&vp, vma); + vp.insert = new; + vma_prepare(&vp); - /* Success. */ - if (!err) { - if (new_below) - vma_next(vmi); - return 0; + if (new_below) { + vma->vm_start = addr; + vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT; + } else { + vma->vm_end = addr; } - /* Avoid vm accounting in close() operation */ - new->vm_start = new->vm_end; - new->vm_pgoff = 0; - /* Clean everything up if vma_adjust failed. */ - if (new->vm_ops && new->vm_ops->close) - new->vm_ops->close(new); - if (new->vm_file) - fput(new->vm_file); - unlink_anon_vmas(new); - out_free_mpol: + /* vma_complete stores the new vma */ + vma_complete(&vp, vmi, vma->vm_mm); + + /* Success. */ + if (new_below) + vma_next(vmi); + validate_mm_mt(vma->vm_mm); + return 0; + +out_free_mpol: mpol_put(vma_policy(new)); - out_free_vma: +out_free_vmi: + vma_iter_free(vmi); +out_free_vma: vm_area_free(new); validate_mm_mt(vma->vm_mm); return err; diff --git a/mm/mremap.c b/mm/mremap.c index 6c7f49ab7d197..b98185f48ba5f 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -1054,7 +1054,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, vma->vm_file, extension_pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); } else if (vma_adjust(&vmi, vma, vma->vm_start, - addr + new_len, vma->vm_pgoff, NULL)) { + addr + new_len, vma->vm_pgoff)) { vma = NULL; } if (!vma) { From 6d9d3c415f27a28412bca10964fef9170a3440b1 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:45 -0500 Subject: [PATCH 072/143] mm/mremap: convert vma_adjust() to vma_expand() Stop using vma_adjust() in preparation for removing the function. Export vma_expand() to use instead. Link: https://lkml.kernel.org/r/20230120162650.984577-45-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- include/linux/mm.h | 3 +++ mm/mmap.c | 6 +++--- mm/mremap.c | 4 ++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index f41ff4e7daaca..cea399a0b25ef 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2845,6 +2845,9 @@ static inline int vma_adjust(struct vma_iterator *vmi, { return __vma_adjust(vmi, vma, start, end, pgoff, NULL); } +extern int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long start, unsigned long end, pgoff_t pgoff, + struct vm_area_struct *next); extern struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *, diff --git a/mm/mmap.c b/mm/mmap.c index 3d08d7d243f05..9599db011b184 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -634,9 +634,9 @@ static inline void vma_complete(struct vma_prepare *vp, * * Returns: 0 on success */ -inline int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long start, unsigned long end, pgoff_t pgoff, - struct vm_area_struct *next) +int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long start, unsigned long end, pgoff_t pgoff, + struct vm_area_struct *next) { bool remove_next = false; struct vma_prepare vp; diff --git a/mm/mremap.c b/mm/mremap.c index b98185f48ba5f..5c9a579098628 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -1053,8 +1053,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, extension_end, vma->vm_flags, vma->anon_vma, vma->vm_file, extension_pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); - } else if (vma_adjust(&vmi, vma, vma->vm_start, - addr + new_len, vma->vm_pgoff)) { + } else if (vma_expand(&vmi, vma, vma->vm_start, + addr + new_len, vma->vm_pgoff, NULL)) { vma = NULL; } if (!vma) { From c6c21cfbc30e02c4e4de82aa653128c2148729fb Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:46 -0500 Subject: [PATCH 073/143] mm/mmap: don't use __vma_adjust() in shift_arg_pages() Introduce shrink_vma() which uses the vma_prepare() and vma_complete() functions to reduce the vma coverage. Convert shift_arg_pages() to use expand_vma() and the new shrink_vma() function. Remove support from __vma_adjust() to reduce a vma size since shift_arg_pages() is the only user that shrinks a VMA in this way. Link: https://lkml.kernel.org/r/20230120162650.984577-46-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- fs/exec.c | 4 ++-- include/linux/mm.h | 10 ++------- mm/mmap.c | 52 ++++++++++++++++++++++++++++++++++++++-------- 3 files changed, 47 insertions(+), 19 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index d52fca2dd30b4..c0df813d2b455 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -699,7 +699,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) /* * cover the whole range: [new_start, old_end) */ - if (vma_adjust(&vmi, vma, new_start, old_end, vma->vm_pgoff)) + if (vma_expand(&vmi, vma, new_start, old_end, vma->vm_pgoff, NULL)) return -ENOMEM; /* @@ -733,7 +733,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) vma_prev(&vmi); /* Shrink the vma to just the new range */ - return vma_adjust(&vmi, vma, new_start, new_end, vma->vm_pgoff); + return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff); } /* diff --git a/include/linux/mm.h b/include/linux/mm.h index cea399a0b25ef..a87cfab250868 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2837,17 +2837,11 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node); /* mmap.c */ extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); -extern int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgoff_t pgoff, struct vm_area_struct *expand); -static inline int vma_adjust(struct vma_iterator *vmi, - struct vm_area_struct *vma, unsigned long start, unsigned long end, - pgoff_t pgoff) -{ - return __vma_adjust(vmi, vma, start, end, pgoff, NULL); -} extern int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff, struct vm_area_struct *next); +extern int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long start, unsigned long end, pgoff_t pgoff); extern struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *, diff --git a/mm/mmap.c b/mm/mmap.c index 9599db011b184..07b52acfd5658 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -682,6 +682,44 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, nomem: return -ENOMEM; } + +/* + * vma_shrink() - Reduce an existing VMAs memory area + * @vmi: The vma iterator + * @vma: The VMA to modify + * @start: The new start + * @end: The new end + * + * Returns: 0 on success, -ENOMEM otherwise + */ +int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long start, unsigned long end, pgoff_t pgoff) +{ + struct vma_prepare vp; + + WARN_ON((vma->vm_start != start) && (vma->vm_end != end)); + + if (vma_iter_prealloc(vmi)) + return -ENOMEM; + + init_vma_prep(&vp, vma); + vma_adjust_trans_huge(vma, start, end, 0); + vma_prepare(&vp); + + if (vma->vm_start < start) + vma_iter_clear(vmi, vma->vm_start, start); + + if (vma->vm_end > end) + vma_iter_clear(vmi, end, vma->vm_end); + + vma->vm_start = start; + vma->vm_end = end; + vma->vm_pgoff = pgoff; + vma_complete(&vp, vmi, vma->vm_mm); + validate_mm(vma->vm_mm); + return 0; +} + /* * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that * is already present in an i_mmap tree without adjusting the tree. @@ -797,14 +835,7 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, vma_prepare(&vma_prep); - if (vma->vm_start < start) - vma_iter_clear(vmi, vma->vm_start, start); - else if (start != vma->vm_start) - vma_changed = true; - - if (vma->vm_end > end) - vma_iter_clear(vmi, end, vma->vm_end); - else if (end != vma->vm_end) + if (start < vma->vm_start || end > vma->vm_end) vma_changed = true; vma->vm_start = start; @@ -817,7 +848,10 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, if (adjust_next) { next->vm_start += adjust_next; next->vm_pgoff += adjust_next >> PAGE_SHIFT; - vma_iter_store(vmi, next); + if (adjust_next < 0) { + WARN_ON_ONCE(vma_changed); + vma_iter_store(vmi, next); + } } vma_complete(&vma_prep, vmi, mm); From d2ce8f52d9fcf62b1b2f361bea4f1f29f39babf3 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:47 -0500 Subject: [PATCH 074/143] mm/mmap: introduce dup_vma_anon() helper Create a helper for duplicating the anon vma when adjusting the vma. This simplifies the logic of __vma_adjust(). Link: https://lkml.kernel.org/r/20230120162650.984577-47-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 74 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 40 insertions(+), 34 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 07b52acfd5658..265c4605dad2c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -617,6 +617,29 @@ static inline void vma_complete(struct vma_prepare *vp, uprobe_mmap(vp->insert); } +/* + * dup_anon_vma() - Helper function to duplicate anon_vma + * @dst: The destination VMA + * @src: The source VMA + * + * Returns: 0 on success. + */ +static inline int dup_anon_vma(struct vm_area_struct *dst, + struct vm_area_struct *src) +{ + /* + * Easily overlooked: when mprotect shifts the boundary, make sure the + * expanding vma has anon_vma set if the shrinking vma had, to cover any + * anon pages imported. + */ + if (src->anon_vma && !dst->anon_vma) { + dst->anon_vma = src->anon_vma; + return anon_vma_clone(dst, src); + } + + return 0; +} + /* * vma_expand - Expand an existing VMA * @@ -642,15 +665,12 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vma_prepare vp; if (next && (vma != next) && (end == next->vm_end)) { - remove_next = true; - if (next->anon_vma && !vma->anon_vma) { - int error; + int ret; - vma->anon_vma = next->anon_vma; - error = anon_vma_clone(vma, next); - if (error) - return error; - } + remove_next = true; + ret = dup_anon_vma(vma, next); + if (ret) + return ret; } init_multi_vma_prep(&vp, vma, NULL, remove_next ? next : NULL, NULL); @@ -739,10 +759,11 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, struct file *file = vma->vm_file; bool vma_changed = false; long adjust_next = 0; - struct vm_area_struct *exporter = NULL, *importer = NULL; struct vma_prepare vma_prep; if (next) { + int error = 0; + if (end >= next->vm_end) { /* * vma expands, overlapping all the next, and @@ -777,15 +798,14 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, end != remove2->vm_end); } - exporter = next; - importer = vma; - /* * If next doesn't have anon_vma, import from vma after * next, if the vma overlaps with it. */ - if (remove2 != NULL && !next->anon_vma) - exporter = remove2; + if (remove != NULL && !next->anon_vma) + error = dup_anon_vma(vma, remove2); + else + error = dup_anon_vma(vma, remove); } else if (end > next->vm_start) { /* @@ -793,9 +813,8 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, * mprotect case 5 shifting the boundary up. */ adjust_next = (end - next->vm_start); - exporter = next; - importer = vma; - VM_WARN_ON(expand != importer); + VM_WARN_ON(expand != vma); + error = dup_anon_vma(vma, next); } else if (end < vma->vm_end) { /* * vma shrinks, and !insert tells it's not @@ -803,24 +822,11 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, * mprotect case 4 shifting the boundary down. */ adjust_next = -(vma->vm_end - end); - exporter = vma; - importer = next; - VM_WARN_ON(expand != importer); - } - - /* - * Easily overlooked: when mprotect shifts the boundary, - * make sure the expanding vma has anon_vma set if the - * shrinking vma had, to cover any anon pages imported. - */ - if (exporter && exporter->anon_vma && !importer->anon_vma) { - int error; - - importer->anon_vma = exporter->anon_vma; - error = anon_vma_clone(importer, exporter); - if (error) - return error; + VM_WARN_ON(expand != next); + error = dup_anon_vma(next, vma); } + if (error) + return error; } if (vma_iter_prealloc(vmi)) From d9902c310f4de0ea2bf5e2dc79d83c806bef7591 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:48 -0500 Subject: [PATCH 075/143] mm/mmap: convert do_brk_flags() to use vma_prepare() and vma_complete() Use the abstracted vma locking for do_brk_flags() Link: https://lkml.kernel.org/r/20230120162650.984577-48-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 265c4605dad2c..604ba8293a95e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2936,6 +2936,7 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, unsigned long len, unsigned long flags) { struct mm_struct *mm = current->mm; + struct vma_prepare vp; validate_mm_mt(mm); /* @@ -2963,18 +2964,13 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, goto unacct_fail; vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0); - if (vma->anon_vma) { - anon_vma_lock_write(vma->anon_vma); - anon_vma_interval_tree_pre_update_vma(vma); - } + init_vma_prep(&vp, vma); + vma_prepare(&vp); vma->vm_end = addr + len; vma->vm_flags |= VM_SOFTDIRTY; vma_iter_store(vmi, vma); - if (vma->anon_vma) { - anon_vma_interval_tree_post_update_vma(vma); - anon_vma_unlock_write(vma->anon_vma); - } + vma_complete(&vp, vmi, mm); khugepaged_enter_vma(vma, flags); goto out; } From 4c7d57f878b3b12792c521dde6c30f3d6fbc4f71 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:49 -0500 Subject: [PATCH 076/143] mm/mmap: remove __vma_adjust() Inline the work of __vma_adjust() into vma_merge(). This reduces code size and has the added benefits of the comments for the cases being located with the code. Change the comments referencing vma_adjust() accordingly. Link: https://lkml.kernel.org/r/20230120162650.984577-49-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- kernel/events/uprobes.c | 2 +- mm/filemap.c | 2 +- mm/mmap.c | 250 ++++++++++++++++------------------------ mm/rmap.c | 15 +-- 4 files changed, 107 insertions(+), 162 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 1a3904e0179c7..59887c69d54c1 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1351,7 +1351,7 @@ static int delayed_ref_ctr_inc(struct vm_area_struct *vma) } /* - * Called from mmap_region/vma_adjust with mm->mmap_lock acquired. + * Called from mmap_region/vma_merge with mm->mmap_lock acquired. * * Currently we ignore all errors and always return 0, the callers * can't handle the failure anyway. diff --git a/mm/filemap.c b/mm/filemap.c index c915ded191f03..992554c18f1f8 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -97,7 +97,7 @@ * ->i_pages lock (__sync_single_inode) * * ->i_mmap_rwsem - * ->anon_vma.lock (vma_adjust) + * ->anon_vma.lock (vma_merge) * * ->anon_vma.lock * ->page_table_lock or pte_lock (anon_vma_prepare and various) diff --git a/mm/mmap.c b/mm/mmap.c index 604ba8293a95e..fb48cf14cb4ad 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -740,133 +740,6 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, return 0; } -/* - * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that - * is already present in an i_mmap tree without adjusting the tree. - * The following helper function should be used when such adjustments - * are necessary. The "insert" vma (if any) is to be inserted - * before we drop the necessary locks. - */ -int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long start, unsigned long end, pgoff_t pgoff, - struct vm_area_struct *expand) -{ - struct mm_struct *mm = vma->vm_mm; - struct vm_area_struct *remove2 = NULL; - struct vm_area_struct *remove = NULL; - struct vm_area_struct *next = find_vma(mm, vma->vm_end); - struct vm_area_struct *orig_vma = vma; - struct file *file = vma->vm_file; - bool vma_changed = false; - long adjust_next = 0; - struct vma_prepare vma_prep; - - if (next) { - int error = 0; - - if (end >= next->vm_end) { - /* - * vma expands, overlapping all the next, and - * perhaps the one after too (mprotect case 6). - * The only other cases that gets here are - * case 1, case 7 and case 8. - */ - if (next == expand) { - /* - * The only case where we don't expand "vma" - * and we expand "next" instead is case 8. - */ - VM_WARN_ON(end != next->vm_end); - /* - * we're removing "vma" and that to do so we - * swapped "vma" and "next". - */ - VM_WARN_ON(file != next->vm_file); - swap(vma, next); - remove = next; - } else { - VM_WARN_ON(expand != vma); - /* - * case 1, 6, 7, remove next. - * case 6 also removes the one beyond next - */ - remove = next; - if (end > next->vm_end) - remove2 = find_vma(mm, next->vm_end); - - VM_WARN_ON(remove2 != NULL && - end != remove2->vm_end); - } - - /* - * If next doesn't have anon_vma, import from vma after - * next, if the vma overlaps with it. - */ - if (remove != NULL && !next->anon_vma) - error = dup_anon_vma(vma, remove2); - else - error = dup_anon_vma(vma, remove); - - } else if (end > next->vm_start) { - /* - * vma expands, overlapping part of the next: - * mprotect case 5 shifting the boundary up. - */ - adjust_next = (end - next->vm_start); - VM_WARN_ON(expand != vma); - error = dup_anon_vma(vma, next); - } else if (end < vma->vm_end) { - /* - * vma shrinks, and !insert tells it's not - * split_vma inserting another: so it must be - * mprotect case 4 shifting the boundary down. - */ - adjust_next = -(vma->vm_end - end); - VM_WARN_ON(expand != next); - error = dup_anon_vma(next, vma); - } - if (error) - return error; - } - - if (vma_iter_prealloc(vmi)) - return -ENOMEM; - - vma_adjust_trans_huge(orig_vma, start, end, adjust_next); - - init_multi_vma_prep(&vma_prep, vma, adjust_next ? next : NULL, remove, - remove2); - VM_WARN_ON(vma_prep.anon_vma && adjust_next && next->anon_vma && - vma_prep.anon_vma != next->anon_vma); - - vma_prepare(&vma_prep); - - if (start < vma->vm_start || end > vma->vm_end) - vma_changed = true; - - vma->vm_start = start; - vma->vm_end = end; - vma->vm_pgoff = pgoff; - - if (vma_changed) - vma_iter_store(vmi, vma); - - if (adjust_next) { - next->vm_start += adjust_next; - next->vm_pgoff += adjust_next >> PAGE_SHIFT; - if (adjust_next < 0) { - WARN_ON_ONCE(vma_changed); - vma_iter_store(vmi, next); - } - } - - vma_complete(&vma_prep, vmi, mm); - vma_iter_free(vmi); - validate_mm(mm); - - return 0; -} - /* * If the vma has a ->close operation then the driver probably needs to release * per-vma resources, so we don't attempt to merge those. @@ -993,7 +866,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, * It is important for case 8 that the vma NNNN overlapping the * region AAAA is never going to extended over XXXX. Instead XXXX must * be extended in region AAAA and NNNN must be removed. This way in - * all cases where vma_merge succeeds, the moment vma_adjust drops the + * all cases where vma_merge succeeds, the moment vma_merge drops the * rmap_locks, the properties of the merged vma will be already * correct for the whole merged range. Some of those properties like * vm_page_prot/vm_flags may be accessed by rmap_walks and they must @@ -1003,6 +876,12 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, * or other rmap walkers (if working on addresses beyond the "end" * parameter) may establish ptes with the wrong permissions of NNNN * instead of the right permissions of XXXX. + * + * In the code below: + * PPPP is represented by *prev + * NNNN is represented by *mid (and possibly equal to *next) + * XXXX is represented by *next or not represented at all. + * AAAA is not represented - it will be merged or the function will return NULL */ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, struct vm_area_struct *prev, unsigned long addr, @@ -1013,11 +892,19 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, struct anon_vma_name *anon_name) { pgoff_t pglen = (end - addr) >> PAGE_SHIFT; + pgoff_t vma_pgoff; struct vm_area_struct *mid, *next, *res = NULL; + struct vm_area_struct *vma, *adjust, *remove, *remove2; int err = -1; bool merge_prev = false; bool merge_next = false; + bool vma_expanded = false; + struct vma_prepare vp; + unsigned long vma_end = end; + long adj_next = 0; + unsigned long vma_start = addr; + validate_mm(mm); /* * We later require that vma->vm_flags == vm_flags, * so this tests vma->vm_flags & VM_SPECIAL, too. @@ -1035,13 +922,17 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, VM_WARN_ON(mid && end > mid->vm_end); VM_WARN_ON(addr >= end); - /* Can we merge the predecessor? */ - if (prev && prev->vm_end == addr && - mpol_equal(vma_policy(prev), policy) && - can_vma_merge_after(prev, vm_flags, - anon_vma, file, pgoff, - vm_userfaultfd_ctx, anon_name)) { - merge_prev = true; + if (prev) { + res = prev; + vma = prev; + vma_start = prev->vm_start; + vma_pgoff = prev->vm_pgoff; + /* Can we merge the predecessor? */ + if (prev->vm_end == addr && mpol_equal(vma_policy(prev), policy) + && can_vma_merge_after(prev, vm_flags, anon_vma, file, + pgoff, vm_userfaultfd_ctx, anon_name)) { + merge_prev = true; + } } /* Can we merge the successor? */ if (next && end == next->vm_start && @@ -1051,32 +942,85 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, vm_userfaultfd_ctx, anon_name)) { merge_next = true; } + + remove = remove2 = adjust = NULL; /* Can we merge both the predecessor and the successor? */ if (merge_prev && merge_next && - is_mergeable_anon_vma(prev->anon_vma, - next->anon_vma, NULL)) { /* cases 1, 6 */ - err = __vma_adjust(vmi, prev, prev->vm_start, - next->vm_end, prev->vm_pgoff, prev); - res = prev; - } else if (merge_prev) { /* cases 2, 5, 7 */ - err = __vma_adjust(vmi, prev, prev->vm_start, - end, prev->vm_pgoff, prev); - res = prev; + is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { + remove = mid; /* case 1 */ + vma_end = next->vm_end; + err = dup_anon_vma(res, remove); + if (mid != next) { /* case 6 */ + remove2 = next; + if (!remove->anon_vma) + err = dup_anon_vma(res, remove2); + } + } else if (merge_prev) { + err = 0; /* case 2 */ + if (mid && end > mid->vm_start) { + err = dup_anon_vma(res, mid); + if (end == mid->vm_end) { /* case 7 */ + remove = mid; + } else { /* case 5 */ + adjust = mid; + adj_next = (end - mid->vm_start); + } + } } else if (merge_next) { - if (prev && addr < prev->vm_end) /* case 4 */ - err = __vma_adjust(vmi, prev, prev->vm_start, - addr, prev->vm_pgoff, next); - else /* cases 3, 8 */ - err = __vma_adjust(vmi, mid, addr, next->vm_end, - next->vm_pgoff - pglen, next); res = next; + if (prev && addr < prev->vm_end) { /* case 4 */ + vma_end = addr; + adjust = mid; + adj_next = -(vma->vm_end - addr); + err = dup_anon_vma(res, adjust); + } else { + vma = next; /* case 3 */ + vma_start = addr; + vma_end = next->vm_end; + vma_pgoff = next->vm_pgoff; + err = 0; + if (mid != next) { /* case 8 */ + remove = mid; + err = dup_anon_vma(res, remove); + } + } } - /* - * Cannot merge with predecessor or successor or error in __vma_adjust? - */ + /* Cannot merge or error in anon_vma clone */ if (err) return NULL; + + if (vma_iter_prealloc(vmi)) + return NULL; + + vma_adjust_trans_huge(vma, vma_start, vma_end, adj_next); + init_multi_vma_prep(&vp, vma, adjust, remove, remove2); + VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma && + vp.anon_vma != adjust->anon_vma); + + vma_prepare(&vp); + if (vma_start < vma->vm_start || vma_end > vma->vm_end) + vma_expanded = true; + + vma->vm_start = vma_start; + vma->vm_end = vma_end; + vma->vm_pgoff = vma_pgoff; + + if (vma_expanded) + vma_iter_store(vmi, vma); + + if (adj_next) { + adjust->vm_start += adj_next; + adjust->vm_pgoff += adj_next >> PAGE_SHIFT; + if (adj_next < 0) { + WARN_ON(vma_expanded); + vma_iter_store(vmi, next); + } + } + + vma_complete(&vp, vmi, mm); + vma_iter_free(vmi); + validate_mm(mm); khugepaged_enter_vma(res, vm_flags); if (res) diff --git a/mm/rmap.c b/mm/rmap.c index e143ab8ed5e22..8287f2cc327d3 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -262,11 +262,12 @@ static inline void unlock_anon_vma_root(struct anon_vma *root) * Attach the anon_vmas from src to dst. * Returns 0 on success, -ENOMEM on failure. * - * anon_vma_clone() is called by __vma_adjust(), __split_vma(), copy_vma() and - * anon_vma_fork(). The first three want an exact copy of src, while the last - * one, anon_vma_fork(), may try to reuse an existing anon_vma to prevent - * endless growth of anon_vma. Since dst->anon_vma is set to NULL before call, - * we can identify this case by checking (!dst->anon_vma && src->anon_vma). + * anon_vma_clone() is called by vma_expand(), vma_merge(), __split_vma(), + * copy_vma() and anon_vma_fork(). The first four want an exact copy of src, + * while the last one, anon_vma_fork(), may try to reuse an existing anon_vma to + * prevent endless growth of anon_vma. Since dst->anon_vma is set to NULL before + * call, we can identify this case by checking (!dst->anon_vma && + * src->anon_vma). * * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find * and reuse existing anon_vma which has no vmas and only one child anon_vma. @@ -1253,7 +1254,7 @@ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, __lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr); if (likely(!folio_test_ksm(folio))) { - /* address might be in next vma when migration races vma_adjust */ + /* address might be in next vma when migration races vma_merge */ if (first) __page_set_anon_rmap(folio, page, vma, address, !!(flags & RMAP_EXCLUSIVE)); @@ -2524,7 +2525,7 @@ void hugepage_add_anon_rmap(struct page *page, struct vm_area_struct *vma, BUG_ON(!folio_test_locked(folio)); BUG_ON(!anon_vma); - /* address might be in next vma when migration races vma_adjust */ + /* address might be in next vma when migration races vma_merge */ first = atomic_inc_and_test(&folio->_entire_mapcount); VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page); VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page); From 3c202d5cb9382902da99bfd0df90fff2ea416a8a Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 30 Jan 2023 14:57:13 -0500 Subject: [PATCH 077/143] mm/mmap: fix vma_merge() offset when expanding the next vma The vm_pgoff was being set incorrectly when expanding the next VMA to a lower address. Fix the issue by using the mid->vm_pgoff value for this merge case (aka case 8). Note that this does not change case 3's vm_pgoff as next and mid are the same VMA. Link: https://lkml.kernel.org/r/20230130195713.2881766-1-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Reported-by: Sanan Hasanov Link: https://lore.kernel.org/linux-mm/IA1PR07MB983017D2FBA174D2FF78CEB6ABCE9@IA1PR07MB9830.namprd07.prod.outlook.com/ Acked-by: David Hildenbrand Cc: Matthew Wilcox Cc: Hugh Dickins Signed-off-by: Andrew Morton --- mm/mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mmap.c b/mm/mmap.c index fb48cf14cb4ad..8ce4cee42dce1 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -977,7 +977,7 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, vma = next; /* case 3 */ vma_start = addr; vma_end = next->vm_end; - vma_pgoff = next->vm_pgoff; + vma_pgoff = mid->vm_pgoff; err = 0; if (mid != next) { /* case 8 */ remove = mid; From dfc235d633eaa2adf47b4b2f260149381836410c Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:50 -0500 Subject: [PATCH 078/143] vma_merge: set vma iterator to correct position. When merging the previous value, set the vma iterator to the previous slot. Don't use the vma iterator to get the next/prev so that it is in the correct position for a write. Link: https://lkml.kernel.org/r/20230120162650.984577-50-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 8ce4cee42dce1..b698a96d05116 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -932,6 +932,7 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, && can_vma_merge_after(prev, vm_flags, anon_vma, file, pgoff, vm_userfaultfd_ctx, anon_name)) { merge_prev = true; + vma_prev(vmi); } } /* Can we merge the successor? */ @@ -1023,9 +1024,6 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, validate_mm(mm); khugepaged_enter_vma(res, vm_flags); - if (res) - vma_iter_set(vmi, end); - return res; } From 132b39ac5c7ac9f34f3a6e8781a412c4dc58a864 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 26 Jan 2023 13:51:14 -0800 Subject: [PATCH 079/143] dmapool: add alloc/free performance test Patch series "dmapool enhancements", v4. Time spent in dma_pool alloc/free increases linearly with the number of pages backing the pool. We can reduce this to constant time with minor changes to how free pages are tracked. This patch (of 12): Provide a module that allocates and frees many blocks of various sizes and report how long it takes. This is intended to provide a consistent way to measure how changes to the dma_pool_alloc/free routines affect timing. Link: https://lkml.kernel.org/r/20230126215125.4069751-1-kbusch@meta.com Link: https://lkml.kernel.org/r/20230126215125.4069751-2-kbusch@meta.com Signed-off-by: Keith Busch Cc: Christoph Hellwig Cc: Matthew Wilcox Cc: Tony Battersby Signed-off-by: Andrew Morton --- mm/Kconfig | 9 +++ mm/Makefile | 1 + mm/dmapool_test.c | 147 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 157 insertions(+) create mode 100644 mm/dmapool_test.c diff --git a/mm/Kconfig b/mm/Kconfig index 4751031f3f052..ca98b2072df5c 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1100,6 +1100,15 @@ comment "GUP_TEST needs to have DEBUG_FS enabled" config GUP_GET_PXX_LOW_HIGH bool +config DMAPOOL_TEST + tristate "Enable a module to run time tests on dma_pool" + depends on HAS_DMA + help + Provides a test module that will allocate and free many blocks of + various sizes and report how long it takes. This is intended to + provide a consistent way to measure how changes to the + dma_pool_alloc/free routines affect performance. + config ARCH_HAS_PTE_SPECIAL bool diff --git a/mm/Makefile b/mm/Makefile index 8e105e5b3e293..3a08f5d7b1782 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -103,6 +103,7 @@ obj-$(CONFIG_MEMCG) += swap_cgroup.o endif obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o obj-$(CONFIG_GUP_TEST) += gup_test.o +obj-$(CONFIG_DMAPOOL_TEST) += dmapool_test.o obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o diff --git a/mm/dmapool_test.c b/mm/dmapool_test.c new file mode 100644 index 0000000000000..370fb9e209eff --- /dev/null +++ b/mm/dmapool_test.c @@ -0,0 +1,147 @@ +#include +#include +#include +#include +#include +#include +#include + +#define NR_TESTS (100) + +struct dma_pool_pair { + dma_addr_t dma; + void *v; +}; + +struct dmapool_parms { + size_t size; + size_t align; + size_t boundary; +}; + +static const struct dmapool_parms pool_parms[] = { + { .size = 16, .align = 16, .boundary = 0 }, + { .size = 64, .align = 64, .boundary = 0 }, + { .size = 256, .align = 256, .boundary = 0 }, + { .size = 1024, .align = 1024, .boundary = 0 }, + { .size = 4096, .align = 4096, .boundary = 0 }, + { .size = 68, .align = 32, .boundary = 4096 }, +}; + +static struct dma_pool *pool; +static struct device test_dev; +static u64 dma_mask; + +static inline int nr_blocks(int size) +{ + return clamp_t(int, (PAGE_SIZE / size) * 512, 1024, 8192); +} + +static int dmapool_test_alloc(struct dma_pool_pair *p, int blocks) +{ + int i; + + for (i = 0; i < blocks; i++) { + p[i].v = dma_pool_alloc(pool, GFP_KERNEL, + &p[i].dma); + if (!p[i].v) + goto pool_fail; + } + + for (i = 0; i < blocks; i++) + dma_pool_free(pool, p[i].v, p[i].dma); + + return 0; + +pool_fail: + for (--i; i >= 0; i--) + dma_pool_free(pool, p[i].v, p[i].dma); + return -ENOMEM; +} + +static int dmapool_test_block(const struct dmapool_parms *parms) +{ + int blocks = nr_blocks(parms->size); + ktime_t start_time, end_time; + struct dma_pool_pair *p; + int i, ret; + + p = kcalloc(blocks, sizeof(*p), GFP_KERNEL); + if (!p) + return -ENOMEM; + + pool = dma_pool_create("test pool", &test_dev, parms->size, + parms->align, parms->boundary); + if (!pool) { + ret = -ENOMEM; + goto free_pairs; + } + + start_time = ktime_get(); + for (i = 0; i < NR_TESTS; i++) { + ret = dmapool_test_alloc(p, blocks); + if (ret) + goto free_pool; + if (need_resched()) + cond_resched(); + } + end_time = ktime_get(); + + printk("dmapool test: size:%-4zu align:%-4zu blocks:%-4d time:%llu\n", + parms->size, parms->align, blocks, + ktime_us_delta(end_time, start_time)); + +free_pool: + dma_pool_destroy(pool); +free_pairs: + kfree(p); + return ret; +} + +static void dmapool_test_release(struct device *dev) +{ +} + +static int dmapool_checks(void) +{ + int i, ret; + + ret = dev_set_name(&test_dev, "dmapool-test"); + if (ret) + return ret; + + ret = device_register(&test_dev); + if (ret) { + printk("%s: register failed:%d\n", __func__, ret); + goto put_device; + } + + test_dev.release = dmapool_test_release; + set_dma_ops(&test_dev, NULL); + test_dev.dma_mask = &dma_mask; + ret = dma_set_mask_and_coherent(&test_dev, DMA_BIT_MASK(64)); + if (ret) { + printk("%s: mask failed:%d\n", __func__, ret); + goto del_device; + } + + for (i = 0; i < ARRAY_SIZE(pool_parms); i++) { + ret = dmapool_test_block(&pool_parms[i]); + if (ret) + break; + } + +del_device: + device_del(&test_dev); +put_device: + put_device(&test_dev); + return ret; +} + +static void dmapool_exit(void) +{ +} + +module_init(dmapool_checks); +module_exit(dmapool_exit); +MODULE_LICENSE("GPL"); From 02f44680a35a091821053c4d5be80caace3cb423 Mon Sep 17 00:00:00 2001 From: Tony Battersby Date: Thu, 26 Jan 2023 13:51:15 -0800 Subject: [PATCH 080/143] dmapool: remove checks for dev == NULL dmapool originally tried to support pools without a device because dma_alloc_coherent() supports allocations without a device. But nobody ended up using dma pools without a device, and trying to do so will result in an oops. So remove the checks for pool->dev == NULL since they are unneeded bloat. [kbusch@kernel.org: add check for null dev on create] Link: https://lkml.kernel.org/r/20230126215125.4069751-3-kbusch@meta.com Signed-off-by: Tony Battersby Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/dmapool.c | 45 ++++++++++++++------------------------------- 1 file changed, 14 insertions(+), 31 deletions(-) diff --git a/mm/dmapool.c b/mm/dmapool.c index a7eb5d0eb2da7..559207e1c3339 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -134,6 +134,9 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, size_t allocation; bool empty = false; + if (!dev) + return NULL; + if (align == 0) align = 1; else if (align & (align - 1)) @@ -275,7 +278,7 @@ void dma_pool_destroy(struct dma_pool *pool) mutex_lock(&pools_reg_lock); mutex_lock(&pools_lock); list_del(&pool->pools); - if (pool->dev && list_empty(&pool->dev->dma_pools)) + if (list_empty(&pool->dev->dma_pools)) empty = true; mutex_unlock(&pools_lock); if (empty) @@ -284,12 +287,8 @@ void dma_pool_destroy(struct dma_pool *pool) list_for_each_entry_safe(page, tmp, &pool->page_list, page_list) { if (is_page_busy(page)) { - if (pool->dev) - dev_err(pool->dev, "%s %s, %p busy\n", __func__, - pool->name, page->vaddr); - else - pr_err("%s %s, %p busy\n", __func__, - pool->name, page->vaddr); + dev_err(pool->dev, "%s %s, %p busy\n", __func__, + pool->name, page->vaddr); /* leak the still-in-use consistent memory */ list_del(&page->page_list); kfree(page); @@ -351,12 +350,8 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, for (i = sizeof(page->offset); i < pool->size; i++) { if (data[i] == POOL_POISON_FREED) continue; - if (pool->dev) - dev_err(pool->dev, "%s %s, %p (corrupted)\n", - __func__, pool->name, retval); - else - pr_err("%s %s, %p (corrupted)\n", - __func__, pool->name, retval); + dev_err(pool->dev, "%s %s, %p (corrupted)\n", + __func__, pool->name, retval); /* * Dump the first 4 bytes even if they are not @@ -411,12 +406,8 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) page = pool_find_page(pool, dma); if (!page) { spin_unlock_irqrestore(&pool->lock, flags); - if (pool->dev) - dev_err(pool->dev, "%s %s, %p/%pad (bad dma)\n", - __func__, pool->name, vaddr, &dma); - else - pr_err("%s %s, %p/%pad (bad dma)\n", - __func__, pool->name, vaddr, &dma); + dev_err(pool->dev, "%s %s, %p/%pad (bad dma)\n", + __func__, pool->name, vaddr, &dma); return; } @@ -426,12 +417,8 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) #ifdef DMAPOOL_DEBUG if ((dma - page->dma) != offset) { spin_unlock_irqrestore(&pool->lock, flags); - if (pool->dev) - dev_err(pool->dev, "%s %s, %p (bad vaddr)/%pad\n", - __func__, pool->name, vaddr, &dma); - else - pr_err("%s %s, %p (bad vaddr)/%pad\n", - __func__, pool->name, vaddr, &dma); + dev_err(pool->dev, "%s %s, %p (bad vaddr)/%pad\n", + __func__, pool->name, vaddr, &dma); return; } { @@ -442,12 +429,8 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) continue; } spin_unlock_irqrestore(&pool->lock, flags); - if (pool->dev) - dev_err(pool->dev, "%s %s, dma %pad already free\n", - __func__, pool->name, &dma); - else - pr_err("%s %s, dma %pad already free\n", - __func__, pool->name, &dma); + dev_err(pool->dev, "%s %s, dma %pad already free\n", + __func__, pool->name, &dma); return; } } From 490e678ed82af2459ebcabac6ca6aa90f7f570a0 Mon Sep 17 00:00:00 2001 From: Tony Battersby Date: Thu, 26 Jan 2023 13:51:16 -0800 Subject: [PATCH 081/143] dmapool: use sysfs_emit() instead of scnprintf() Use sysfs_emit instead of scnprintf, snprintf or sprintf. Link: https://lkml.kernel.org/r/20230126215125.4069751-4-kbusch@meta.com Signed-off-by: Tony Battersby Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/dmapool.c | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/mm/dmapool.c b/mm/dmapool.c index 559207e1c3339..20616b760bb9c 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -64,18 +64,11 @@ static DEFINE_MUTEX(pools_reg_lock); static ssize_t pools_show(struct device *dev, struct device_attribute *attr, char *buf) { - unsigned temp; - unsigned size; - char *next; + int size; struct dma_page *page; struct dma_pool *pool; - next = buf; - size = PAGE_SIZE; - - temp = scnprintf(next, size, "poolinfo - 0.1\n"); - size -= temp; - next += temp; + size = sysfs_emit(buf, "poolinfo - 0.1\n"); mutex_lock(&pools_lock); list_for_each_entry(pool, &dev->dma_pools, pools) { @@ -90,16 +83,14 @@ static ssize_t pools_show(struct device *dev, struct device_attribute *attr, cha spin_unlock_irq(&pool->lock); /* per-pool info, no real statistics yet */ - temp = scnprintf(next, size, "%-16s %4u %4zu %4zu %2u\n", - pool->name, blocks, - pages * (pool->allocation / pool->size), - pool->size, pages); - size -= temp; - next += temp; + size += sysfs_emit_at(buf, size, "%-16s %4u %4zu %4zu %2u\n", + pool->name, blocks, + pages * (pool->allocation / pool->size), + pool->size, pages); } mutex_unlock(&pools_lock); - return PAGE_SIZE - size; + return size; } static DEVICE_ATTR_RO(pools); From bd0444e88a923743d1621c1f4192d96af3a9d9b8 Mon Sep 17 00:00:00 2001 From: Tony Battersby Date: Thu, 26 Jan 2023 13:51:17 -0800 Subject: [PATCH 082/143] dmapool: cleanup integer types To represent the size of a single allocation, dmapool currently uses 'unsigned int' in some places and 'size_t' in other places. Standardize on 'unsigned int' to reduce overhead, but use 'size_t' when counting all the blocks in the entire pool. Link: https://lkml.kernel.org/r/20230126215125.4069751-5-kbusch@meta.com Signed-off-by: Tony Battersby Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/dmapool.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/mm/dmapool.c b/mm/dmapool.c index 20616b760bb9c..ee993bb59fc27 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -43,10 +43,10 @@ struct dma_pool { /* the pool */ struct list_head page_list; spinlock_t lock; - size_t size; struct device *dev; - size_t allocation; - size_t boundary; + unsigned int size; + unsigned int allocation; + unsigned int boundary; char name[32]; struct list_head pools; }; @@ -73,7 +73,7 @@ static ssize_t pools_show(struct device *dev, struct device_attribute *attr, cha mutex_lock(&pools_lock); list_for_each_entry(pool, &dev->dma_pools, pools) { unsigned pages = 0; - unsigned blocks = 0; + size_t blocks = 0; spin_lock_irq(&pool->lock); list_for_each_entry(page, &pool->page_list, page_list) { @@ -83,9 +83,10 @@ static ssize_t pools_show(struct device *dev, struct device_attribute *attr, cha spin_unlock_irq(&pool->lock); /* per-pool info, no real statistics yet */ - size += sysfs_emit_at(buf, size, "%-16s %4u %4zu %4zu %2u\n", + size += sysfs_emit_at(buf, size, "%-16s %4zu %4zu %4u %2u\n", pool->name, blocks, - pages * (pool->allocation / pool->size), + (size_t) pages * + (pool->allocation / pool->size), pool->size, pages); } mutex_unlock(&pools_lock); @@ -133,7 +134,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, else if (align & (align - 1)) return NULL; - if (size == 0) + if (size == 0 || size > INT_MAX) return NULL; else if (size < 4) size = 4; @@ -146,6 +147,8 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, else if ((boundary < size) || (boundary & (boundary - 1))) return NULL; + boundary = min(boundary, allocation); + retval = kmalloc(sizeof(*retval), GFP_KERNEL); if (!retval) return retval; @@ -306,7 +309,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, { unsigned long flags; struct dma_page *page; - size_t offset; + unsigned int offset; void *retval; might_alloc(mem_flags); From a45dca66446be01c5161830f80ae50ccd3083dca Mon Sep 17 00:00:00 2001 From: Tony Battersby Date: Thu, 26 Jan 2023 13:51:18 -0800 Subject: [PATCH 083/143] dmapool: speedup DMAPOOL_DEBUG with init_on_alloc Avoid double-memset of the same allocated memory in dma_pool_alloc() when both DMAPOOL_DEBUG is enabled and init_on_alloc=1. Link: https://lkml.kernel.org/r/20230126215125.4069751-6-kbusch@meta.com Signed-off-by: Tony Battersby Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/dmapool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/dmapool.c b/mm/dmapool.c index ee993bb59fc27..eaed3ffb42aa8 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -356,7 +356,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, break; } } - if (!(mem_flags & __GFP_ZERO)) + if (!want_init_on_alloc(mem_flags)) memset(retval, POOL_POISON_ALLOCATED, pool->size); #endif spin_unlock_irqrestore(&pool->lock, flags); From 0638d79ce07a03d841eb776841691ffe2c57511b Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 26 Jan 2023 13:51:19 -0800 Subject: [PATCH 084/143] dmapool: move debug code to own functions Clean up the normal path by moving the debug code outside it. Link: https://lkml.kernel.org/r/20230126215125.4069751-7-kbusch@meta.com Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Cc: Matthew Wilcox Cc: Tony Battersby Signed-off-by: Andrew Morton --- mm/dmapool.c | 128 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 77 insertions(+), 51 deletions(-) diff --git a/mm/dmapool.c b/mm/dmapool.c index eaed3ffb42aa8..30b069e999968 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -96,6 +96,78 @@ static ssize_t pools_show(struct device *dev, struct device_attribute *attr, cha static DEVICE_ATTR_RO(pools); +#ifdef DMAPOOL_DEBUG +static void pool_check_block(struct dma_pool *pool, void *retval, + unsigned int offset, gfp_t mem_flags) +{ + int i; + u8 *data = retval; + /* page->offset is stored in first 4 bytes */ + for (i = sizeof(offset); i < pool->size; i++) { + if (data[i] == POOL_POISON_FREED) + continue; + dev_err(pool->dev, "%s %s, %p (corrupted)\n", + __func__, pool->name, retval); + + /* + * Dump the first 4 bytes even if they are not + * POOL_POISON_FREED + */ + print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1, + data, pool->size, 1); + break; + } + if (!want_init_on_alloc(mem_flags)) + memset(retval, POOL_POISON_ALLOCATED, pool->size); +} + +static bool pool_page_err(struct dma_pool *pool, struct dma_page *page, + void *vaddr, dma_addr_t dma) +{ + unsigned int offset = vaddr - page->vaddr; + unsigned int chain = page->offset; + + if ((dma - page->dma) != offset) { + dev_err(pool->dev, "%s %s, %p (bad vaddr)/%pad\n", + __func__, pool->name, vaddr, &dma); + return true; + } + + while (chain < pool->allocation) { + if (chain != offset) { + chain = *(int *)(page->vaddr + chain); + continue; + } + dev_err(pool->dev, "%s %s, dma %pad already free\n", + __func__, pool->name, &dma); + return true; + } + memset(vaddr, POOL_POISON_FREED, pool->size); + return false; +} + +static void pool_init_page(struct dma_pool *pool, struct dma_page *page) +{ + memset(page->vaddr, POOL_POISON_FREED, pool->allocation); +} +#else +static void pool_check_block(struct dma_pool *pool, void *retval, + unsigned int offset, gfp_t mem_flags) + +{ +} + +static bool pool_page_err(struct dma_pool *pool, struct dma_page *page, + void *vaddr, dma_addr_t dma) +{ + return false; +} + +static void pool_init_page(struct dma_pool *pool, struct dma_page *page) +{ +} +#endif + /** * dma_pool_create - Creates a pool of consistent memory blocks, for dma. * @name: name of pool, for diagnostics @@ -223,9 +295,7 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags) page->vaddr = dma_alloc_coherent(pool->dev, pool->allocation, &page->dma, mem_flags); if (page->vaddr) { -#ifdef DMAPOOL_DEBUG - memset(page->vaddr, POOL_POISON_FREED, pool->allocation); -#endif + pool_init_page(pool, page); pool_initialise_page(pool, page); page->in_use = 0; page->offset = 0; @@ -245,9 +315,7 @@ static void pool_free_page(struct dma_pool *pool, struct dma_page *page) { dma_addr_t dma = page->dma; -#ifdef DMAPOOL_DEBUG - memset(page->vaddr, POOL_POISON_FREED, pool->allocation); -#endif + pool_init_page(pool, page); dma_free_coherent(pool->dev, pool->allocation, page->vaddr, dma); list_del(&page->page_list); kfree(page); @@ -336,29 +404,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, page->offset = *(int *)(page->vaddr + offset); retval = offset + page->vaddr; *handle = offset + page->dma; -#ifdef DMAPOOL_DEBUG - { - int i; - u8 *data = retval; - /* page->offset is stored in first 4 bytes */ - for (i = sizeof(page->offset); i < pool->size; i++) { - if (data[i] == POOL_POISON_FREED) - continue; - dev_err(pool->dev, "%s %s, %p (corrupted)\n", - __func__, pool->name, retval); - - /* - * Dump the first 4 bytes even if they are not - * POOL_POISON_FREED - */ - print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1, - data, pool->size, 1); - break; - } - } - if (!want_init_on_alloc(mem_flags)) - memset(retval, POOL_POISON_ALLOCATED, pool->size); -#endif + pool_check_block(pool, retval, offset, mem_flags); spin_unlock_irqrestore(&pool->lock, flags); if (want_init_on_alloc(mem_flags)) @@ -394,7 +440,6 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) { struct dma_page *page; unsigned long flags; - unsigned int offset; spin_lock_irqsave(&pool->lock, flags); page = pool_find_page(pool, dma); @@ -405,35 +450,16 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) return; } - offset = vaddr - page->vaddr; if (want_init_on_free()) memset(vaddr, 0, pool->size); -#ifdef DMAPOOL_DEBUG - if ((dma - page->dma) != offset) { + if (pool_page_err(pool, page, vaddr, dma)) { spin_unlock_irqrestore(&pool->lock, flags); - dev_err(pool->dev, "%s %s, %p (bad vaddr)/%pad\n", - __func__, pool->name, vaddr, &dma); return; } - { - unsigned int chain = page->offset; - while (chain < pool->allocation) { - if (chain != offset) { - chain = *(int *)(page->vaddr + chain); - continue; - } - spin_unlock_irqrestore(&pool->lock, flags); - dev_err(pool->dev, "%s %s, dma %pad already free\n", - __func__, pool->name, &dma); - return; - } - } - memset(vaddr, POOL_POISON_FREED, pool->size); -#endif page->in_use--; *(int *)vaddr = page->offset; - page->offset = offset; + page->offset = vaddr - page->vaddr; /* * Resist a temptation to do * if (!is_page_busy(page)) pool_free_page(pool, page); From d48f62895de1b49e5865d93c9ccf3c899e17d209 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 26 Jan 2023 13:51:20 -0800 Subject: [PATCH 085/143] dmapool: rearrange page alloc failure handling Handle the error in a condition so the good path can be in the normal flow. Link: https://lkml.kernel.org/r/20230126215125.4069751-8-kbusch@meta.com Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Cc: Matthew Wilcox Cc: Tony Battersby Signed-off-by: Andrew Morton --- mm/dmapool.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/mm/dmapool.c b/mm/dmapool.c index 30b069e999968..900f2afa363a9 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -292,17 +292,19 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags) page = kmalloc(sizeof(*page), mem_flags); if (!page) return NULL; + page->vaddr = dma_alloc_coherent(pool->dev, pool->allocation, &page->dma, mem_flags); - if (page->vaddr) { - pool_init_page(pool, page); - pool_initialise_page(pool, page); - page->in_use = 0; - page->offset = 0; - } else { + if (!page->vaddr) { kfree(page); - page = NULL; + return NULL; } + + pool_init_page(pool, page); + pool_initialise_page(pool, page); + page->in_use = 0; + page->offset = 0; + return page; } From d6c23cd766f29fad4f5d2887709eca002b650d94 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 26 Jan 2023 13:51:21 -0800 Subject: [PATCH 086/143] dmapool: consolidate page initialization Various fields of the dma pool are set in different places. Move it all to one function. Link: https://lkml.kernel.org/r/20230126215125.4069751-9-kbusch@meta.com Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Cc: Matthew Wilcox Cc: Tony Battersby Signed-off-by: Andrew Morton --- mm/dmapool.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/dmapool.c b/mm/dmapool.c index 900f2afa363a9..9e98065a68b1f 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -274,6 +274,9 @@ static void pool_initialise_page(struct dma_pool *pool, struct dma_page *page) unsigned int offset = 0; unsigned int next_boundary = pool->boundary; + pool_init_page(pool, page); + page->in_use = 0; + page->offset = 0; do { unsigned int next = offset + pool->size; if (unlikely((next + pool->size) >= next_boundary)) { @@ -300,11 +303,7 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags) return NULL; } - pool_init_page(pool, page); pool_initialise_page(pool, page); - page->in_use = 0; - page->offset = 0; - return page; } From a25755933c3f40e98c1376e1017a3b418868a517 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 26 Jan 2023 13:51:22 -0800 Subject: [PATCH 087/143] dmapool: simplify freeing The actions for busy and not busy are mostly the same, so combine these and remove the unnecessary function. Also, the pool is about to be freed so there's no need to poison the page data since we only check for poison on alloc, which can't be done on a freed pool. Link: https://lkml.kernel.org/r/20230126215125.4069751-10-kbusch@meta.com Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Cc: Matthew Wilcox Cc: Tony Battersby Signed-off-by: Andrew Morton --- mm/dmapool.c | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/mm/dmapool.c b/mm/dmapool.c index 9e98065a68b1f..4dea2a0dbd336 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -312,16 +312,6 @@ static inline bool is_page_busy(struct dma_page *page) return page->in_use != 0; } -static void pool_free_page(struct dma_pool *pool, struct dma_page *page) -{ - dma_addr_t dma = page->dma; - - pool_init_page(pool, page); - dma_free_coherent(pool->dev, pool->allocation, page->vaddr, dma); - list_del(&page->page_list); - kfree(page); -} - /** * dma_pool_destroy - destroys a pool of dma memory blocks. * @pool: dma pool that will be destroyed @@ -349,14 +339,14 @@ void dma_pool_destroy(struct dma_pool *pool) mutex_unlock(&pools_reg_lock); list_for_each_entry_safe(page, tmp, &pool->page_list, page_list) { - if (is_page_busy(page)) { + if (!is_page_busy(page)) + dma_free_coherent(pool->dev, pool->allocation, + page->vaddr, page->dma); + else dev_err(pool->dev, "%s %s, %p busy\n", __func__, pool->name, page->vaddr); - /* leak the still-in-use consistent memory */ - list_del(&page->page_list); - kfree(page); - } else - pool_free_page(pool, page); + list_del(&page->page_list); + kfree(page); } kfree(pool); From b03f5403aebf0015b9ee26750d9dde79efd20765 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 26 Jan 2023 13:51:23 -0800 Subject: [PATCH 088/143] dmapool: don't memset on free twice If debug is enabled, dmapool will poison the range, so no need to clear it to 0 immediately before writing over it. Link: https://lkml.kernel.org/r/20230126215125.4069751-11-kbusch@meta.com Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Cc: Matthew Wilcox Cc: Tony Battersby Signed-off-by: Andrew Morton --- mm/dmapool.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/dmapool.c b/mm/dmapool.c index 4dea2a0dbd336..21e6d362c7264 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -160,6 +160,8 @@ static void pool_check_block(struct dma_pool *pool, void *retval, static bool pool_page_err(struct dma_pool *pool, struct dma_page *page, void *vaddr, dma_addr_t dma) { + if (want_init_on_free()) + memset(vaddr, 0, pool->size); return false; } @@ -441,8 +443,6 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) return; } - if (want_init_on_free()) - memset(vaddr, 0, pool->size); if (pool_page_err(pool, page, vaddr, dma)) { spin_unlock_irqrestore(&pool->lock, flags); return; From 67f2a50b67713d70b7211018f6294617f4ebee05 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 26 Jan 2023 13:51:24 -0800 Subject: [PATCH 089/143] dmapool: link blocks across pages The allocated dmapool pages are never freed for the lifetime of the pool. There is no need for the two level list+stack lookup for finding a free block since nothing is ever removed from the list. Just use a simple stack, reducing time complexity to constant. The implementation inserts the stack linking elements and the dma handle of the block within itself when freed. This means the smallest possible dmapool block is increased to at most 16 bytes to accommodate these fields, but there are no exisiting users requesting a dma pool smaller than that anyway. Removing the list has a significant change in performance. Using the kernel's micro-benchmarking self test: Before: # modprobe dmapool_test dmapool test: size:16 blocks:8192 time:57282 dmapool test: size:64 blocks:8192 time:172562 dmapool test: size:256 blocks:8192 time:789247 dmapool test: size:1024 blocks:2048 time:371823 dmapool test: size:4096 blocks:1024 time:362237 After: # modprobe dmapool_test dmapool test: size:16 blocks:8192 time:24997 dmapool test: size:64 blocks:8192 time:26584 dmapool test: size:256 blocks:8192 time:33542 dmapool test: size:1024 blocks:2048 time:9022 dmapool test: size:4096 blocks:1024 time:6045 The module test allocates quite a few blocks that may not accurately represent how these pools are used in real life. For a more marco level benchmark, running fio high-depth + high-batched on nvme, this patch shows submission and completion latency reduced by ~100usec each, 1% IOPs improvement, and perf record's time spent in dma_pool_alloc/free were reduced by half. Link: https://lkml.kernel.org/r/20230126215125.4069751-12-kbusch@meta.com Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Cc: Matthew Wilcox Cc: Tony Battersby Signed-off-by: Andrew Morton --- mm/dmapool.c | 246 +++++++++++++++++++++++++-------------------------- 1 file changed, 119 insertions(+), 127 deletions(-) diff --git a/mm/dmapool.c b/mm/dmapool.c index 21e6d362c7264..bb8893b4f4b96 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -15,7 +15,7 @@ * represented by the 'struct dma_pool' which keeps a doubly-linked list of * allocated pages. Each page in the page_list is split into blocks of at * least 'size' bytes. Free blocks are tracked in an unsorted singly-linked - * list of free blocks within the page. Used blocks aren't tracked, but we + * list of free blocks across all pages. Used blocks aren't tracked, but we * keep a count of how many are currently allocated from each page. */ @@ -40,9 +40,18 @@ #define DMAPOOL_DEBUG 1 #endif +struct dma_block { + struct dma_block *next_block; + dma_addr_t dma; +}; + struct dma_pool { /* the pool */ struct list_head page_list; spinlock_t lock; + struct dma_block *next_block; + size_t nr_blocks; + size_t nr_active; + size_t nr_pages; struct device *dev; unsigned int size; unsigned int allocation; @@ -55,8 +64,6 @@ struct dma_page { /* cacheable header for 'allocation' bytes */ struct list_head page_list; void *vaddr; dma_addr_t dma; - unsigned int in_use; - unsigned int offset; }; static DEFINE_MUTEX(pools_lock); @@ -64,30 +71,18 @@ static DEFINE_MUTEX(pools_reg_lock); static ssize_t pools_show(struct device *dev, struct device_attribute *attr, char *buf) { - int size; - struct dma_page *page; struct dma_pool *pool; + unsigned size; size = sysfs_emit(buf, "poolinfo - 0.1\n"); mutex_lock(&pools_lock); list_for_each_entry(pool, &dev->dma_pools, pools) { - unsigned pages = 0; - size_t blocks = 0; - - spin_lock_irq(&pool->lock); - list_for_each_entry(page, &pool->page_list, page_list) { - pages++; - blocks += page->in_use; - } - spin_unlock_irq(&pool->lock); - /* per-pool info, no real statistics yet */ - size += sysfs_emit_at(buf, size, "%-16s %4zu %4zu %4u %2u\n", - pool->name, blocks, - (size_t) pages * - (pool->allocation / pool->size), - pool->size, pages); + size += sysfs_emit_at(buf, size, "%-16s %4zu %4zu %4u %2zu\n", + pool->name, pool->nr_active, + pool->nr_blocks, pool->size, + pool->nr_pages); } mutex_unlock(&pools_lock); @@ -97,17 +92,17 @@ static ssize_t pools_show(struct device *dev, struct device_attribute *attr, cha static DEVICE_ATTR_RO(pools); #ifdef DMAPOOL_DEBUG -static void pool_check_block(struct dma_pool *pool, void *retval, - unsigned int offset, gfp_t mem_flags) +static void pool_check_block(struct dma_pool *pool, struct dma_block *block, + gfp_t mem_flags) { + u8 *data = (void *)block; int i; - u8 *data = retval; - /* page->offset is stored in first 4 bytes */ - for (i = sizeof(offset); i < pool->size; i++) { + + for (i = sizeof(struct dma_block); i < pool->size; i++) { if (data[i] == POOL_POISON_FREED) continue; - dev_err(pool->dev, "%s %s, %p (corrupted)\n", - __func__, pool->name, retval); + dev_err(pool->dev, "%s %s, %p (corrupted)\n", __func__, + pool->name, block); /* * Dump the first 4 bytes even if they are not @@ -117,31 +112,46 @@ static void pool_check_block(struct dma_pool *pool, void *retval, data, pool->size, 1); break; } + if (!want_init_on_alloc(mem_flags)) - memset(retval, POOL_POISON_ALLOCATED, pool->size); + memset(block, POOL_POISON_ALLOCATED, pool->size); +} + +static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma) +{ + struct dma_page *page; + + list_for_each_entry(page, &pool->page_list, page_list) { + if (dma < page->dma) + continue; + if ((dma - page->dma) < pool->allocation) + return page; + } + return NULL; } -static bool pool_page_err(struct dma_pool *pool, struct dma_page *page, - void *vaddr, dma_addr_t dma) +static bool pool_block_err(struct dma_pool *pool, void *vaddr, dma_addr_t dma) { - unsigned int offset = vaddr - page->vaddr; - unsigned int chain = page->offset; + struct dma_block *block = pool->next_block; + struct dma_page *page; - if ((dma - page->dma) != offset) { - dev_err(pool->dev, "%s %s, %p (bad vaddr)/%pad\n", + page = pool_find_page(pool, dma); + if (!page) { + dev_err(pool->dev, "%s %s, %p/%pad (bad dma)\n", __func__, pool->name, vaddr, &dma); return true; } - while (chain < pool->allocation) { - if (chain != offset) { - chain = *(int *)(page->vaddr + chain); + while (block) { + if (block != vaddr) { + block = block->next_block; continue; } dev_err(pool->dev, "%s %s, dma %pad already free\n", __func__, pool->name, &dma); return true; } + memset(vaddr, POOL_POISON_FREED, pool->size); return false; } @@ -151,14 +161,12 @@ static void pool_init_page(struct dma_pool *pool, struct dma_page *page) memset(page->vaddr, POOL_POISON_FREED, pool->allocation); } #else -static void pool_check_block(struct dma_pool *pool, void *retval, - unsigned int offset, gfp_t mem_flags) - +static void pool_check_block(struct dma_pool *pool, struct dma_block *block, + gfp_t mem_flags) { } -static bool pool_page_err(struct dma_pool *pool, struct dma_page *page, - void *vaddr, dma_addr_t dma) +static bool pool_block_err(struct dma_pool *pool, void *vaddr, dma_addr_t dma) { if (want_init_on_free()) memset(vaddr, 0, pool->size); @@ -170,6 +178,26 @@ static void pool_init_page(struct dma_pool *pool, struct dma_page *page) } #endif +static struct dma_block *pool_block_pop(struct dma_pool *pool) +{ + struct dma_block *block = pool->next_block; + + if (block) { + pool->next_block = block->next_block; + pool->nr_active++; + } + return block; +} + +static void pool_block_push(struct dma_pool *pool, struct dma_block *block, + dma_addr_t dma) +{ + block->dma = dma; + block->next_block = pool->next_block; + pool->next_block = block; +} + + /** * dma_pool_create - Creates a pool of consistent memory blocks, for dma. * @name: name of pool, for diagnostics @@ -210,8 +238,8 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, if (size == 0 || size > INT_MAX) return NULL; - else if (size < 4) - size = 4; + if (size < sizeof(struct dma_block)) + size = sizeof(struct dma_block); size = ALIGN(size, align); allocation = max_t(size_t, size, PAGE_SIZE); @@ -223,7 +251,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, boundary = min(boundary, allocation); - retval = kmalloc(sizeof(*retval), GFP_KERNEL); + retval = kzalloc(sizeof(*retval), GFP_KERNEL); if (!retval) return retval; @@ -236,7 +264,6 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, retval->size = size; retval->boundary = boundary; retval->allocation = allocation; - INIT_LIST_HEAD(&retval->pools); /* @@ -273,21 +300,25 @@ EXPORT_SYMBOL(dma_pool_create); static void pool_initialise_page(struct dma_pool *pool, struct dma_page *page) { - unsigned int offset = 0; - unsigned int next_boundary = pool->boundary; + unsigned int next_boundary = pool->boundary, offset = 0; + struct dma_block *block; pool_init_page(pool, page); - page->in_use = 0; - page->offset = 0; - do { - unsigned int next = offset + pool->size; - if (unlikely((next + pool->size) >= next_boundary)) { - next = next_boundary; + while (offset + pool->size <= pool->allocation) { + if (offset + pool->size > next_boundary) { + offset = next_boundary; next_boundary += pool->boundary; + continue; } - *(int *)(page->vaddr + offset) = next; - offset = next; - } while (offset < pool->allocation); + + block = page->vaddr + offset; + pool_block_push(pool, block, page->dma + offset); + offset += pool->size; + pool->nr_blocks++; + } + + list_add(&page->page_list, &pool->page_list); + pool->nr_pages++; } static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags) @@ -305,15 +336,9 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags) return NULL; } - pool_initialise_page(pool, page); return page; } -static inline bool is_page_busy(struct dma_page *page) -{ - return page->in_use != 0; -} - /** * dma_pool_destroy - destroys a pool of dma memory blocks. * @pool: dma pool that will be destroyed @@ -325,7 +350,7 @@ static inline bool is_page_busy(struct dma_page *page) void dma_pool_destroy(struct dma_pool *pool) { struct dma_page *page, *tmp; - bool empty = false; + bool empty = false, busy = false; if (unlikely(!pool)) return; @@ -340,13 +365,15 @@ void dma_pool_destroy(struct dma_pool *pool) device_remove_file(pool->dev, &dev_attr_pools); mutex_unlock(&pools_reg_lock); + if (pool->nr_active) { + dev_err(pool->dev, "%s %s busy\n", __func__, pool->name); + busy = true; + } + list_for_each_entry_safe(page, tmp, &pool->page_list, page_list) { - if (!is_page_busy(page)) + if (!busy) dma_free_coherent(pool->dev, pool->allocation, page->vaddr, page->dma); - else - dev_err(pool->dev, "%s %s, %p busy\n", __func__, - pool->name, page->vaddr); list_del(&page->page_list); kfree(page); } @@ -368,58 +395,40 @@ EXPORT_SYMBOL(dma_pool_destroy); void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, dma_addr_t *handle) { - unsigned long flags; + struct dma_block *block; struct dma_page *page; - unsigned int offset; - void *retval; + unsigned long flags; might_alloc(mem_flags); spin_lock_irqsave(&pool->lock, flags); - list_for_each_entry(page, &pool->page_list, page_list) { - if (page->offset < pool->allocation) - goto ready; - } - - /* pool_alloc_page() might sleep, so temporarily drop &pool->lock */ - spin_unlock_irqrestore(&pool->lock, flags); - - page = pool_alloc_page(pool, mem_flags & (~__GFP_ZERO)); - if (!page) - return NULL; + block = pool_block_pop(pool); + if (!block) { + /* + * pool_alloc_page() might sleep, so temporarily drop + * &pool->lock + */ + spin_unlock_irqrestore(&pool->lock, flags); - spin_lock_irqsave(&pool->lock, flags); + page = pool_alloc_page(pool, mem_flags & (~__GFP_ZERO)); + if (!page) + return NULL; - list_add(&page->page_list, &pool->page_list); - ready: - page->in_use++; - offset = page->offset; - page->offset = *(int *)(page->vaddr + offset); - retval = offset + page->vaddr; - *handle = offset + page->dma; - pool_check_block(pool, retval, offset, mem_flags); + spin_lock_irqsave(&pool->lock, flags); + pool_initialise_page(pool, page); + block = pool_block_pop(pool); + } spin_unlock_irqrestore(&pool->lock, flags); + *handle = block->dma; + pool_check_block(pool, block, mem_flags); if (want_init_on_alloc(mem_flags)) - memset(retval, 0, pool->size); + memset(block, 0, pool->size); - return retval; + return block; } EXPORT_SYMBOL(dma_pool_alloc); -static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma) -{ - struct dma_page *page; - - list_for_each_entry(page, &pool->page_list, page_list) { - if (dma < page->dma) - continue; - if ((dma - page->dma) < pool->allocation) - return page; - } - return NULL; -} - /** * dma_pool_free - put block back into dma pool * @pool: the dma pool holding the block @@ -431,31 +440,14 @@ static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma) */ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) { - struct dma_page *page; + struct dma_block *block = vaddr; unsigned long flags; spin_lock_irqsave(&pool->lock, flags); - page = pool_find_page(pool, dma); - if (!page) { - spin_unlock_irqrestore(&pool->lock, flags); - dev_err(pool->dev, "%s %s, %p/%pad (bad dma)\n", - __func__, pool->name, vaddr, &dma); - return; + if (!pool_block_err(pool, vaddr, dma)) { + pool_block_push(pool, block, dma); + pool->nr_active--; } - - if (pool_page_err(pool, page, vaddr, dma)) { - spin_unlock_irqrestore(&pool->lock, flags); - return; - } - - page->in_use--; - *(int *)vaddr = page->offset; - page->offset = vaddr - page->vaddr; - /* - * Resist a temptation to do - * if (!is_page_busy(page)) pool_free_page(pool, page); - * Better have a few empty pages hang around. - */ spin_unlock_irqrestore(&pool->lock, flags); } EXPORT_SYMBOL(dma_pool_free); From 1a8840878a8f8b23b4ec088124b0b35b9190ab27 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 26 Jan 2023 13:51:25 -0800 Subject: [PATCH 090/143] dmapool: create/destroy cleanup Set the 'empty' bool directly from the result of the function that determines its value instead of adding additional logic. Link: https://lkml.kernel.org/r/20230126215125.4069751-13-kbusch@meta.com Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Cc: Matthew Wilcox Cc: Tony Battersby Signed-off-by: Andrew Morton --- mm/dmapool.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/mm/dmapool.c b/mm/dmapool.c index bb8893b4f4b96..1920890ff8d3d 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -226,7 +226,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, { struct dma_pool *retval; size_t allocation; - bool empty = false; + bool empty; if (!dev) return NULL; @@ -276,8 +276,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, */ mutex_lock(&pools_reg_lock); mutex_lock(&pools_lock); - if (list_empty(&dev->dma_pools)) - empty = true; + empty = list_empty(&dev->dma_pools); list_add(&retval->pools, &dev->dma_pools); mutex_unlock(&pools_lock); if (empty) { @@ -350,7 +349,7 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags) void dma_pool_destroy(struct dma_pool *pool) { struct dma_page *page, *tmp; - bool empty = false, busy = false; + bool empty, busy = false; if (unlikely(!pool)) return; @@ -358,8 +357,7 @@ void dma_pool_destroy(struct dma_pool *pool) mutex_lock(&pools_reg_lock); mutex_lock(&pools_lock); list_del(&pool->pools); - if (list_empty(&pool->dev->dma_pools)) - empty = true; + empty = list_empty(&pool->dev->dma_pools); mutex_unlock(&pools_lock); if (empty) device_remove_file(pool->dev, &dev_attr_pools); From 128d76601142cdc131d17861a4a9ab3e24f71ac0 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 26 Jan 2023 11:37:46 -0800 Subject: [PATCH 091/143] kernel/fork: convert vma assignment to a memcpy Patch series "introduce vm_flags modifier functions", v4. This patchset was originally published as a part of per-VMA locking [1] and was split after suggestion that it's viable on its own and to facilitate the review process. It is now a preprequisite for the next version of per-VMA lock patchset, which reuses vm_flags modifier functions to lock the VMA when vm_flags are being updated. VMA vm_flags modifications are usually done under exclusive mmap_lock protection because this attrubute affects other decisions like VMA merging or splitting and races should be prevented. Introduce vm_flags modifier functions to enforce correct locking. This patch (of 7): Convert vma assignment in vm_area_dup() to a memcpy() to prevent compiler errors when we add a const modifier to vma->vm_flags. Link: https://lkml.kernel.org/r/20230126193752.297968-1-surenb@google.com Link: https://lkml.kernel.org/r/20230126193752.297968-2-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Mel Gorman Acked-by: Mike Rapoport (IBM) Cc: Andy Lutomirski Cc: Arjun Roy Cc: Axel Rasmussen Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: David Rientjes Cc: Eric Dumazet Cc: Greg Thelen Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jann Horn Cc: Joel Fernandes Cc: Johannes Weiner Cc: Kent Overstreet Cc: Laurent Dufour Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Michal Hocko Cc: Minchan Kim Cc: Paul E. McKenney Cc: Peter Oskolkov Cc: Peter Xu Cc: Peter Zijlstra Cc: Punit Agrawal Cc: Sebastian Andrzej Siewior Cc: Shakeel Butt Cc: Soheil Hassas Yeganeh Cc: Song Liu Cc: Vlastimil Babka Cc: Will Deacon Cc: Sebastian Reichel Signed-off-by: Andrew Morton --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/fork.c b/kernel/fork.c index 441dcec60aae7..9260f975b8f44 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -472,7 +472,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) * orig->shared.rb may be modified concurrently, but the clone * will be reinitialized. */ - *new = data_race(*orig); + data_race(memcpy(new, orig, sizeof(*new))); INIT_LIST_HEAD(&new->anon_vma_chain); dup_anon_vma_name(orig, new); } From 139383b2d49d611725851230c99d1ac9063e612c Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 26 Jan 2023 11:37:47 -0800 Subject: [PATCH 092/143] mm: introduce vma->vm_flags wrapper functions vm_flags are among VMA attributes which affect decisions like VMA merging and splitting. Therefore all vm_flags modifications are performed after taking exclusive mmap_lock to prevent vm_flags updates racing with such operations. Introduce modifier functions for vm_flags to be used whenever flags are updated. This way we can better check and control correct locking behavior during these updates. Link: https://lkml.kernel.org/r/20230126193752.297968-3-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Davidlohr Bueso Acked-by: Michal Hocko Acked-by: Mel Gorman Acked-by: Mike Rapoport (IBM) Cc: Andy Lutomirski Cc: Arjun Roy Cc: Axel Rasmussen Cc: David Hildenbrand Cc: David Howells Cc: David Rientjes Cc: Eric Dumazet Cc: Greg Thelen Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jann Horn Cc: Joel Fernandes Cc: Johannes Weiner Cc: Kent Overstreet Cc: Laurent Dufour Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Minchan Kim Cc: Paul E. McKenney Cc: Peter Oskolkov Cc: Peter Xu Cc: Peter Zijlstra Cc: Punit Agrawal Cc: Sebastian Andrzej Siewior Cc: Sebastian Reichel Cc: Shakeel Butt Cc: Soheil Hassas Yeganeh Cc: Song Liu Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 40 ++++++++++++++++++++++++++++++++++++++++ include/linux/mm_types.h | 10 +++++++++- 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index a87cfab250868..a9b452a95d365 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -633,6 +633,46 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) INIT_LIST_HEAD(&vma->anon_vma_chain); } +/* Use when VMA is not part of the VMA tree and needs no locking */ +static inline void vm_flags_init(struct vm_area_struct *vma, + vm_flags_t flags) +{ + ACCESS_PRIVATE(vma, __vm_flags) = flags; +} + +/* Use when VMA is part of the VMA tree and modifications need coordination */ +static inline void vm_flags_reset(struct vm_area_struct *vma, + vm_flags_t flags) +{ + mmap_assert_write_locked(vma->vm_mm); + vm_flags_init(vma, flags); +} + +static inline void vm_flags_set(struct vm_area_struct *vma, + vm_flags_t flags) +{ + mmap_assert_write_locked(vma->vm_mm); + ACCESS_PRIVATE(vma, __vm_flags) |= flags; +} + +static inline void vm_flags_clear(struct vm_area_struct *vma, + vm_flags_t flags) +{ + mmap_assert_write_locked(vma->vm_mm); + ACCESS_PRIVATE(vma, __vm_flags) &= ~flags; +} + +/* + * Use only when the order of set/clear operations is unimportant, otherwise + * use vm_flags_{set|clear} explicitly. + */ +static inline void vm_flags_mod(struct vm_area_struct *vma, + vm_flags_t set, vm_flags_t clear) +{ + mmap_assert_write_locked(vma->vm_mm); + vm_flags_init(vma, (vma->vm_flags | set) & ~clear); +} + static inline void vma_set_anonymous(struct vm_area_struct *vma) { vma->vm_ops = NULL; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 2d6d790d9bed8..da983aedb7413 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -491,7 +491,15 @@ struct vm_area_struct { * See vmf_insert_mixed_prot() for discussion. */ pgprot_t vm_page_prot; - unsigned long vm_flags; /* Flags, see mm.h. */ + + /* + * Flags, see mm.h. + * To modify use vm_flags_{init|reset|set|clear|mod} functions. + */ + union { + const vm_flags_t vm_flags; + vm_flags_t __private __vm_flags; + }; /* * For areas with an address space and backing store, From 96801704e51f5c2208fd9e09e6415b391b45c1b6 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 26 Jan 2023 11:37:48 -0800 Subject: [PATCH 093/143] mm: replace VM_LOCKED_CLEAR_MASK with VM_LOCKED_MASK To simplify the usage of VM_LOCKED_CLEAR_MASK in vm_flags_clear(), replace it with VM_LOCKED_MASK bitmask and convert all users. Link: https://lkml.kernel.org/r/20230126193752.297968-4-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Michal Hocko Acked-by: Mel Gorman Acked-by: Mike Rapoport (IBM) Reviewed-by: Davidlohr Bueso Cc: Andy Lutomirski Cc: Arjun Roy Cc: Axel Rasmussen Cc: David Hildenbrand Cc: David Howells Cc: David Rientjes Cc: Eric Dumazet Cc: Greg Thelen Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jann Horn Cc: Joel Fernandes Cc: Johannes Weiner Cc: Kent Overstreet Cc: Laurent Dufour Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Minchan Kim Cc: Paul E. McKenney Cc: Peter Oskolkov Cc: Peter Xu Cc: Peter Zijlstra Cc: Punit Agrawal Cc: Sebastian Andrzej Siewior Cc: Sebastian Reichel Cc: Shakeel Butt Cc: Soheil Hassas Yeganeh Cc: Song Liu Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 4 ++-- kernel/fork.c | 2 +- mm/hugetlb.c | 4 ++-- mm/mlock.c | 6 +++--- mm/mmap.c | 6 +++--- mm/mremap.c | 2 +- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index a9b452a95d365..fa952ca3b8ed0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -427,8 +427,8 @@ extern unsigned int kobjsize(const void *objp); /* This mask defines which mm->def_flags a process can inherit its parent */ #define VM_INIT_DEF_MASK VM_NOHUGEPAGE -/* This mask is used to clear all the VMA flags used by mlock */ -#define VM_LOCKED_CLEAR_MASK (~(VM_LOCKED | VM_LOCKONFAULT)) +/* This mask represents all the VMA flag bits used by mlock */ +#define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT) /* Arch-specific flags to clear when updating VM flags on protection change */ #ifndef VM_ARCH_CLEAR diff --git a/kernel/fork.c b/kernel/fork.c index 9260f975b8f44..5e3029ea8e1ec 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -659,7 +659,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, tmp->anon_vma = NULL; } else if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; - tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); + vm_flags_clear(tmp, VM_LOCKED_MASK); file = tmp->vm_file; if (file) { struct address_space *mapping = file->f_mapping; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b6cbba105ffc0..3a01a9dbf445a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6970,8 +6970,8 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma, unsigned long s_end = sbase + PUD_SIZE; /* Allow segments to share if only one is marked locked */ - unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; - unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK; + unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED_MASK; + unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED_MASK; /* * match the virtual addresses, permission and the alignment of the diff --git a/mm/mlock.c b/mm/mlock.c index 0336f52e03d75..5c4fff93cd6bc 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -497,7 +497,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, if (vma->vm_start != tmp) return -ENOMEM; - newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; + newflags = vma->vm_flags & ~VM_LOCKED_MASK; newflags |= flags; /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ tmp = vma->vm_end; @@ -661,7 +661,7 @@ static int apply_mlockall_flags(int flags) struct vm_area_struct *vma, *prev = NULL; vm_flags_t to_add = 0; - current->mm->def_flags &= VM_LOCKED_CLEAR_MASK; + current->mm->def_flags &= ~VM_LOCKED_MASK; if (flags & MCL_FUTURE) { current->mm->def_flags |= VM_LOCKED; @@ -681,7 +681,7 @@ static int apply_mlockall_flags(int flags) for_each_vma(vmi, vma) { vm_flags_t newflags; - newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; + newflags = vma->vm_flags & ~VM_LOCKED_MASK; newflags |= to_add; /* Ignore errors */ diff --git a/mm/mmap.c b/mm/mmap.c index b698a96d05116..03d7c37c59691 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2668,7 +2668,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm)) - vma->vm_flags &= VM_LOCKED_CLEAR_MASK; + vm_flags_clear(vma, VM_LOCKED_MASK); else mm->locked_vm += (len >> PAGE_SHIFT); } @@ -3338,8 +3338,8 @@ static struct vm_area_struct *__install_special_mapping( vma->vm_start = addr; vma->vm_end = addr + len; - vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY; - vma->vm_flags &= VM_LOCKED_CLEAR_MASK; + vm_flags_init(vma, (vm_flags | mm->def_flags | + VM_DONTEXPAND | VM_SOFTDIRTY) & ~VM_LOCKED_MASK); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); vma->vm_ops = ops; diff --git a/mm/mremap.c b/mm/mremap.c index 5c9a579098628..d70d8063c6e20 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -687,7 +687,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) { /* We always clear VM_LOCKED[ONFAULT] on the old vma */ - vma->vm_flags &= VM_LOCKED_CLEAR_MASK; + vm_flags_clear(vma, VM_LOCKED_MASK); /* * anon_vma links of the old vma is no longer needed after its page From 81ce59e4e177f1e15243f13325c86359a35f7f84 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 26 Jan 2023 11:37:49 -0800 Subject: [PATCH 094/143] mm: replace vma->vm_flags direct modifications with modifier calls Replace direct modifications to vma->vm_flags with calls to modifier functions to be able to track flag changes and to keep vma locking correctness. Link: https://lkml.kernel.org/r/20230126193752.297968-5-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Michal Hocko Acked-by: Mel Gorman Acked-by: Mike Rapoport (IBM) Acked-by: Sebastian Reichel Reviewed-by: Liam R. Howlett Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Andy Lutomirski Cc: Arjun Roy Cc: Axel Rasmussen Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: David Rientjes Cc: Eric Dumazet Cc: Greg Thelen Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jann Horn Cc: Joel Fernandes Cc: Johannes Weiner Cc: Kent Overstreet Cc: Laurent Dufour Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Minchan Kim Cc: Paul E. McKenney Cc: Peter Oskolkov Cc: Peter Xu Cc: Peter Zijlstra Cc: Punit Agrawal Cc: Sebastian Andrzej Siewior Cc: Shakeel Butt Cc: Soheil Hassas Yeganeh Cc: Song Liu Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm/kernel/process.c | 2 +- arch/ia64/mm/init.c | 8 ++++---- arch/loongarch/include/asm/tlb.h | 2 +- arch/powerpc/kvm/book3s_xive_native.c | 2 +- arch/powerpc/mm/book3s64/subpage_prot.c | 2 +- arch/powerpc/platforms/book3s/vas-api.c | 2 +- arch/powerpc/platforms/cell/spufs/file.c | 14 +++++++------- arch/s390/mm/gmap.c | 3 +-- arch/x86/entry/vsyscall/vsyscall_64.c | 2 +- arch/x86/kernel/cpu/sgx/driver.c | 2 +- arch/x86/kernel/cpu/sgx/virt.c | 2 +- arch/x86/mm/pat/memtype.c | 6 +++--- arch/x86/um/mem_32.c | 2 +- drivers/acpi/pfr_telemetry.c | 2 +- drivers/android/binder.c | 3 +-- drivers/char/mspec.c | 2 +- drivers/crypto/hisilicon/qm.c | 2 +- drivers/dax/device.c | 2 +- drivers/dma/idxd/cdev.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 4 ++-- drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c | 4 ++-- drivers/gpu/drm/amd/amdkfd/kfd_events.c | 4 ++-- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 4 ++-- drivers/gpu/drm/drm_gem.c | 2 +- drivers/gpu/drm/drm_gem_dma_helper.c | 3 +-- drivers/gpu/drm/drm_gem_shmem_helper.c | 2 +- drivers/gpu/drm/drm_vm.c | 8 ++++---- drivers/gpu/drm/etnaviv/etnaviv_gem.c | 2 +- drivers/gpu/drm/exynos/exynos_drm_gem.c | 4 ++-- drivers/gpu/drm/gma500/framebuffer.c | 2 +- drivers/gpu/drm/i810/i810_dma.c | 2 +- drivers/gpu/drm/i915/gem/i915_gem_mman.c | 4 ++-- drivers/gpu/drm/mediatek/mtk_drm_gem.c | 2 +- drivers/gpu/drm/msm/msm_gem.c | 2 +- drivers/gpu/drm/omapdrm/omap_gem.c | 3 +-- drivers/gpu/drm/rockchip/rockchip_drm_gem.c | 3 +-- drivers/gpu/drm/tegra/gem.c | 5 ++--- drivers/gpu/drm/ttm/ttm_bo_vm.c | 3 +-- drivers/gpu/drm/virtio/virtgpu_vram.c | 2 +- drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c | 2 +- drivers/gpu/drm/xen/xen_drm_front_gem.c | 3 +-- drivers/hsi/clients/cmt_speech.c | 2 +- drivers/hwtracing/intel_th/msu.c | 2 +- drivers/hwtracing/stm/core.c | 2 +- drivers/infiniband/hw/hfi1/file_ops.c | 4 ++-- drivers/infiniband/hw/mlx5/main.c | 4 ++-- drivers/infiniband/hw/qib/qib_file_ops.c | 13 ++++++------- drivers/infiniband/hw/usnic/usnic_ib_verbs.c | 2 +- drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c | 2 +- .../media/common/videobuf2/videobuf2-dma-contig.c | 2 +- drivers/media/common/videobuf2/videobuf2-vmalloc.c | 2 +- drivers/media/v4l2-core/videobuf-dma-contig.c | 2 +- drivers/media/v4l2-core/videobuf-dma-sg.c | 4 ++-- drivers/media/v4l2-core/videobuf-vmalloc.c | 2 +- drivers/misc/cxl/context.c | 2 +- drivers/misc/habanalabs/common/memory.c | 2 +- drivers/misc/habanalabs/gaudi/gaudi.c | 4 ++-- drivers/misc/habanalabs/gaudi2/gaudi2.c | 8 ++++---- drivers/misc/habanalabs/goya/goya.c | 4 ++-- drivers/misc/ocxl/context.c | 4 ++-- drivers/misc/ocxl/sysfs.c | 2 +- drivers/misc/open-dice.c | 4 ++-- drivers/misc/sgi-gru/grufile.c | 4 ++-- drivers/misc/uacce/uacce.c | 2 +- drivers/sbus/char/oradax.c | 2 +- drivers/scsi/cxlflash/ocxl_hw.c | 2 +- drivers/scsi/sg.c | 2 +- drivers/staging/media/atomisp/pci/hmm/hmm_bo.c | 2 +- drivers/staging/media/deprecated/meye/meye.c | 4 ++-- .../media/deprecated/stkwebcam/stk-webcam.c | 2 +- drivers/target/target_core_user.c | 2 +- drivers/uio/uio.c | 2 +- drivers/usb/core/devio.c | 3 +-- drivers/usb/mon/mon_bin.c | 3 +-- drivers/vdpa/vdpa_user/iova_domain.c | 2 +- drivers/vfio/pci/vfio_pci_core.c | 2 +- drivers/vhost/vdpa.c | 2 +- drivers/video/fbdev/68328fb.c | 2 +- drivers/video/fbdev/core/fb_defio.c | 4 ++-- drivers/xen/gntalloc.c | 2 +- drivers/xen/gntdev.c | 4 ++-- drivers/xen/privcmd-buf.c | 2 +- drivers/xen/privcmd.c | 4 ++-- fs/aio.c | 2 +- fs/cramfs/inode.c | 2 +- fs/erofs/data.c | 2 +- fs/exec.c | 4 ++-- fs/ext4/file.c | 2 +- fs/fuse/dax.c | 2 +- fs/hugetlbfs/inode.c | 4 ++-- fs/orangefs/file.c | 3 +-- fs/proc/task_mmu.c | 2 +- fs/proc/vmcore.c | 3 +-- fs/userfaultfd.c | 2 +- fs/xfs/xfs_file.c | 2 +- include/linux/mm.h | 2 +- kernel/bpf/ringbuf.c | 4 ++-- kernel/bpf/syscall.c | 4 ++-- kernel/events/core.c | 2 +- kernel/kcov.c | 2 +- kernel/relay.c | 2 +- mm/madvise.c | 2 +- mm/memory.c | 6 +++--- mm/mlock.c | 6 +++--- mm/mmap.c | 10 +++++----- mm/mprotect.c | 2 +- mm/mremap.c | 6 +++--- mm/nommu.c | 11 ++++++----- mm/secretmem.c | 2 +- mm/shmem.c | 2 +- mm/vmalloc.c | 2 +- net/ipv4/tcp.c | 4 ++-- security/selinux/selinuxfs.c | 6 +++--- sound/core/oss/pcm_oss.c | 2 +- sound/core/pcm_native.c | 9 +++++---- sound/soc/pxa/mmp-sspa.c | 2 +- sound/usb/usx2y/us122l.c | 4 ++-- sound/usb/usx2y/usX2Yhwdep.c | 2 +- sound/usb/usx2y/usx2yhwdeppcm.c | 2 +- 120 files changed, 188 insertions(+), 199 deletions(-) diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index f811733a8fc57..61c30b9a24eac 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -316,7 +316,7 @@ static int __init gate_vma_init(void) gate_vma.vm_page_prot = PAGE_READONLY_EXEC; gate_vma.vm_start = 0xffff0000; gate_vma.vm_end = 0xffff0000 + PAGE_SIZE; - gate_vma.vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC; + vm_flags_init(&gate_vma, VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC); return 0; } arch_initcall(gate_vma_init); diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index fc4e4217e87ff..7f5353e28516f 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -109,7 +109,7 @@ ia64_init_addr_space (void) vma_set_anonymous(vma); vma->vm_start = current->thread.rbs_bot & PAGE_MASK; vma->vm_end = vma->vm_start + PAGE_SIZE; - vma->vm_flags = VM_DATA_DEFAULT_FLAGS|VM_GROWSUP|VM_ACCOUNT; + vm_flags_init(vma, VM_DATA_DEFAULT_FLAGS|VM_GROWSUP|VM_ACCOUNT); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); mmap_write_lock(current->mm); if (insert_vm_struct(current->mm, vma)) { @@ -127,8 +127,8 @@ ia64_init_addr_space (void) vma_set_anonymous(vma); vma->vm_end = PAGE_SIZE; vma->vm_page_prot = __pgprot(pgprot_val(PAGE_READONLY) | _PAGE_MA_NAT); - vma->vm_flags = VM_READ | VM_MAYREAD | VM_IO | - VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_init(vma, VM_READ | VM_MAYREAD | VM_IO | + VM_DONTEXPAND | VM_DONTDUMP); mmap_write_lock(current->mm); if (insert_vm_struct(current->mm, vma)) { mmap_write_unlock(current->mm); @@ -272,7 +272,7 @@ static int __init gate_vma_init(void) vma_init(&gate_vma, NULL); gate_vma.vm_start = FIXADDR_USER_START; gate_vma.vm_end = FIXADDR_USER_END; - gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; + vm_flags_init(&gate_vma, VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC); gate_vma.vm_page_prot = __pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_RX); return 0; diff --git a/arch/loongarch/include/asm/tlb.h b/arch/loongarch/include/asm/tlb.h index dd24f5898f651..f5e4deb97402f 100644 --- a/arch/loongarch/include/asm/tlb.h +++ b/arch/loongarch/include/asm/tlb.h @@ -149,7 +149,7 @@ static inline void tlb_flush(struct mmu_gather *tlb) struct vm_area_struct vma; vma.vm_mm = tlb->mm; - vma.vm_flags = 0; + vm_flags_init(&vma, 0); if (tlb->fullmm) { flush_tlb_mm(tlb->mm); return; diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index 4f566bea5e10f..712ab91ced398 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -324,7 +324,7 @@ static int kvmppc_xive_native_mmap(struct kvm_device *dev, return -EINVAL; } - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot); /* diff --git a/arch/powerpc/mm/book3s64/subpage_prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c index d73b3b4176e81..b75a9fb99599a 100644 --- a/arch/powerpc/mm/book3s64/subpage_prot.c +++ b/arch/powerpc/mm/book3s64/subpage_prot.c @@ -156,7 +156,7 @@ static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, * VM_NOHUGEPAGE and split them. */ for_each_vma_range(vmi, vma, addr + len) { - vma->vm_flags |= VM_NOHUGEPAGE; + vm_flags_set(vma, VM_NOHUGEPAGE); walk_page_vma(vma, &subpage_walk_ops, NULL); } } diff --git a/arch/powerpc/platforms/book3s/vas-api.c b/arch/powerpc/platforms/book3s/vas-api.c index 9580e8e12165c..36c21648d19a3 100644 --- a/arch/powerpc/platforms/book3s/vas-api.c +++ b/arch/powerpc/platforms/book3s/vas-api.c @@ -525,7 +525,7 @@ static int coproc_mmap(struct file *fp, struct vm_area_struct *vma) pfn = paste_addr >> PAGE_SHIFT; /* flags, page_prot from cxl_mmap(), except we want cachable */ - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_cached(vma->vm_page_prot); prot = __pgprot(pgprot_val(vma->vm_page_prot) | _PAGE_DIRTY); diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c index 62d90a5e23d1e..02a8158c469dd 100644 --- a/arch/powerpc/platforms/cell/spufs/file.c +++ b/arch/powerpc/platforms/cell/spufs/file.c @@ -291,7 +291,7 @@ static int spufs_mem_mmap(struct file *file, struct vm_area_struct *vma) if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot); vma->vm_ops = &spufs_mem_mmap_vmops; @@ -381,7 +381,7 @@ static int spufs_cntl_mmap(struct file *file, struct vm_area_struct *vma) if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &spufs_cntl_mmap_vmops; @@ -1043,7 +1043,7 @@ static int spufs_signal1_mmap(struct file *file, struct vm_area_struct *vma) if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &spufs_signal1_mmap_vmops; @@ -1179,7 +1179,7 @@ static int spufs_signal2_mmap(struct file *file, struct vm_area_struct *vma) if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &spufs_signal2_mmap_vmops; @@ -1302,7 +1302,7 @@ static int spufs_mss_mmap(struct file *file, struct vm_area_struct *vma) if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &spufs_mss_mmap_vmops; @@ -1364,7 +1364,7 @@ static int spufs_psmap_mmap(struct file *file, struct vm_area_struct *vma) if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &spufs_psmap_mmap_vmops; @@ -1424,7 +1424,7 @@ static int spufs_mfc_mmap(struct file *file, struct vm_area_struct *vma) if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &spufs_mfc_mmap_vmops; diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 69af6cdf1a2ad..ab836597419dc 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -2522,8 +2522,7 @@ static inline void thp_split_mm(struct mm_struct *mm) VMA_ITERATOR(vmi, mm, 0); for_each_vma(vmi, vma) { - vma->vm_flags &= ~VM_HUGEPAGE; - vma->vm_flags |= VM_NOHUGEPAGE; + vm_flags_mod(vma, VM_NOHUGEPAGE, VM_HUGEPAGE); walk_page_vma(vma, &thp_split_walk_ops, NULL); } mm->def_flags |= VM_NOHUGEPAGE; diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index 4af81df133ee8..d234ca797e4a7 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -391,7 +391,7 @@ void __init map_vsyscall(void) } if (vsyscall_mode == XONLY) - gate_vma.vm_flags = VM_EXEC; + vm_flags_init(&gate_vma, VM_EXEC); BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != (unsigned long)VSYSCALL_ADDR); diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c index aa9b8b8688676..262f5fb18d74d 100644 --- a/arch/x86/kernel/cpu/sgx/driver.c +++ b/arch/x86/kernel/cpu/sgx/driver.c @@ -95,7 +95,7 @@ static int sgx_mmap(struct file *file, struct vm_area_struct *vma) return ret; vma->vm_ops = &sgx_vm_ops; - vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO; + vm_flags_set(vma, VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO); vma->vm_private_data = encl; return 0; diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c index 6a77a14eee38c..c3e37eaec8ecd 100644 --- a/arch/x86/kernel/cpu/sgx/virt.c +++ b/arch/x86/kernel/cpu/sgx/virt.c @@ -105,7 +105,7 @@ static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_ops = &sgx_vepc_vm_ops; /* Don't copy VMA in fork() */ - vma->vm_flags |= VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY; + vm_flags_set(vma, VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY); vma->vm_private_data = vepc; return 0; diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index fb4b1b5e0deab..6ca51b1aa5d98 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -1000,7 +1000,7 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, ret = reserve_pfn_range(paddr, size, prot, 0); if (ret == 0 && vma) - vma->vm_flags |= VM_PAT; + vm_flags_set(vma, VM_PAT); return ret; } @@ -1066,7 +1066,7 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, } free_pfn_range(paddr, size); if (vma) - vma->vm_flags &= ~VM_PAT; + vm_flags_clear(vma, VM_PAT); } /* @@ -1076,7 +1076,7 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, */ void untrack_pfn_moved(struct vm_area_struct *vma) { - vma->vm_flags &= ~VM_PAT; + vm_flags_clear(vma, VM_PAT); } pgprot_t pgprot_writecombine(pgprot_t prot) diff --git a/arch/x86/um/mem_32.c b/arch/x86/um/mem_32.c index cafd01f730dab..29b2203bc82cb 100644 --- a/arch/x86/um/mem_32.c +++ b/arch/x86/um/mem_32.c @@ -16,7 +16,7 @@ static int __init gate_vma_init(void) vma_init(&gate_vma, NULL); gate_vma.vm_start = FIXADDR_USER_START; gate_vma.vm_end = FIXADDR_USER_END; - gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; + vm_flags_init(&gate_vma, VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC); gate_vma.vm_page_prot = PAGE_READONLY; return 0; diff --git a/drivers/acpi/pfr_telemetry.c b/drivers/acpi/pfr_telemetry.c index 27fb6cdad75f9..843f678ade0c2 100644 --- a/drivers/acpi/pfr_telemetry.c +++ b/drivers/acpi/pfr_telemetry.c @@ -310,7 +310,7 @@ pfrt_log_mmap(struct file *file, struct vm_area_struct *vma) return -EROFS; /* changing from read to write with mprotect is not allowed */ - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); pfrt_log_dev = to_pfrt_log_dev(file); diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 880224ec6abb8..cb08982b9666d 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -5572,8 +5572,7 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma) proc->pid, vma->vm_start, vma->vm_end, "bad vm_flags", -EPERM); return -EPERM; } - vma->vm_flags |= VM_DONTCOPY | VM_MIXEDMAP; - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_mod(vma, VM_DONTCOPY | VM_MIXEDMAP, VM_MAYWRITE); vma->vm_ops = &binder_vm_ops; vma->vm_private_data = proc; diff --git a/drivers/char/mspec.c b/drivers/char/mspec.c index f8231e2e84bec..b35f651837c83 100644 --- a/drivers/char/mspec.c +++ b/drivers/char/mspec.c @@ -206,7 +206,7 @@ mspec_mmap(struct file *file, struct vm_area_struct *vma, refcount_set(&vdata->refcnt, 1); vma->vm_private_data = vdata; - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); if (vdata->type == MSPEC_UNCACHED) vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &mspec_vm_ops; diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index 007ac7a69ce74..733fe10339104 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -2363,7 +2363,7 @@ static int hisi_qm_uacce_mmap(struct uacce_queue *q, return -EINVAL; } - vma->vm_flags |= VM_IO; + vm_flags_set(vma, VM_IO); return remap_pfn_range(vma, vma->vm_start, phys_base >> PAGE_SHIFT, diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 5494d745ced58..223e4e233d19f 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -308,7 +308,7 @@ static int dax_mmap(struct file *filp, struct vm_area_struct *vma) return rc; vma->vm_ops = &dax_vm_ops; - vma->vm_flags |= VM_HUGEPAGE; + vm_flags_set(vma, VM_HUGEPAGE); return 0; } diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index e13e92609943d..674bfefca088d 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -201,7 +201,7 @@ static int idxd_cdev_mmap(struct file *filp, struct vm_area_struct *vma) if (rc < 0) return rc; - vma->vm_flags |= VM_DONTCOPY; + vm_flags_set(vma, VM_DONTCOPY); pfn = (base + idxd_get_wq_portal_full_offset(wq->id, IDXD_PORTAL_LIMITED)) >> PAGE_SHIFT; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c index bb7350ea1d759..a69fd6fdabb4e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c @@ -257,7 +257,7 @@ static int amdgpu_gem_object_mmap(struct drm_gem_object *obj, struct vm_area_str */ if (is_cow_mapping(vma->vm_flags) && !(vma->vm_flags & VM_ACCESS_FLAGS)) - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); return drm_gem_ttm_mmap(obj, vma); } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 6d291aa6386bd..d0933dd9af06d 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -2879,8 +2879,8 @@ static int kfd_mmio_mmap(struct kfd_dev *dev, struct kfd_process *process, address = dev->adev->rmmio_remap.bus_addr; - vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE | - VM_DONTDUMP | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE | + VM_DONTDUMP | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c index cd4e61bf04939..cbef2e147da58 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c @@ -159,8 +159,8 @@ int kfd_doorbell_mmap(struct kfd_dev *dev, struct kfd_process *process, address = kfd_get_process_doorbells(pdd); if (!address) return -ENOMEM; - vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE | - VM_DONTDUMP | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE | + VM_DONTDUMP | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index 729d26d648af3..dd0436bf349a4 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -1052,8 +1052,8 @@ int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma) pfn = __pa(page->kernel_address); pfn >>= PAGE_SHIFT; - vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE - | VM_DONTDUMP | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE + | VM_DONTDUMP | VM_PFNMAP); pr_debug("Mapping signal page\n"); pr_debug(" start user address == 0x%08lx\n", vma->vm_start); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 51b1683ac5c1e..1fad0ecdfaeb7 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -1978,8 +1978,8 @@ int kfd_reserved_mem_mmap(struct kfd_dev *dev, struct kfd_process *process, return -ENOMEM; } - vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND - | VM_NORESERVE | VM_DONTDUMP | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_DONTCOPY | VM_DONTEXPAND + | VM_NORESERVE | VM_DONTDUMP | VM_PFNMAP); /* Mapping pages to user process */ return remap_pfn_range(vma, vma->vm_start, PFN_DOWN(__pa(qpd->cwsr_kaddr)), diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c index b8db675e7fb5e..54c76003d2cce 100644 --- a/drivers/gpu/drm/drm_gem.c +++ b/drivers/gpu/drm/drm_gem.c @@ -1047,7 +1047,7 @@ int drm_gem_mmap_obj(struct drm_gem_object *obj, unsigned long obj_size, goto err_drm_gem_object_put; } - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags)); vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot); } diff --git a/drivers/gpu/drm/drm_gem_dma_helper.c b/drivers/gpu/drm/drm_gem_dma_helper.c index 1e658c4483668..fb2c764accc63 100644 --- a/drivers/gpu/drm/drm_gem_dma_helper.c +++ b/drivers/gpu/drm/drm_gem_dma_helper.c @@ -530,8 +530,7 @@ int drm_gem_dma_mmap(struct drm_gem_dma_object *dma_obj, struct vm_area_struct * * the whole buffer. */ vma->vm_pgoff -= drm_vma_node_start(&obj->vma_node); - vma->vm_flags &= ~VM_PFNMAP; - vma->vm_flags |= VM_DONTEXPAND; + vm_flags_mod(vma, VM_DONTEXPAND, VM_PFNMAP); if (dma_obj->map_noncoherent) { vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); diff --git a/drivers/gpu/drm/drm_gem_shmem_helper.c b/drivers/gpu/drm/drm_gem_shmem_helper.c index b602cd72a1205..a2c28483e0103 100644 --- a/drivers/gpu/drm/drm_gem_shmem_helper.c +++ b/drivers/gpu/drm/drm_gem_shmem_helper.c @@ -633,7 +633,7 @@ int drm_gem_shmem_mmap(struct drm_gem_shmem_object *shmem, struct vm_area_struct if (ret) return ret; - vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); if (shmem->map_wc) vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); diff --git a/drivers/gpu/drm/drm_vm.c b/drivers/gpu/drm/drm_vm.c index f024dc93939ef..87c9fe55dec76 100644 --- a/drivers/gpu/drm/drm_vm.c +++ b/drivers/gpu/drm/drm_vm.c @@ -476,7 +476,7 @@ static int drm_mmap_dma(struct file *filp, struct vm_area_struct *vma) if (!capable(CAP_SYS_ADMIN) && (dma->flags & _DRM_DMA_USE_PCI_RO)) { - vma->vm_flags &= ~(VM_WRITE | VM_MAYWRITE); + vm_flags_clear(vma, VM_WRITE | VM_MAYWRITE); #if defined(__i386__) || defined(__x86_64__) pgprot_val(vma->vm_page_prot) &= ~_PAGE_RW; #else @@ -492,7 +492,7 @@ static int drm_mmap_dma(struct file *filp, struct vm_area_struct *vma) vma->vm_ops = &drm_vm_dma_ops; - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); drm_vm_open_locked(dev, vma); return 0; @@ -560,7 +560,7 @@ static int drm_mmap_locked(struct file *filp, struct vm_area_struct *vma) return -EINVAL; if (!capable(CAP_SYS_ADMIN) && (map->flags & _DRM_READ_ONLY)) { - vma->vm_flags &= ~(VM_WRITE | VM_MAYWRITE); + vm_flags_clear(vma, VM_WRITE | VM_MAYWRITE); #if defined(__i386__) || defined(__x86_64__) pgprot_val(vma->vm_page_prot) &= ~_PAGE_RW; #else @@ -628,7 +628,7 @@ static int drm_mmap_locked(struct file *filp, struct vm_area_struct *vma) default: return -EINVAL; /* This should never happen. */ } - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); drm_vm_open_locked(dev, vma); return 0; diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c index c5ae5492e1af0..b5f73502e3dd4 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c @@ -130,7 +130,7 @@ static int etnaviv_gem_mmap_obj(struct etnaviv_gem_object *etnaviv_obj, { pgprot_t vm_page_prot; - vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); vm_page_prot = vm_get_page_prot(vma->vm_flags); diff --git a/drivers/gpu/drm/exynos/exynos_drm_gem.c b/drivers/gpu/drm/exynos/exynos_drm_gem.c index 3e493f48e0d44..638ca96830e9d 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_gem.c +++ b/drivers/gpu/drm/exynos/exynos_drm_gem.c @@ -274,7 +274,7 @@ static int exynos_drm_gem_mmap_buffer(struct exynos_drm_gem *exynos_gem, unsigned long vm_size; int ret; - vma->vm_flags &= ~VM_PFNMAP; + vm_flags_clear(vma, VM_PFNMAP); vma->vm_pgoff = 0; vm_size = vma->vm_end - vma->vm_start; @@ -368,7 +368,7 @@ static int exynos_drm_gem_mmap(struct drm_gem_object *obj, struct vm_area_struct if (obj->import_attach) return dma_buf_mmap(obj->dma_buf, vma, 0); - vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_DONTEXPAND | VM_DONTDUMP); DRM_DEV_DEBUG_KMS(to_dma_dev(obj->dev), "flags = 0x%x\n", exynos_gem->flags); diff --git a/drivers/gpu/drm/gma500/framebuffer.c b/drivers/gpu/drm/gma500/framebuffer.c index 8d5a37b8f1100..a9276c8a3e4e7 100644 --- a/drivers/gpu/drm/gma500/framebuffer.c +++ b/drivers/gpu/drm/gma500/framebuffer.c @@ -139,7 +139,7 @@ static int psbfb_mmap(struct fb_info *info, struct vm_area_struct *vma) */ vma->vm_ops = &psbfb_vm_ops; vma->vm_private_data = (void *)fb; - vma->vm_flags |= VM_IO | VM_MIXEDMAP | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_MIXEDMAP | VM_DONTEXPAND | VM_DONTDUMP); return 0; } diff --git a/drivers/gpu/drm/i810/i810_dma.c b/drivers/gpu/drm/i810/i810_dma.c index 9fb4dd63342f3..01967dd887625 100644 --- a/drivers/gpu/drm/i810/i810_dma.c +++ b/drivers/gpu/drm/i810/i810_dma.c @@ -102,7 +102,7 @@ static int i810_mmap_buffers(struct file *filp, struct vm_area_struct *vma) buf = dev_priv->mmap_buffer; buf_priv = buf->dev_private; - vma->vm_flags |= VM_DONTCOPY; + vm_flags_set(vma, VM_DONTCOPY); buf_priv->currently_mapped = I810_BUF_MAPPED; diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/i915_gem_mman.c index 0ad44f3868ded..e95f4c729ca5c 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c @@ -979,7 +979,7 @@ int i915_gem_mmap(struct file *filp, struct vm_area_struct *vma) i915_gem_object_put(obj); return -EINVAL; } - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); } anon = mmap_singleton(to_i915(dev)); @@ -988,7 +988,7 @@ int i915_gem_mmap(struct file *filp, struct vm_area_struct *vma) return PTR_ERR(anon); } - vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO; + vm_flags_set(vma, VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO); /* * We keep the ref on mmo->obj, not vm_file, but we require diff --git a/drivers/gpu/drm/mediatek/mtk_drm_gem.c b/drivers/gpu/drm/mediatek/mtk_drm_gem.c index 47e96b0289f98..28659514bf205 100644 --- a/drivers/gpu/drm/mediatek/mtk_drm_gem.c +++ b/drivers/gpu/drm/mediatek/mtk_drm_gem.c @@ -158,7 +158,7 @@ static int mtk_drm_gem_object_mmap(struct drm_gem_object *obj, * dma_alloc_attrs() allocated a struct page table for mtk_gem, so clear * VM_PFNMAP flag that was set by drm_gem_mmap_obj()/drm_gem_mmap(). */ - vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags)); vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot); diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index 1dee0d18abbb6..c2fb98a94bc3e 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -1012,7 +1012,7 @@ static int msm_gem_object_mmap(struct drm_gem_object *obj, struct vm_area_struct { struct msm_gem_object *msm_obj = to_msm_bo(obj); - vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_page_prot = msm_gem_pgprot(msm_obj, vm_get_page_prot(vma->vm_flags)); return 0; diff --git a/drivers/gpu/drm/omapdrm/omap_gem.c b/drivers/gpu/drm/omapdrm/omap_gem.c index cf571796fd26e..19fef933904b2 100644 --- a/drivers/gpu/drm/omapdrm/omap_gem.c +++ b/drivers/gpu/drm/omapdrm/omap_gem.c @@ -543,8 +543,7 @@ int omap_gem_mmap_obj(struct drm_gem_object *obj, { struct omap_gem_object *omap_obj = to_omap_bo(obj); - vma->vm_flags &= ~VM_PFNMAP; - vma->vm_flags |= VM_MIXEDMAP; + vm_flags_mod(vma, VM_MIXEDMAP, VM_PFNMAP); if (omap_obj->flags & OMAP_BO_WC) { vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags)); diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_gem.c b/drivers/gpu/drm/rockchip/rockchip_drm_gem.c index 6edb7c52cb3dc..8ea09d915c3ca 100644 --- a/drivers/gpu/drm/rockchip/rockchip_drm_gem.c +++ b/drivers/gpu/drm/rockchip/rockchip_drm_gem.c @@ -251,8 +251,7 @@ static int rockchip_drm_gem_object_mmap(struct drm_gem_object *obj, * We allocated a struct page table for rk_obj, so clear * VM_PFNMAP flag that was set by drm_gem_mmap_obj()/drm_gem_mmap(). */ - vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; - vma->vm_flags &= ~VM_PFNMAP; + vm_flags_mod(vma, VM_IO | VM_DONTEXPAND | VM_DONTDUMP, VM_PFNMAP); vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags)); vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot); diff --git a/drivers/gpu/drm/tegra/gem.c b/drivers/gpu/drm/tegra/gem.c index 979e7bc902f6a..bce991a2ccc0b 100644 --- a/drivers/gpu/drm/tegra/gem.c +++ b/drivers/gpu/drm/tegra/gem.c @@ -574,7 +574,7 @@ int __tegra_gem_mmap(struct drm_gem_object *gem, struct vm_area_struct *vma) * and set the vm_pgoff (used as a fake buffer offset by DRM) * to 0 as we want to map the whole buffer. */ - vma->vm_flags &= ~VM_PFNMAP; + vm_flags_clear(vma, VM_PFNMAP); vma->vm_pgoff = 0; err = dma_mmap_wc(gem->dev->dev, vma, bo->vaddr, bo->iova, @@ -588,8 +588,7 @@ int __tegra_gem_mmap(struct drm_gem_object *gem, struct vm_area_struct *vma) } else { pgprot_t prot = vm_get_page_prot(vma->vm_flags); - vma->vm_flags |= VM_MIXEDMAP; - vma->vm_flags &= ~VM_PFNMAP; + vm_flags_mod(vma, VM_MIXEDMAP, VM_PFNMAP); vma->vm_page_prot = pgprot_writecombine(prot); } diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c index 5a3e4b8913772..c00207582c744 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_vm.c +++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c @@ -468,8 +468,7 @@ int ttm_bo_mmap_obj(struct vm_area_struct *vma, struct ttm_buffer_object *bo) vma->vm_private_data = bo; - vma->vm_flags |= VM_PFNMAP; - vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_PFNMAP | VM_IO | VM_DONTEXPAND | VM_DONTDUMP); return 0; } EXPORT_SYMBOL(ttm_bo_mmap_obj); diff --git a/drivers/gpu/drm/virtio/virtgpu_vram.c b/drivers/gpu/drm/virtio/virtgpu_vram.c index 6b45b0429fef9..25df81c027837 100644 --- a/drivers/gpu/drm/virtio/virtgpu_vram.c +++ b/drivers/gpu/drm/virtio/virtgpu_vram.c @@ -46,7 +46,7 @@ static int virtio_gpu_vram_mmap(struct drm_gem_object *obj, return -EINVAL; vma->vm_pgoff -= drm_vma_node_start(&obj->vma_node); - vma->vm_flags |= VM_MIXEDMAP | VM_DONTEXPAND; + vm_flags_set(vma, VM_MIXEDMAP | VM_DONTEXPAND); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot); vma->vm_ops = &virtio_gpu_vram_vm_ops; diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c index 265f7c48d856e..90097d04b45f3 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c @@ -97,7 +97,7 @@ int vmw_mmap(struct file *filp, struct vm_area_struct *vma) /* Use VM_PFNMAP rather than VM_MIXEDMAP if not a COW mapping */ if (!is_cow_mapping(vma->vm_flags)) - vma->vm_flags = (vma->vm_flags & ~VM_MIXEDMAP) | VM_PFNMAP; + vm_flags_mod(vma, VM_PFNMAP, VM_MIXEDMAP); ttm_bo_put(bo); /* release extra ref taken by ttm_bo_mmap_obj() */ diff --git a/drivers/gpu/drm/xen/xen_drm_front_gem.c b/drivers/gpu/drm/xen/xen_drm_front_gem.c index 4c95ebcdcc2d3..3ad2b4cfd1f0b 100644 --- a/drivers/gpu/drm/xen/xen_drm_front_gem.c +++ b/drivers/gpu/drm/xen/xen_drm_front_gem.c @@ -69,8 +69,7 @@ static int xen_drm_front_gem_object_mmap(struct drm_gem_object *gem_obj, * vm_pgoff (used as a fake buffer offset by DRM) to 0 as we want to map * the whole buffer. */ - vma->vm_flags &= ~VM_PFNMAP; - vma->vm_flags |= VM_MIXEDMAP | VM_DONTEXPAND; + vm_flags_mod(vma, VM_MIXEDMAP | VM_DONTEXPAND, VM_PFNMAP); vma->vm_pgoff = 0; /* diff --git a/drivers/hsi/clients/cmt_speech.c b/drivers/hsi/clients/cmt_speech.c index 8069f795c8649..daa8e1bff5d9d 100644 --- a/drivers/hsi/clients/cmt_speech.c +++ b/drivers/hsi/clients/cmt_speech.c @@ -1264,7 +1264,7 @@ static int cs_char_mmap(struct file *file, struct vm_area_struct *vma) if (vma_pages(vma) != 1) return -EINVAL; - vma->vm_flags |= VM_IO | VM_DONTDUMP | VM_DONTEXPAND; + vm_flags_set(vma, VM_IO | VM_DONTDUMP | VM_DONTEXPAND); vma->vm_ops = &cs_char_vm_ops; vma->vm_private_data = file->private_data; diff --git a/drivers/hwtracing/intel_th/msu.c b/drivers/hwtracing/intel_th/msu.c index 6c8215a47a601..9621efe0e95c4 100644 --- a/drivers/hwtracing/intel_th/msu.c +++ b/drivers/hwtracing/intel_th/msu.c @@ -1659,7 +1659,7 @@ static int intel_th_msc_mmap(struct file *file, struct vm_area_struct *vma) atomic_dec(&msc->user_count); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - vma->vm_flags |= VM_DONTEXPAND | VM_DONTCOPY; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTCOPY); vma->vm_ops = &msc_mmap_ops; return ret; } diff --git a/drivers/hwtracing/stm/core.c b/drivers/hwtracing/stm/core.c index 2712e699ba08c..534fbefc7f6aa 100644 --- a/drivers/hwtracing/stm/core.c +++ b/drivers/hwtracing/stm/core.c @@ -715,7 +715,7 @@ static int stm_char_mmap(struct file *file, struct vm_area_struct *vma) pm_runtime_get_sync(&stm->dev); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &stm_mmap_vmops; vm_iomap_memory(vma, phys, size); diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index f5f9269fdc162..c6e59bc480f90 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -403,7 +403,7 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma) ret = -EPERM; goto done; } - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); addr = vma->vm_start; for (i = 0 ; i < uctxt->egrbufs.numbufs; i++) { memlen = uctxt->egrbufs.buffers[i].len; @@ -528,7 +528,7 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma) goto done; } - vma->vm_flags = flags; + vm_flags_reset(vma, flags); hfi1_cdbg(PROC, "%u:%u type:%u io/vf:%d/%d, addr:0x%llx, len:%lu(%lu), flags:0x%lx\n", ctxt, subctxt, type, mapio, vmf, memaddr, memlen, diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index c669ef6e47e73..e3c97aa2c46cd 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2087,7 +2087,7 @@ static int mlx5_ib_mmap_clock_info_page(struct mlx5_ib_dev *dev, if (vma->vm_flags & (VM_WRITE | VM_EXEC)) return -EPERM; - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); if (!dev->mdev->clock_info) return -EOPNOTSUPP; @@ -2311,7 +2311,7 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm if (vma->vm_flags & VM_WRITE) return -EPERM; - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); /* Don't expose to user-space information it shouldn't have */ if (PAGE_SIZE > 4096) diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index 3937144b2ae5a..80fe92a21f960 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -733,7 +733,7 @@ static int qib_mmap_mem(struct vm_area_struct *vma, struct qib_ctxtdata *rcd, } /* don't allow them to later change with mprotect */ - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); } pfn = virt_to_phys(kvaddr) >> PAGE_SHIFT; @@ -769,7 +769,7 @@ static int mmap_ureg(struct vm_area_struct *vma, struct qib_devdata *dd, phys = dd->physaddr + ureg; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND; + vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND); ret = io_remap_pfn_range(vma, vma->vm_start, phys >> PAGE_SHIFT, vma->vm_end - vma->vm_start, @@ -810,8 +810,7 @@ static int mmap_piobufs(struct vm_area_struct *vma, * don't allow them to later change to readable with mprotect (for when * not initially mapped readable, as is normally the case) */ - vma->vm_flags &= ~VM_MAYREAD; - vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND; + vm_flags_mod(vma, VM_DONTCOPY | VM_DONTEXPAND, VM_MAYREAD); /* We used PAT if wc_cookie == 0 */ if (!dd->wc_cookie) @@ -852,7 +851,7 @@ static int mmap_rcvegrbufs(struct vm_area_struct *vma, goto bail; } /* don't allow them to later change to writable with mprotect */ - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); start = vma->vm_start; @@ -944,7 +943,7 @@ static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr, * Don't allow permission to later change to writable * with mprotect. */ - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); } else goto bail; len = vma->vm_end - vma->vm_start; @@ -955,7 +954,7 @@ static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr, vma->vm_pgoff = (unsigned long) addr >> PAGE_SHIFT; vma->vm_ops = &qib_file_vm_ops; - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); ret = 1; bail: diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c index 6e8c4fbb80834..6289238cc5af8 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c @@ -672,7 +672,7 @@ int usnic_ib_mmap(struct ib_ucontext *context, usnic_dbg("\n"); us_ibdev = to_usdev(context->device); - vma->vm_flags |= VM_IO; + vm_flags_set(vma, VM_IO); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vfid = vma->vm_pgoff; usnic_dbg("Page Offset %lu PAGE_SHIFT %u VFID %u\n", diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c index 19176583dbde6..9f54aa90a35a6 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c @@ -408,7 +408,7 @@ int pvrdma_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) } /* Map UAR to kernel space, VM_LOCKED? */ - vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND; + vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); if (io_remap_pfn_range(vma, start, context->uar.pfn, size, vma->vm_page_prot)) diff --git a/drivers/media/common/videobuf2/videobuf2-dma-contig.c b/drivers/media/common/videobuf2/videobuf2-dma-contig.c index 5f1175f8b3498..205d3cac425cf 100644 --- a/drivers/media/common/videobuf2/videobuf2-dma-contig.c +++ b/drivers/media/common/videobuf2/videobuf2-dma-contig.c @@ -293,7 +293,7 @@ static int vb2_dc_mmap(void *buf_priv, struct vm_area_struct *vma) return ret; } - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); vma->vm_private_data = &buf->handler; vma->vm_ops = &vb2_common_vm_ops; diff --git a/drivers/media/common/videobuf2/videobuf2-vmalloc.c b/drivers/media/common/videobuf2/videobuf2-vmalloc.c index 959b45beb1f3e..a6c6d2fcaaa46 100644 --- a/drivers/media/common/videobuf2/videobuf2-vmalloc.c +++ b/drivers/media/common/videobuf2/videobuf2-vmalloc.c @@ -185,7 +185,7 @@ static int vb2_vmalloc_mmap(void *buf_priv, struct vm_area_struct *vma) /* * Make sure that vm_areas for 2 buffers won't be merged together */ - vma->vm_flags |= VM_DONTEXPAND; + vm_flags_set(vma, VM_DONTEXPAND); /* * Use common vm_area operations to track buffer refcount. diff --git a/drivers/media/v4l2-core/videobuf-dma-contig.c b/drivers/media/v4l2-core/videobuf-dma-contig.c index f2c4393595574..4c2ec7a0d804a 100644 --- a/drivers/media/v4l2-core/videobuf-dma-contig.c +++ b/drivers/media/v4l2-core/videobuf-dma-contig.c @@ -314,7 +314,7 @@ static int __videobuf_mmap_mapper(struct videobuf_queue *q, } vma->vm_ops = &videobuf_vm_ops; - vma->vm_flags |= VM_DONTEXPAND; + vm_flags_set(vma, VM_DONTEXPAND); vma->vm_private_data = map; dev_dbg(q->dev, "mmap %p: q=%p %08lx-%08lx (%lx) pgoff %08lx buf %d\n", diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c index 234e9f647c963..53001532e8e30 100644 --- a/drivers/media/v4l2-core/videobuf-dma-sg.c +++ b/drivers/media/v4l2-core/videobuf-dma-sg.c @@ -630,8 +630,8 @@ static int __videobuf_mmap_mapper(struct videobuf_queue *q, map->count = 1; map->q = q; vma->vm_ops = &videobuf_vm_ops; - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; - vma->vm_flags &= ~VM_IO; /* using shared anonymous pages */ + /* using shared anonymous pages */ + vm_flags_mod(vma, VM_DONTEXPAND | VM_DONTDUMP, VM_IO); vma->vm_private_data = map; dprintk(1, "mmap %p: q=%p %08lx-%08lx pgoff %08lx bufs %d-%d\n", map, q, vma->vm_start, vma->vm_end, vma->vm_pgoff, first, last); diff --git a/drivers/media/v4l2-core/videobuf-vmalloc.c b/drivers/media/v4l2-core/videobuf-vmalloc.c index 9b2443720ab01..85c7090606d60 100644 --- a/drivers/media/v4l2-core/videobuf-vmalloc.c +++ b/drivers/media/v4l2-core/videobuf-vmalloc.c @@ -247,7 +247,7 @@ static int __videobuf_mmap_mapper(struct videobuf_queue *q, } vma->vm_ops = &videobuf_vm_ops; - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); vma->vm_private_data = map; dprintk(1, "mmap %p: q=%p %08lx-%08lx (%lx) pgoff %08lx buf %d\n", diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c index acaa44809c58a..76b5ea66dfa1f 100644 --- a/drivers/misc/cxl/context.c +++ b/drivers/misc/cxl/context.c @@ -220,7 +220,7 @@ int cxl_context_iomap(struct cxl_context *ctx, struct vm_area_struct *vma) pr_devel("%s: mmio physical: %llx pe: %i master:%i\n", __func__, ctx->psn_phys, ctx->pe , ctx->master); - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &cxl_mmap_vmops; return 0; diff --git a/drivers/misc/habanalabs/common/memory.c b/drivers/misc/habanalabs/common/memory.c index 5e9ae7600d75e..6bb44a3ad5e66 100644 --- a/drivers/misc/habanalabs/common/memory.c +++ b/drivers/misc/habanalabs/common/memory.c @@ -2082,7 +2082,7 @@ static int hl_ts_mmap(struct hl_mmap_mem_buf *buf, struct vm_area_struct *vma, v { struct hl_ts_buff *ts_buff = buf->private; - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP | VM_DONTCOPY | VM_NORESERVE; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP | VM_DONTCOPY | VM_NORESERVE); return remap_vmalloc_range(vma, ts_buff->user_buff_address, 0); } diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 9f5e208701bad..3b0afdc50ff96 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -4236,8 +4236,8 @@ static int gaudi_mmap(struct hl_device *hdev, struct vm_area_struct *vma, { int rc; - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | - VM_DONTCOPY | VM_NORESERVE; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | + VM_DONTCOPY | VM_NORESERVE); rc = dma_mmap_coherent(hdev->dev, vma, cpu_addr, (dma_addr - HOST_PHYS_BASE), size); diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c index e793fb2bdcbe6..65502ec02bc04 100644 --- a/drivers/misc/habanalabs/gaudi2/gaudi2.c +++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c @@ -5538,8 +5538,8 @@ static int gaudi2_mmap(struct hl_device *hdev, struct vm_area_struct *vma, { int rc; - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | - VM_DONTCOPY | VM_NORESERVE; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | + VM_DONTCOPY | VM_NORESERVE); #ifdef _HAS_DMA_MMAP_COHERENT @@ -10116,8 +10116,8 @@ static int gaudi2_block_mmap(struct hl_device *hdev, struct vm_area_struct *vma, address = pci_resource_start(hdev->pdev, SRAM_CFG_BAR_ID) + offset_in_bar; - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | - VM_DONTCOPY | VM_NORESERVE; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | + VM_DONTCOPY | VM_NORESERVE); rc = remap_pfn_range(vma, vma->vm_start, address >> PAGE_SHIFT, block_size, vma->vm_page_prot); diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index 0f083fcf81a6b..2a15a305d01b3 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -2880,8 +2880,8 @@ static int goya_mmap(struct hl_device *hdev, struct vm_area_struct *vma, { int rc; - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | - VM_DONTCOPY | VM_NORESERVE; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | + VM_DONTCOPY | VM_NORESERVE); rc = dma_mmap_coherent(hdev->dev, vma, cpu_addr, (dma_addr - HOST_PHYS_BASE), size); diff --git a/drivers/misc/ocxl/context.c b/drivers/misc/ocxl/context.c index 9eb0d93b01c67..7f83116ae11a6 100644 --- a/drivers/misc/ocxl/context.c +++ b/drivers/misc/ocxl/context.c @@ -180,7 +180,7 @@ static int check_mmap_afu_irq(struct ocxl_context *ctx, if ((vma->vm_flags & VM_READ) || (vma->vm_flags & VM_EXEC) || !(vma->vm_flags & VM_WRITE)) return -EINVAL; - vma->vm_flags &= ~(VM_MAYREAD | VM_MAYEXEC); + vm_flags_clear(vma, VM_MAYREAD | VM_MAYEXEC); return 0; } @@ -204,7 +204,7 @@ int ocxl_context_mmap(struct ocxl_context *ctx, struct vm_area_struct *vma) if (rc) return rc; - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &ocxl_vmops; return 0; diff --git a/drivers/misc/ocxl/sysfs.c b/drivers/misc/ocxl/sysfs.c index 25c78df8055d3..405180d47d9bf 100644 --- a/drivers/misc/ocxl/sysfs.c +++ b/drivers/misc/ocxl/sysfs.c @@ -134,7 +134,7 @@ static int global_mmio_mmap(struct file *filp, struct kobject *kobj, (afu->config.global_mmio_size >> PAGE_SHIFT)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &global_mmio_vmops; vma->vm_private_data = afu; diff --git a/drivers/misc/open-dice.c b/drivers/misc/open-dice.c index 9dda47b3fd709..7be4e6c9f1209 100644 --- a/drivers/misc/open-dice.c +++ b/drivers/misc/open-dice.c @@ -95,12 +95,12 @@ static int open_dice_mmap(struct file *filp, struct vm_area_struct *vma) if (vma->vm_flags & VM_WRITE) return -EPERM; /* Ensure userspace cannot acquire VM_WRITE later. */ - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYSHARE); } /* Create write-combine mapping so all clients observe a wipe. */ vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); - vma->vm_flags |= VM_DONTCOPY | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTCOPY | VM_DONTDUMP); return vm_iomap_memory(vma, drvdata->rmem->base, drvdata->rmem->size); } diff --git a/drivers/misc/sgi-gru/grufile.c b/drivers/misc/sgi-gru/grufile.c index 7ffcfc0bb5872..a3d659c11cc4e 100644 --- a/drivers/misc/sgi-gru/grufile.c +++ b/drivers/misc/sgi-gru/grufile.c @@ -101,8 +101,8 @@ static int gru_file_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_end & (GRU_GSEG_PAGESIZE - 1)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_LOCKED | - VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_LOCKED | + VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_page_prot = PAGE_SHARED; vma->vm_ops = &gru_vm_ops; diff --git a/drivers/misc/uacce/uacce.c b/drivers/misc/uacce/uacce.c index 905eff1f840ed..b65ab440a19e8 100644 --- a/drivers/misc/uacce/uacce.c +++ b/drivers/misc/uacce/uacce.c @@ -229,7 +229,7 @@ static int uacce_fops_mmap(struct file *filep, struct vm_area_struct *vma) if (!qfr) return -ENOMEM; - vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_WIPEONFORK; + vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_WIPEONFORK); vma->vm_ops = &uacce_vm_ops; vma->vm_private_data = q; qfr->type = type; diff --git a/drivers/sbus/char/oradax.c b/drivers/sbus/char/oradax.c index 21b7cb6e7e705..e300cf26bc2a8 100644 --- a/drivers/sbus/char/oradax.c +++ b/drivers/sbus/char/oradax.c @@ -389,7 +389,7 @@ static int dax_devmap(struct file *f, struct vm_area_struct *vma) /* completion area is mapped read-only for user */ if (vma->vm_flags & VM_WRITE) return -EPERM; - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); if (remap_pfn_range(vma, vma->vm_start, ctx->ca_buf_ra >> PAGE_SHIFT, len, vma->vm_page_prot)) diff --git a/drivers/scsi/cxlflash/ocxl_hw.c b/drivers/scsi/cxlflash/ocxl_hw.c index 631eda2d467e2..6542818e595a6 100644 --- a/drivers/scsi/cxlflash/ocxl_hw.c +++ b/drivers/scsi/cxlflash/ocxl_hw.c @@ -1167,7 +1167,7 @@ static int afu_mmap(struct file *file, struct vm_area_struct *vma) (ctx->psn_size >> PAGE_SHIFT)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &ocxlflash_vmops; return 0; diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index ff9854f599649..a91049213203f 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1288,7 +1288,7 @@ sg_mmap(struct file *filp, struct vm_area_struct *vma) } sfp->mmap_called = 1; - vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_private_data = sfp; vma->vm_ops = &sg_mmap_vm_ops; out: diff --git a/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c b/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c index 5e53eed8ae951..095cd0ba8c216 100644 --- a/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c +++ b/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c @@ -1072,7 +1072,7 @@ int hmm_bo_mmap(struct vm_area_struct *vma, struct hmm_buffer_object *bo) vma->vm_private_data = bo; vma->vm_ops = &hmm_bo_vm_ops; - vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_DONTEXPAND | VM_DONTDUMP); /* * call hmm_bo_vm_open explicitly. diff --git a/drivers/staging/media/deprecated/meye/meye.c b/drivers/staging/media/deprecated/meye/meye.c index 5d87efd9b95c6..746c6ea1c0a7a 100644 --- a/drivers/staging/media/deprecated/meye/meye.c +++ b/drivers/staging/media/deprecated/meye/meye.c @@ -1476,8 +1476,8 @@ static int meye_mmap(struct file *file, struct vm_area_struct *vma) } vma->vm_ops = &meye_vm_ops; - vma->vm_flags &= ~VM_IO; /* not I/O memory */ - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + /* not I/O memory */ + vm_flags_mod(vma, VM_DONTEXPAND | VM_DONTDUMP, VM_IO); vma->vm_private_data = (void *) (offset / gbufsize); meye_vm_open(vma); diff --git a/drivers/staging/media/deprecated/stkwebcam/stk-webcam.c b/drivers/staging/media/deprecated/stkwebcam/stk-webcam.c index 787edb3d47c23..a1b7ad350a90b 100644 --- a/drivers/staging/media/deprecated/stkwebcam/stk-webcam.c +++ b/drivers/staging/media/deprecated/stkwebcam/stk-webcam.c @@ -779,7 +779,7 @@ static int v4l_stk_mmap(struct file *fp, struct vm_area_struct *vma) ret = remap_vmalloc_range(vma, sbuf->buffer, 0); if (ret) return ret; - vma->vm_flags |= VM_DONTEXPAND; + vm_flags_set(vma, VM_DONTEXPAND); vma->vm_private_data = sbuf; vma->vm_ops = &stk_v4l_vm_ops; sbuf->v4lbuf.flags |= V4L2_BUF_FLAG_MAPPED; diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index 2940559c30860..15ffc8d2ac7ba 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -1928,7 +1928,7 @@ static int tcmu_mmap(struct uio_info *info, struct vm_area_struct *vma) { struct tcmu_dev *udev = container_of(info, struct tcmu_dev, uio_info); - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &tcmu_vm_ops; vma->vm_private_data = udev; diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c index 43afbb7c5ab91..62082d64ece00 100644 --- a/drivers/uio/uio.c +++ b/drivers/uio/uio.c @@ -713,7 +713,7 @@ static const struct vm_operations_struct uio_logical_vm_ops = { static int uio_mmap_logical(struct vm_area_struct *vma) { - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &uio_logical_vm_ops; return 0; } diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index 837f3e57f5809..e501a03d6c700 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -279,8 +279,7 @@ static int usbdev_mmap(struct file *file, struct vm_area_struct *vma) } } - vma->vm_flags |= VM_IO; - vma->vm_flags |= (VM_DONTEXPAND | VM_DONTDUMP); + vm_flags_set(vma, VM_IO | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &usbdev_vm_ops; vma->vm_private_data = usbm; diff --git a/drivers/usb/mon/mon_bin.c b/drivers/usb/mon/mon_bin.c index 094e812e9e692..abb1cd35d8a67 100644 --- a/drivers/usb/mon/mon_bin.c +++ b/drivers/usb/mon/mon_bin.c @@ -1272,8 +1272,7 @@ static int mon_bin_mmap(struct file *filp, struct vm_area_struct *vma) if (vma->vm_flags & VM_WRITE) return -EPERM; - vma->vm_flags &= ~VM_MAYWRITE; - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_mod(vma, VM_DONTEXPAND | VM_DONTDUMP, VM_MAYWRITE); vma->vm_private_data = filp->private_data; mon_bin_vma_open(vma); return 0; diff --git a/drivers/vdpa/vdpa_user/iova_domain.c b/drivers/vdpa/vdpa_user/iova_domain.c index e682bc7ee6c99..5e4a77b9bae6b 100644 --- a/drivers/vdpa/vdpa_user/iova_domain.c +++ b/drivers/vdpa/vdpa_user/iova_domain.c @@ -512,7 +512,7 @@ static int vduse_domain_mmap(struct file *file, struct vm_area_struct *vma) { struct vduse_iova_domain *domain = file->private_data; - vma->vm_flags |= VM_DONTDUMP | VM_DONTEXPAND; + vm_flags_set(vma, VM_DONTDUMP | VM_DONTEXPAND); vma->vm_private_data = domain; vma->vm_ops = &vduse_domain_mmap_ops; diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 26a541cc64d11..c49f8f2b2865c 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1799,7 +1799,7 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma * See remap_pfn_range(), called from vfio_pci_fault() but we can't * change vm_flags within the fault handler. Set them now. */ - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &vfio_pci_mmap_ops; return 0; diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index ec32f785dfdec..9c5010ee20da9 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -1315,7 +1315,7 @@ static int vhost_vdpa_mmap(struct file *file, struct vm_area_struct *vma) if (vma->vm_end - vma->vm_start != notify.size) return -ENOTSUPP; - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &vhost_vdpa_vm_ops; return 0; } diff --git a/drivers/video/fbdev/68328fb.c b/drivers/video/fbdev/68328fb.c index 7db03ed77c762..41df61b37a18d 100644 --- a/drivers/video/fbdev/68328fb.c +++ b/drivers/video/fbdev/68328fb.c @@ -391,7 +391,7 @@ static int mc68x328fb_mmap(struct fb_info *info, struct vm_area_struct *vma) #ifndef MMU /* this is uClinux (no MMU) specific code */ - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); vma->vm_start = videomemory; return 0; diff --git a/drivers/video/fbdev/core/fb_defio.c b/drivers/video/fbdev/core/fb_defio.c index c730253ab85ce..dc310c7b5769e 100644 --- a/drivers/video/fbdev/core/fb_defio.c +++ b/drivers/video/fbdev/core/fb_defio.c @@ -232,9 +232,9 @@ static const struct address_space_operations fb_deferred_io_aops = { int fb_deferred_io_mmap(struct fb_info *info, struct vm_area_struct *vma) { vma->vm_ops = &fb_deferred_io_vm_ops; - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); if (!(info->flags & FBINFO_VIRTFB)) - vma->vm_flags |= VM_IO; + vm_flags_set(vma, VM_IO); vma->vm_private_data = info; return 0; } diff --git a/drivers/xen/gntalloc.c b/drivers/xen/gntalloc.c index a15729beb9d1b..26ffb8755ffb5 100644 --- a/drivers/xen/gntalloc.c +++ b/drivers/xen/gntalloc.c @@ -525,7 +525,7 @@ static int gntalloc_mmap(struct file *filp, struct vm_area_struct *vma) vma->vm_private_data = vm_priv; - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &gntalloc_vmops; diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 4d9a3050de6a3..61faea1f06630 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -1055,10 +1055,10 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) vma->vm_ops = &gntdev_vmops; - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP | VM_MIXEDMAP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP | VM_MIXEDMAP); if (use_ptemod) - vma->vm_flags |= VM_DONTCOPY; + vm_flags_set(vma, VM_DONTCOPY); vma->vm_private_data = map; if (map->flags) { diff --git a/drivers/xen/privcmd-buf.c b/drivers/xen/privcmd-buf.c index dd5bbb6e1b6b9..2fa10ca5be14c 100644 --- a/drivers/xen/privcmd-buf.c +++ b/drivers/xen/privcmd-buf.c @@ -156,7 +156,7 @@ static int privcmd_buf_mmap(struct file *file, struct vm_area_struct *vma) vma_priv->file_priv = file_priv; vma_priv->users = 1; - vma->vm_flags |= VM_IO | VM_DONTEXPAND; + vm_flags_set(vma, VM_IO | VM_DONTEXPAND); vma->vm_ops = &privcmd_buf_vm_ops; vma->vm_private_data = vma_priv; diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 1edf45ee9890d..e2f580e30a861 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -934,8 +934,8 @@ static int privcmd_mmap(struct file *file, struct vm_area_struct *vma) { /* DONTCOPY is essential for Xen because copy_page_range doesn't know * how to recreate these mappings */ - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTCOPY | - VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTCOPY | + VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &privcmd_vm_ops; vma->vm_private_data = NULL; diff --git a/fs/aio.c b/fs/aio.c index e85ba0b77f596..b0b17bd098bbc 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -394,7 +394,7 @@ static const struct vm_operations_struct aio_ring_vm_ops = { static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma) { - vma->vm_flags |= VM_DONTEXPAND; + vm_flags_set(vma, VM_DONTEXPAND); vma->vm_ops = &aio_ring_vm_ops; return 0; } diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 50e4e060db681..45a65c400bd0e 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -408,7 +408,7 @@ static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma) * unpopulated ptes via cramfs_read_folio(). */ int i; - vma->vm_flags |= VM_MIXEDMAP; + vm_flags_set(vma, VM_MIXEDMAP); for (i = 0; i < pages && !ret; i++) { vm_fault_t vmf; unsigned long off = i * PAGE_SIZE; diff --git a/fs/erofs/data.c b/fs/erofs/data.c index f57f921683d7f..f32d659875783 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -429,7 +429,7 @@ static int erofs_file_mmap(struct file *file, struct vm_area_struct *vma) return -EINVAL; vma->vm_ops = &erofs_dax_vm_ops; - vma->vm_flags |= VM_HUGEPAGE; + vm_flags_set(vma, VM_HUGEPAGE); return 0; } #else diff --git a/fs/exec.c b/fs/exec.c index c0df813d2b455..d2e2a15e5cfe6 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -270,7 +270,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm) BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); vma->vm_end = STACK_TOP_MAX; vma->vm_start = vma->vm_end - PAGE_SIZE; - vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; + vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); err = insert_vm_struct(mm, vma); @@ -834,7 +834,7 @@ int setup_arg_pages(struct linux_binprm *bprm, } /* mprotect_fixup is overkill to remove the temporary stack flags */ - vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP; + vm_flags_clear(vma, VM_STACK_INCOMPLETE_SETUP); stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */ stack_size = vma->vm_end - vma->vm_start; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 7ac0a81bd3713..6bdf61a62c796 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -801,7 +801,7 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) file_accessed(file); if (IS_DAX(file_inode(file))) { vma->vm_ops = &ext4_dax_vm_ops; - vma->vm_flags |= VM_HUGEPAGE; + vm_flags_set(vma, VM_HUGEPAGE); } else { vma->vm_ops = &ext4_file_vm_ops; } diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index e23e802a80130..8e74f278a3f69 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -860,7 +860,7 @@ int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma) { file_accessed(file); vma->vm_ops = &fuse_dax_vm_ops; - vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; + vm_flags_set(vma, VM_MIXEDMAP | VM_HUGEPAGE); return 0; } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 43af1753de5f2..cfd09f95551b8 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -132,7 +132,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) * way when do_mmap unwinds (may be important on powerpc * and ia64). */ - vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND; + vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND); vma->vm_ops = &hugetlb_vm_ops; ret = seal_check_future_write(info->seals, vma); @@ -811,7 +811,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, * as input to create an allocation policy. */ vma_init(&pseudo_vma, mm); - pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED); + vm_flags_init(&pseudo_vma, VM_HUGETLB | VM_MAYSHARE | VM_SHARED); pseudo_vma.vm_file = file; for (index = start; index < end; index++) { diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 167fa43b24f9a..a5e1ea8b71193 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -389,8 +389,7 @@ static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma) "orangefs_file_mmap: called on %pD\n", file); /* set the sequential readahead hint */ - vma->vm_flags |= VM_SEQ_READ; - vma->vm_flags &= ~VM_RAND_READ; + vm_flags_mod(vma, VM_SEQ_READ, VM_RAND_READ); file_accessed(file); vma->vm_ops = &orangefs_file_vm_ops; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index a944e1816364d..6a96e1713fd58 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1299,7 +1299,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, for_each_vma(vmi, vma) { if (!(vma->vm_flags & VM_SOFTDIRTY)) continue; - vma->vm_flags &= ~VM_SOFTDIRTY; + vm_flags_clear(vma, VM_SOFTDIRTY); vma_set_page_prot(vma); } diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 09a81e4b12738..12af614f33ce4 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -582,8 +582,7 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) if (vma->vm_flags & (VM_WRITE | VM_EXEC)) return -EPERM; - vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC); - vma->vm_flags |= VM_MIXEDMAP; + vm_flags_mod(vma, VM_MIXEDMAP, VM_MAYWRITE | VM_MAYEXEC); vma->vm_ops = &vmcore_mmap_ops; len = 0; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index f3c75c6222de6..44d1ee429eb0c 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -113,7 +113,7 @@ static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, { const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP; - vma->vm_flags = flags; + vm_flags_reset(vma, flags); /* * For shared mappings, we want to enable writenotify while * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 595a5bcf46b94..b0039a8fea2ef 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1429,7 +1429,7 @@ xfs_file_mmap( file_accessed(file); vma->vm_ops = &xfs_file_vm_ops; if (IS_DAX(inode)) - vma->vm_flags |= VM_HUGEPAGE; + vm_flags_set(vma, VM_HUGEPAGE); return 0; } diff --git a/include/linux/mm.h b/include/linux/mm.h index fa952ca3b8ed0..1f6345191cc2f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3659,7 +3659,7 @@ static inline int seal_check_future_write(int seals, struct vm_area_struct *vma) * VM_MAYWRITE as we still want them to be COW-writable. */ if (vma->vm_flags & VM_SHARED) - vma->vm_flags &= ~(VM_MAYWRITE); + vm_flags_clear(vma, VM_MAYWRITE); } return 0; diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 80f4b4d88aafa..8732e0aadf366 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -269,7 +269,7 @@ static int ringbuf_map_mmap_kern(struct bpf_map *map, struct vm_area_struct *vma if (vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != PAGE_SIZE) return -EPERM; } else { - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); } /* remap_vmalloc_range() checks size and offset constraints */ return remap_vmalloc_range(vma, rb_map->rb, @@ -290,7 +290,7 @@ static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma */ return -EPERM; } else { - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); } /* remap_vmalloc_range() checks size and offset constraints */ return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 64131f88c5537..9f56b442daa95 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -882,10 +882,10 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) /* set default open/close callbacks */ vma->vm_ops = &bpf_map_default_vmops; vma->vm_private_data = map; - vma->vm_flags &= ~VM_MAYEXEC; + vm_flags_clear(vma, VM_MAYEXEC); if (!(vma->vm_flags & VM_WRITE)) /* disallow re-mapping with PROT_WRITE */ - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); err = map->ops->map_mmap(map, vma); if (err) diff --git a/kernel/events/core.c b/kernel/events/core.c index d56328e5080e9..55a82f12a42c2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6573,7 +6573,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) * Since pinned accounting is per vm we cannot allow fork() to copy our * vma. */ - vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &perf_mmap_vmops; if (event->pmu->event_mapped) diff --git a/kernel/kcov.c b/kernel/kcov.c index e5cd09fd8a050..84c717337df08 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -489,7 +489,7 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) goto exit; } spin_unlock_irqrestore(&kcov->lock, flags); - vma->vm_flags |= VM_DONTEXPAND; + vm_flags_set(vma, VM_DONTEXPAND); for (off = 0; off < size; off += PAGE_SIZE) { page = vmalloc_to_page(kcov->area + off); res = vm_insert_page(vma, vma->vm_start + off, page); diff --git a/kernel/relay.c b/kernel/relay.c index ef12532168d9c..9aa70ae53d24d 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -91,7 +91,7 @@ static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma) return -EINVAL; vma->vm_ops = &relay_file_mmap_ops; - vma->vm_flags |= VM_DONTEXPAND; + vm_flags_set(vma, VM_DONTEXPAND); vma->vm_private_data = buf; return 0; diff --git a/mm/madvise.c b/mm/madvise.c index ca672e37b38c1..5a5a687d03c24 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -176,7 +176,7 @@ static int madvise_update_vma(struct vm_area_struct *vma, /* * vm_flags is protected by the mmap_lock held in write mode. */ - vma->vm_flags = new_flags; + vm_flags_reset(vma, new_flags); if (!vma->vm_file || vma_is_anon_shmem(vma)) { error = replace_anon_vma_name(vma, anon_name); if (error) diff --git a/mm/memory.c b/mm/memory.c index ec833a2e06017..a6316cda0e875 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1928,7 +1928,7 @@ int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, if (!(vma->vm_flags & VM_MIXEDMAP)) { BUG_ON(mmap_read_trylock(vma->vm_mm)); BUG_ON(vma->vm_flags & VM_PFNMAP); - vma->vm_flags |= VM_MIXEDMAP; + vm_flags_set(vma, VM_MIXEDMAP); } /* Defer page refcount checking till we're about to map that page. */ return insert_pages(vma, addr, pages, num, vma->vm_page_prot); @@ -1986,7 +1986,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, if (!(vma->vm_flags & VM_MIXEDMAP)) { BUG_ON(mmap_read_trylock(vma->vm_mm)); BUG_ON(vma->vm_flags & VM_PFNMAP); - vma->vm_flags |= VM_MIXEDMAP; + vm_flags_set(vma, VM_MIXEDMAP); } return insert_page(vma, addr, page, vma->vm_page_prot); } @@ -2452,7 +2452,7 @@ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, vma->vm_pgoff = pfn; } - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; diff --git a/mm/mlock.c b/mm/mlock.c index 5c4fff93cd6bc..ed49459e343e5 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -380,7 +380,7 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma, */ if (newflags & VM_LOCKED) newflags |= VM_IO; - WRITE_ONCE(vma->vm_flags, newflags); + vm_flags_reset(vma, newflags); lru_add_drain(); walk_page_range(vma->vm_mm, start, end, &mlock_walk_ops, NULL); @@ -388,7 +388,7 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma, if (newflags & VM_IO) { newflags &= ~VM_IO; - WRITE_ONCE(vma->vm_flags, newflags); + vm_flags_reset(vma, newflags); } } @@ -457,7 +457,7 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, if ((newflags & VM_LOCKED) && (oldflags & VM_LOCKED)) { /* No work to do, and mlocking twice would be wrong */ - vma->vm_flags = newflags; + vm_flags_reset(vma, newflags); } else { mlock_vma_pages_range(vma, start, end, newflags); } diff --git a/mm/mmap.c b/mm/mmap.c index 03d7c37c59691..33c638c7ec046 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2555,7 +2555,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, vma_iter_set(&vmi, addr); vma->vm_start = addr; vma->vm_end = end; - vma->vm_flags = vm_flags; + vm_flags_init(vma, vm_flags); vma->vm_page_prot = vm_get_page_prot(vm_flags); vma->vm_pgoff = pgoff; @@ -2683,7 +2683,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, * then new mapped in-place (which must be aimed as * a completely new data area). */ - vma->vm_flags |= VM_SOFTDIRTY; + vm_flags_set(vma, VM_SOFTDIRTY); vma_set_page_prot(vma); @@ -2909,7 +2909,7 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, init_vma_prep(&vp, vma); vma_prepare(&vp); vma->vm_end = addr + len; - vma->vm_flags |= VM_SOFTDIRTY; + vm_flags_set(vma, VM_SOFTDIRTY); vma_iter_store(vmi, vma); vma_complete(&vp, vmi, mm); @@ -2926,7 +2926,7 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, vma->vm_start = addr; vma->vm_end = addr + len; vma->vm_pgoff = addr >> PAGE_SHIFT; - vma->vm_flags = flags; + vm_flags_init(vma, flags); vma->vm_page_prot = vm_get_page_prot(flags); if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL)) goto mas_store_fail; @@ -2938,7 +2938,7 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, mm->data_vm += len >> PAGE_SHIFT; if (flags & VM_LOCKED) mm->locked_vm += (len >> PAGE_SHIFT); - vma->vm_flags |= VM_SOFTDIRTY; + vm_flags_set(vma, VM_SOFTDIRTY); validate_mm(mm); return 0; diff --git a/mm/mprotect.c b/mm/mprotect.c index cce6a0e58fb50..1d4843c97c2a1 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -670,7 +670,7 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, * vm_flags and vm_page_prot are protected by the mmap_lock * held in write mode. */ - vma->vm_flags = newflags; + vm_flags_reset(vma, newflags); if (vma_wants_manual_pte_write_upgrade(vma)) mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE; vma_set_page_prot(vma); diff --git a/mm/mremap.c b/mm/mremap.c index d70d8063c6e20..411a85682b580 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -662,7 +662,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, /* Conceal VM_ACCOUNT so old reservation is not undone */ if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) { - vma->vm_flags &= ~VM_ACCOUNT; + vm_flags_clear(vma, VM_ACCOUNT); if (vma->vm_start < old_addr) account_start = vma->vm_start; if (vma->vm_end > old_addr + old_len) @@ -719,12 +719,12 @@ static unsigned long move_vma(struct vm_area_struct *vma, /* Restore VM_ACCOUNT if one or two pieces of vma left */ if (account_start) { vma = vma_prev(&vmi); - vma->vm_flags |= VM_ACCOUNT; + vm_flags_set(vma, VM_ACCOUNT); } if (account_end) { vma = vma_next(&vmi); - vma->vm_flags |= VM_ACCOUNT; + vm_flags_set(vma, VM_ACCOUNT); } return new_addr; diff --git a/mm/nommu.c b/mm/nommu.c index 9a166738909ea..57ba243c6a37f 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -173,7 +173,7 @@ static void *__vmalloc_user_flags(unsigned long size, gfp_t flags) mmap_write_lock(current->mm); vma = find_vma(current->mm, (unsigned long)ret); if (vma) - vma->vm_flags |= VM_USERMAP; + vm_flags_set(vma, VM_USERMAP); mmap_write_unlock(current->mm); } @@ -950,7 +950,8 @@ static int do_mmap_private(struct vm_area_struct *vma, atomic_long_add(total, &mmap_pages_allocated); - region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY; + vm_flags_set(vma, VM_MAPPED_COPY); + region->vm_flags = vma->vm_flags; region->vm_start = (unsigned long) base; region->vm_end = region->vm_start + len; region->vm_top = region->vm_start + (total << PAGE_SHIFT); @@ -1047,7 +1048,7 @@ unsigned long do_mmap(struct file *file, region->vm_flags = vm_flags; region->vm_pgoff = pgoff; - vma->vm_flags = vm_flags; + vm_flags_init(vma, vm_flags); vma->vm_pgoff = pgoff; if (file) { @@ -1111,7 +1112,7 @@ unsigned long do_mmap(struct file *file, vma->vm_end = start + len; if (pregion->vm_flags & VM_MAPPED_COPY) - vma->vm_flags |= VM_MAPPED_COPY; + vm_flags_set(vma, VM_MAPPED_COPY); else { ret = do_mmap_shared_file(vma); if (ret < 0) { @@ -1601,7 +1602,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, if (addr != (pfn << PAGE_SHIFT)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); return 0; } EXPORT_SYMBOL(remap_pfn_range); diff --git a/mm/secretmem.c b/mm/secretmem.c index be3fff86ba00a..8453ada8f41d4 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -128,7 +128,7 @@ static int secretmem_mmap(struct file *file, struct vm_area_struct *vma) if (mlock_future_check(vma->vm_mm, vma->vm_flags | VM_LOCKED, len)) return -EAGAIN; - vma->vm_flags |= VM_LOCKED | VM_DONTDUMP; + vm_flags_set(vma, VM_LOCKED | VM_DONTDUMP); vma->vm_ops = &secretmem_vm_ops; return 0; diff --git a/mm/shmem.c b/mm/shmem.c index 9e1015cbad29f..732969afabd11 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2304,7 +2304,7 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) return ret; /* arm64 - allow memory tagging on RAM-based files */ - vma->vm_flags |= VM_MTE_ALLOWED; + vm_flags_set(vma, VM_MTE_ALLOWED); file_accessed(file); /* This is anonymous shared memory if it is unlinked at the time of mmap */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 9b71ec3213cbb..ff4d7dfdf84ad 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3596,7 +3596,7 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, size -= PAGE_SIZE; } while (size > 0); - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); return 0; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f713c0422f0fe..7db45cdc3e1a1 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1890,10 +1890,10 @@ int tcp_mmap(struct file *file, struct socket *sock, { if (vma->vm_flags & (VM_WRITE | VM_EXEC)) return -EPERM; - vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC); + vm_flags_clear(vma, VM_MAYWRITE | VM_MAYEXEC); /* Instruct vm_insert_page() to not mmap_read_lock(mm) */ - vma->vm_flags |= VM_MIXEDMAP; + vm_flags_set(vma, VM_MIXEDMAP); vma->vm_ops = &tcp_vm_ops; return 0; diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 0a6894cdc54d9..18498979a6409 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -262,7 +262,7 @@ static int sel_mmap_handle_status(struct file *filp, if (vma->vm_flags & VM_WRITE) return -EPERM; /* disallow mprotect() turns it into writable */ - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); return remap_pfn_range(vma, vma->vm_start, page_to_pfn(status), @@ -506,13 +506,13 @@ static int sel_mmap_policy(struct file *filp, struct vm_area_struct *vma) { if (vma->vm_flags & VM_SHARED) { /* do not allow mprotect to make mapping writable */ - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); if (vma->vm_flags & VM_WRITE) return -EACCES; } - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &sel_mmap_policy_ops; return 0; diff --git a/sound/core/oss/pcm_oss.c b/sound/core/oss/pcm_oss.c index ac2efeb63a396..728c211142d14 100644 --- a/sound/core/oss/pcm_oss.c +++ b/sound/core/oss/pcm_oss.c @@ -2910,7 +2910,7 @@ static int snd_pcm_oss_mmap(struct file *file, struct vm_area_struct *area) } /* set VM_READ access as well to fix memset() routines that do reads before writes (to improve performance) */ - area->vm_flags |= VM_READ; + vm_flags_set(area, VM_READ); if (substream == NULL) return -ENXIO; runtime = substream->runtime; diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index 9c122e757efe8..331380c2438bc 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -3675,8 +3675,9 @@ static int snd_pcm_mmap_status(struct snd_pcm_substream *substream, struct file return -EINVAL; area->vm_ops = &snd_pcm_vm_ops_status; area->vm_private_data = substream; - area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; - area->vm_flags &= ~(VM_WRITE | VM_MAYWRITE); + vm_flags_mod(area, VM_DONTEXPAND | VM_DONTDUMP, + VM_WRITE | VM_MAYWRITE); + return 0; } @@ -3712,7 +3713,7 @@ static int snd_pcm_mmap_control(struct snd_pcm_substream *substream, struct file return -EINVAL; area->vm_ops = &snd_pcm_vm_ops_control; area->vm_private_data = substream; - area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(area, VM_DONTEXPAND | VM_DONTDUMP); return 0; } @@ -3828,7 +3829,7 @@ static const struct vm_operations_struct snd_pcm_vm_ops_data_fault = { int snd_pcm_lib_default_mmap(struct snd_pcm_substream *substream, struct vm_area_struct *area) { - area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(area, VM_DONTEXPAND | VM_DONTDUMP); if (!substream->ops->page && !snd_dma_buffer_mmap(snd_pcm_get_dma_buf(substream), area)) return 0; diff --git a/sound/soc/pxa/mmp-sspa.c b/sound/soc/pxa/mmp-sspa.c index fb5a4390443fe..b3c1744eff91a 100644 --- a/sound/soc/pxa/mmp-sspa.c +++ b/sound/soc/pxa/mmp-sspa.c @@ -404,7 +404,7 @@ static int mmp_pcm_mmap(struct snd_soc_component *component, struct snd_pcm_substream *substream, struct vm_area_struct *vma) { - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); return remap_pfn_range(vma, vma->vm_start, substream->dma_buffer.addr >> PAGE_SHIFT, diff --git a/sound/usb/usx2y/us122l.c b/sound/usb/usx2y/us122l.c index e558931cce16e..709ccad972e2f 100644 --- a/sound/usb/usx2y/us122l.c +++ b/sound/usb/usx2y/us122l.c @@ -224,9 +224,9 @@ static int usb_stream_hwdep_mmap(struct snd_hwdep *hw, } area->vm_ops = &usb_stream_hwdep_vm_ops; - area->vm_flags |= VM_DONTDUMP; + vm_flags_set(area, VM_DONTDUMP); if (!read) - area->vm_flags |= VM_DONTEXPAND; + vm_flags_set(area, VM_DONTEXPAND); area->vm_private_data = us122l; atomic_inc(&us122l->mmap_count); out: diff --git a/sound/usb/usx2y/usX2Yhwdep.c b/sound/usb/usx2y/usX2Yhwdep.c index c29da0341bc5b..4937ede0b5d70 100644 --- a/sound/usb/usx2y/usX2Yhwdep.c +++ b/sound/usb/usx2y/usX2Yhwdep.c @@ -61,7 +61,7 @@ static int snd_us428ctls_mmap(struct snd_hwdep *hw, struct file *filp, struct vm } area->vm_ops = &us428ctls_vm_ops; - area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(area, VM_DONTEXPAND | VM_DONTDUMP); area->vm_private_data = hw->private_data; return 0; } diff --git a/sound/usb/usx2y/usx2yhwdeppcm.c b/sound/usb/usx2y/usx2yhwdeppcm.c index 767a227d54da4..36f2e31168fb0 100644 --- a/sound/usb/usx2y/usx2yhwdeppcm.c +++ b/sound/usb/usx2y/usx2yhwdeppcm.c @@ -706,7 +706,7 @@ static int snd_usx2y_hwdep_pcm_mmap(struct snd_hwdep *hw, struct file *filp, str return -ENODEV; area->vm_ops = &snd_usx2y_hwdep_pcm_vm_ops; - area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(area, VM_DONTEXPAND | VM_DONTDUMP); area->vm_private_data = hw->private_data; return 0; } From 102a94d10b53b95a5c21d5657b60c9593e432a9f Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 31 Jan 2023 12:50:59 -0800 Subject: [PATCH 095/143] mm-replace-vma-vm_flags-direct-modifications-with-modifier-calls-fix fix drivers/misc/open-dice.c, per Hyeonggon Yoo Cc: Suren Baghdasaryan Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Andrew Morton --- drivers/misc/open-dice.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/misc/open-dice.c b/drivers/misc/open-dice.c index 7be4e6c9f1209..8aea2d070a40c 100644 --- a/drivers/misc/open-dice.c +++ b/drivers/misc/open-dice.c @@ -95,7 +95,7 @@ static int open_dice_mmap(struct file *filp, struct vm_area_struct *vma) if (vma->vm_flags & VM_WRITE) return -EPERM; /* Ensure userspace cannot acquire VM_WRITE later. */ - vm_flags_clear(vma, VM_MAYSHARE); + vm_flags_clear(vma, VM_MAYWRITE); } /* Create write-combine mapping so all clients observe a wipe. */ From 918e95b37d4e3ca375319d75aa2f0790e03304dc Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 26 Jan 2023 11:37:50 -0800 Subject: [PATCH 096/143] mm: replace vma->vm_flags indirect modification in ksm_madvise Replace indirect modifications to vma->vm_flags with calls to modifier functions to be able to track flag changes and to keep vma locking correctness. Link: https://lkml.kernel.org/r/20230126193752.297968-6-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Michal Hocko Acked-by: Mel Gorman Acked-by: Mike Rapoport (IBM) Acked-by: Michael Ellerman [powerpc] Cc: Andy Lutomirski Cc: Arjun Roy Cc: Axel Rasmussen Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: David Rientjes Cc: Eric Dumazet Cc: Greg Thelen Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jann Horn Cc: Joel Fernandes Cc: Johannes Weiner Cc: Kent Overstreet Cc: Laurent Dufour Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Minchan Kim Cc: Paul E. McKenney Cc: Peter Oskolkov Cc: Peter Xu Cc: Peter Zijlstra Cc: Punit Agrawal Cc: Sebastian Andrzej Siewior Cc: Sebastian Reichel Cc: Shakeel Butt Cc: Soheil Hassas Yeganeh Cc: Song Liu Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/powerpc/kvm/book3s_hv_uvmem.c | 6 +++++- arch/s390/mm/gmap.c | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 1d67baa5557a2..709ebd578394b 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -393,6 +393,7 @@ static int kvmppc_memslot_page_merge(struct kvm *kvm, { unsigned long gfn = memslot->base_gfn; unsigned long end, start = gfn_to_hva(kvm, gfn); + unsigned long vm_flags; int ret = 0; struct vm_area_struct *vma; int merge_flag = (merge) ? MADV_MERGEABLE : MADV_UNMERGEABLE; @@ -409,12 +410,15 @@ static int kvmppc_memslot_page_merge(struct kvm *kvm, ret = H_STATE; break; } + /* Copy vm_flags to avoid partial modifications in ksm_madvise */ + vm_flags = vma->vm_flags; ret = ksm_madvise(vma, vma->vm_start, vma->vm_end, - merge_flag, &vma->vm_flags); + merge_flag, &vm_flags); if (ret) { ret = H_STATE; break; } + vm_flags_reset(vma, vm_flags); start = vma->vm_end; } while (end > vma->vm_end); diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index ab836597419dc..5a716bdcba05b 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -2587,14 +2587,18 @@ int gmap_mark_unmergeable(void) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; + unsigned long vm_flags; int ret; VMA_ITERATOR(vmi, mm, 0); for_each_vma(vmi, vma) { + /* Copy vm_flags to avoid partial modifications in ksm_madvise */ + vm_flags = vma->vm_flags; ret = ksm_madvise(vma, vma->vm_start, vma->vm_end, - MADV_UNMERGEABLE, &vma->vm_flags); + MADV_UNMERGEABLE, &vm_flags); if (ret) return ret; + vm_flags_reset(vma, vm_flags); } mm->def_flags &= ~VM_MERGEABLE; return 0; From e4bef15134f76febb51a575cc7095c2df7fb93c9 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 26 Jan 2023 11:37:51 -0800 Subject: [PATCH 097/143] mm: introduce __vm_flags_mod and use it in untrack_pfn There are scenarios when vm_flags can be modified without exclusive mmap_lock, such as: - after VMA was isolated and mmap_lock was downgraded or dropped - in exit_mmap when there are no other mm users and locking is unnecessary Introduce __vm_flags_mod to avoid assertions when the caller takes responsibility for the required locking. Pass a hint to untrack_pfn to conditionally use __vm_flags_mod for flags modification to avoid assertion. Link: https://lkml.kernel.org/r/20230126193752.297968-7-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Michal Hocko Acked-by: Mike Rapoport (IBM) Cc: Andy Lutomirski Cc: Arjun Roy Cc: Axel Rasmussen Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: David Rientjes Cc: Eric Dumazet Cc: Greg Thelen Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jann Horn Cc: Joel Fernandes Cc: Johannes Weiner Cc: Kent Overstreet Cc: Laurent Dufour Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mel Gorman Cc: Minchan Kim Cc: Paul E. McKenney Cc: Peter Oskolkov Cc: Peter Xu Cc: Peter Zijlstra Cc: Punit Agrawal Cc: Sebastian Andrzej Siewior Cc: Sebastian Reichel Cc: Shakeel Butt Cc: Soheil Hassas Yeganeh Cc: Song Liu Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/x86/mm/pat/memtype.c | 10 +++++++--- include/linux/mm.h | 14 ++++++++++++-- include/linux/pgtable.h | 5 +++-- mm/memory.c | 13 +++++++------ mm/memremap.c | 4 ++-- mm/mmap.c | 16 ++++++++++------ 6 files changed, 41 insertions(+), 21 deletions(-) diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index 6ca51b1aa5d98..691bf8934b6fe 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -1046,7 +1046,7 @@ void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn) * can be for the entire vma (in which case pfn, size are zero). */ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, - unsigned long size) + unsigned long size, bool mm_wr_locked) { resource_size_t paddr; unsigned long prot; @@ -1065,8 +1065,12 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, size = vma->vm_end - vma->vm_start; } free_pfn_range(paddr, size); - if (vma) - vm_flags_clear(vma, VM_PAT); + if (vma) { + if (mm_wr_locked) + vm_flags_clear(vma, VM_PAT); + else + __vm_flags_mod(vma, 0, VM_PAT); + } } /* diff --git a/include/linux/mm.h b/include/linux/mm.h index 1f6345191cc2f..cab10a084e03b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -662,6 +662,16 @@ static inline void vm_flags_clear(struct vm_area_struct *vma, ACCESS_PRIVATE(vma, __vm_flags) &= ~flags; } +/* + * Use only if VMA is not part of the VMA tree or has no other users and + * therefore needs no locking. + */ +static inline void __vm_flags_mod(struct vm_area_struct *vma, + vm_flags_t set, vm_flags_t clear) +{ + vm_flags_init(vma, (vma->vm_flags | set) & ~clear); +} + /* * Use only when the order of set/clear operations is unimportant, otherwise * use vm_flags_{set|clear} explicitly. @@ -670,7 +680,7 @@ static inline void vm_flags_mod(struct vm_area_struct *vma, vm_flags_t set, vm_flags_t clear) { mmap_assert_write_locked(vma->vm_mm); - vm_flags_init(vma, (vma->vm_flags | set) & ~clear); + __vm_flags_mod(vma, set, clear); } static inline void vma_set_anonymous(struct vm_area_struct *vma) @@ -2091,7 +2101,7 @@ static inline void zap_vma_pages(struct vm_area_struct *vma) } void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, struct vm_area_struct *start_vma, unsigned long start, - unsigned long end); + unsigned long end, bool mm_wr_locked); struct mmu_notifier_range; diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 5fd45454c073f..c63cd44777ec1 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1185,7 +1185,8 @@ static inline int track_pfn_copy(struct vm_area_struct *vma) * can be for the entire vma (in which case pfn, size are zero). */ static inline void untrack_pfn(struct vm_area_struct *vma, - unsigned long pfn, unsigned long size) + unsigned long pfn, unsigned long size, + bool mm_wr_locked) { } @@ -1203,7 +1204,7 @@ extern void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn); extern int track_pfn_copy(struct vm_area_struct *vma); extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, - unsigned long size); + unsigned long size, bool mm_wr_locked); extern void untrack_pfn_moved(struct vm_area_struct *vma); #endif diff --git a/mm/memory.c b/mm/memory.c index a6316cda0e875..7a04a1130ec15 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1613,7 +1613,7 @@ void unmap_page_range(struct mmu_gather *tlb, static void unmap_single_vma(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, - struct zap_details *details) + struct zap_details *details, bool mm_wr_locked) { unsigned long start = max(vma->vm_start, start_addr); unsigned long end; @@ -1628,7 +1628,7 @@ static void unmap_single_vma(struct mmu_gather *tlb, uprobe_munmap(vma, start, end); if (unlikely(vma->vm_flags & VM_PFNMAP)) - untrack_pfn(vma, 0, 0); + untrack_pfn(vma, 0, 0, mm_wr_locked); if (start != end) { if (unlikely(is_vm_hugetlb_page(vma))) { @@ -1675,7 +1675,7 @@ static void unmap_single_vma(struct mmu_gather *tlb, */ void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, struct vm_area_struct *vma, unsigned long start_addr, - unsigned long end_addr) + unsigned long end_addr, bool mm_wr_locked) { struct mmu_notifier_range range; struct zap_details details = { @@ -1689,7 +1689,8 @@ void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, start_addr, end_addr); mmu_notifier_invalidate_range_start(&range); do { - unmap_single_vma(tlb, vma, start_addr, end_addr, &details); + unmap_single_vma(tlb, vma, start_addr, end_addr, &details, + mm_wr_locked); } while ((vma = mas_find(&mas, end_addr - 1)) != NULL); mmu_notifier_invalidate_range_end(&range); } @@ -1723,7 +1724,7 @@ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, * unmap 'address-end' not 'range.start-range.end' as range * could have been expanded for hugetlb pmd sharing. */ - unmap_single_vma(&tlb, vma, address, end, details); + unmap_single_vma(&tlb, vma, address, end, details, false); mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb); } @@ -2492,7 +2493,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, err = remap_pfn_range_notrack(vma, addr, pfn, size, prot); if (err) - untrack_pfn(vma, pfn, PAGE_ALIGN(size)); + untrack_pfn(vma, pfn, PAGE_ALIGN(size), true); return err; } EXPORT_SYMBOL(remap_pfn_range); diff --git a/mm/memremap.c b/mm/memremap.c index 08cbf54fe0370..2f88f43d4a017 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -129,7 +129,7 @@ static void pageunmap_range(struct dev_pagemap *pgmap, int range_id) } mem_hotplug_done(); - untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range)); + untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range), true); pgmap_array_delete(range); } @@ -276,7 +276,7 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params, if (!is_private) kasan_remove_zero_shadow(__va(range->start), range_len(range)); err_kasan: - untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range)); + untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range), true); err_pfn_remap: pgmap_array_delete(range); return error; diff --git a/mm/mmap.c b/mm/mmap.c index 33c638c7ec046..20f21f0949ddb 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -78,7 +78,7 @@ core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644); static void unmap_region(struct mm_struct *mm, struct maple_tree *mt, struct vm_area_struct *vma, struct vm_area_struct *prev, struct vm_area_struct *next, unsigned long start, - unsigned long end); + unsigned long end, bool mm_wr_locked); static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) { @@ -2133,14 +2133,14 @@ static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas) static void unmap_region(struct mm_struct *mm, struct maple_tree *mt, struct vm_area_struct *vma, struct vm_area_struct *prev, struct vm_area_struct *next, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, bool mm_wr_locked) { struct mmu_gather tlb; lru_add_drain(); tlb_gather_mmu(&tlb, mm); update_hiwater_rss(mm); - unmap_vmas(&tlb, mt, vma, start, end); + unmap_vmas(&tlb, mt, vma, start, end, mm_wr_locked); free_pgtables(&tlb, mt, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, next ? next->vm_start : USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb); @@ -2388,7 +2388,11 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, mmap_write_downgrade(mm); } - unmap_region(mm, &mt_detach, vma, prev, next, start, end); + /* + * We can free page tables without write-locking mmap_lock because VMAs + * were isolated before we downgraded mmap_lock. + */ + unmap_region(mm, &mt_detach, vma, prev, next, start, end, !downgrade); /* Statistics and freeing VMAs */ mas_set(&mas_detach, start); remove_mt(mm, &mas_detach); @@ -2701,7 +2705,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, /* Undo any partial mapping done by a device driver. */ unmap_region(mm, &mm->mm_mt, vma, prev, next, vma->vm_start, - vma->vm_end); + vma->vm_end, true); } if (file && (vm_flags & VM_SHARED)) mapping_unmap_writable(file->f_mapping); @@ -3029,7 +3033,7 @@ void exit_mmap(struct mm_struct *mm) tlb_gather_mmu_fullmm(&tlb, mm); /* update_hiwater_rss(mm) here? but nobody should be looking */ /* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */ - unmap_vmas(&tlb, &mm->mm_mt, vma, 0, ULONG_MAX); + unmap_vmas(&tlb, &mm->mm_mt, vma, 0, ULONG_MAX, false); mmap_read_unlock(mm); /* From 3518493d1444783db3195b9499415de73d680b07 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 26 Jan 2023 11:37:52 -0800 Subject: [PATCH 098/143] mm: export dump_mm() mmap_assert_write_locked() is used in vm_flags modifiers. Because mmap_assert_write_locked() uses dump_mm() and vm_flags are sometimes modified from inside a module, it's necessary to export dump_mm() function. Link: https://lkml.kernel.org/r/20230126193752.297968-8-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Michal Hocko Acked-by: Mike Rapoport (IBM) Cc: Andy Lutomirski Cc: Arjun Roy Cc: Axel Rasmussen Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: David Rientjes Cc: Eric Dumazet Cc: Greg Thelen Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jann Horn Cc: Joel Fernandes Cc: Johannes Weiner Cc: Kent Overstreet Cc: Laurent Dufour Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mel Gorman Cc: Minchan Kim Cc: Paul E. McKenney Cc: Peter Oskolkov Cc: Peter Xu Cc: Peter Zijlstra Cc: Punit Agrawal Cc: Sebastian Andrzej Siewior Cc: Sebastian Reichel Cc: Shakeel Butt Cc: Soheil Hassas Yeganeh Cc: Song Liu Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/debug.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/debug.c b/mm/debug.c index 9d3d893dc7f4e..96d594e162925 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -215,6 +215,7 @@ void dump_mm(const struct mm_struct *mm) mm->def_flags, &mm->def_flags ); } +EXPORT_SYMBOL(dump_mm); static bool page_init_poisoning __read_mostly = true; From d027af6580f8fe35641071da9dd747a1d3319d19 Mon Sep 17 00:00:00 2001 From: Kuan-Ying Lee Date: Sun, 29 Jan 2023 10:14:35 +0800 Subject: [PATCH 099/143] kasan: infer allocation size by scanning metadata Make KASAN scan metadata to infer the requested allocation size instead of printing cache->object_size. This patch fixes confusing slab-out-of-bounds reports as reported in: https://bugzilla.kernel.org/show_bug.cgi?id=216457 As an example of the confusing behavior, the report below hints that the allocation size was 192, while the kernel actually called kmalloc(184): ================================================================== BUG: KASAN: slab-out-of-bounds in _find_next_bit+0x143/0x160 lib/find_bit.c:109 Read of size 8 at addr ffff8880175766b8 by task kworker/1:1/26 ... The buggy address belongs to the object at ffff888017576600 which belongs to the cache kmalloc-192 of size 192 The buggy address is located 184 bytes inside of 192-byte region [ffff888017576600, ffff8880175766c0) ... Memory state around the buggy address: ffff888017576580: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc ffff888017576600: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 >ffff888017576680: 00 00 00 00 00 00 00 fc fc fc fc fc fc fc fc fc ^ ffff888017576700: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff888017576780: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ================================================================== With this patch, the report shows: ================================================================== ... The buggy address belongs to the object at ffff888017576600 which belongs to the cache kmalloc-192 of size 192 The buggy address is located 0 bytes to the right of allocated 184-byte region [ffff888017576600, ffff8880175766b8) ... ================================================================== Also report slab use-after-free bugs as "slab-use-after-free" and print "freed" instead of "allocated" in the report when describing the accessed memory region. Also improve the metadata-related comment in kasan_find_first_bad_addr and use addr_has_metadata across KASAN code instead of open-coding KASAN_SHADOW_START checks. Link: https://bugzilla.kernel.org/show_bug.cgi?id=216457 Link: https://lkml.kernel.org/r/20230129021437.18812-1-Kuan-Ying.Lee@mediatek.com Signed-off-by: Kuan-Ying Lee Co-developed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Chinwen Chang Cc: Dmitry Vyukov Cc: Matthias Brugger Cc: Qun-Wei Lin Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/kasan/generic.c | 4 +--- mm/kasan/kasan.h | 2 ++ mm/kasan/report.c | 41 ++++++++++++++++++++++++++++----------- mm/kasan/report_generic.c | 32 +++++++++++++++++++++++++++++- mm/kasan/report_hw_tags.c | 35 ++++++++++++++++++++++++++++++++- mm/kasan/report_sw_tags.c | 26 +++++++++++++++++++++++++ mm/kasan/report_tags.c | 2 +- mm/kasan/sw_tags.c | 6 ++---- 8 files changed, 127 insertions(+), 21 deletions(-) diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index cb762982c8baf..e5eef670735ef 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -172,10 +172,8 @@ static __always_inline bool check_region_inline(unsigned long addr, if (unlikely(addr + size < addr)) return !kasan_report(addr, size, write, ret_ip); - if (unlikely((void *)addr < - kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { + if (unlikely(!addr_has_metadata((void *)addr))) return !kasan_report(addr, size, write, ret_ip); - } if (likely(!memory_is_poisoned(addr, size))) return true; diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 32413f22aa828..308fb70fd40a5 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -207,6 +207,7 @@ struct kasan_report_info { void *first_bad_addr; struct kmem_cache *cache; void *object; + size_t alloc_size; /* Filled in by the mode-specific reporting code. */ const char *bug_type; @@ -323,6 +324,7 @@ static inline bool addr_has_metadata(const void *addr) #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ void *kasan_find_first_bad_addr(void *addr, size_t size); +size_t kasan_get_alloc_size(void *object, struct kmem_cache *cache); void kasan_complete_mode_report_info(struct kasan_report_info *info); void kasan_metadata_fetch_row(char *buffer, void *row); diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 22598b20c7b75..e0492124e90a7 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -231,33 +231,46 @@ static inline struct page *addr_to_page(const void *addr) return NULL; } -static void describe_object_addr(const void *addr, struct kmem_cache *cache, - void *object) +static void describe_object_addr(const void *addr, struct kasan_report_info *info) { unsigned long access_addr = (unsigned long)addr; - unsigned long object_addr = (unsigned long)object; - const char *rel_type; + unsigned long object_addr = (unsigned long)info->object; + const char *rel_type, *region_state = ""; int rel_bytes; pr_err("The buggy address belongs to the object at %px\n" " which belongs to the cache %s of size %d\n", - object, cache->name, cache->object_size); + info->object, info->cache->name, info->cache->object_size); if (access_addr < object_addr) { rel_type = "to the left"; rel_bytes = object_addr - access_addr; - } else if (access_addr >= object_addr + cache->object_size) { + } else if (access_addr >= object_addr + info->alloc_size) { rel_type = "to the right"; - rel_bytes = access_addr - (object_addr + cache->object_size); + rel_bytes = access_addr - (object_addr + info->alloc_size); } else { rel_type = "inside"; rel_bytes = access_addr - object_addr; } + /* + * Tag-Based modes use the stack ring to infer the bug type, but the + * memory region state description is generated based on the metadata. + * Thus, defining the region state as below can contradict the metadata. + * Fixing this requires further improvements, so only infer the state + * for the Generic mode. + */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC)) { + if (strcmp(info->bug_type, "slab-out-of-bounds") == 0) + region_state = "allocated "; + else if (strcmp(info->bug_type, "slab-use-after-free") == 0) + region_state = "freed "; + } + pr_err("The buggy address is located %d bytes %s of\n" - " %d-byte region [%px, %px)\n", - rel_bytes, rel_type, cache->object_size, (void *)object_addr, - (void *)(object_addr + cache->object_size)); + " %s%lu-byte region [%px, %px)\n", + rel_bytes, rel_type, region_state, info->alloc_size, + (void *)object_addr, (void *)(object_addr + info->alloc_size)); } static void describe_object_stacks(struct kasan_report_info *info) @@ -279,7 +292,7 @@ static void describe_object(const void *addr, struct kasan_report_info *info) { if (kasan_stack_collection_enabled()) describe_object_stacks(info); - describe_object_addr(addr, info->cache, info->object); + describe_object_addr(addr, info); } static inline bool kernel_or_module_addr(const void *addr) @@ -436,6 +449,12 @@ static void complete_report_info(struct kasan_report_info *info) if (slab) { info->cache = slab->slab_cache; info->object = nearest_obj(info->cache, slab, addr); + + /* Try to determine allocation size based on the metadata. */ + info->alloc_size = kasan_get_alloc_size(info->object, info->cache); + /* Fallback to the object size if failed. */ + if (!info->alloc_size) + info->alloc_size = info->cache->object_size; } else info->cache = info->object = NULL; diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c index 043c94b046054..87d39bc0a6735 100644 --- a/mm/kasan/report_generic.c +++ b/mm/kasan/report_generic.c @@ -43,6 +43,34 @@ void *kasan_find_first_bad_addr(void *addr, size_t size) return p; } +size_t kasan_get_alloc_size(void *object, struct kmem_cache *cache) +{ + size_t size = 0; + u8 *shadow; + + /* + * Skip the addr_has_metadata check, as this function only operates on + * slab memory, which must have metadata. + */ + + /* + * The loop below returns 0 for freed objects, for which KASAN cannot + * calculate the allocation size based on the metadata. + */ + shadow = (u8 *)kasan_mem_to_shadow(object); + while (size < cache->object_size) { + if (*shadow == 0) + size += KASAN_GRANULE_SIZE; + else if (*shadow >= 1 && *shadow <= KASAN_GRANULE_SIZE - 1) + return size + *shadow; + else + return size; + shadow++; + } + + return cache->object_size; +} + static const char *get_shadow_bug_type(struct kasan_report_info *info) { const char *bug_type = "unknown-crash"; @@ -79,9 +107,11 @@ static const char *get_shadow_bug_type(struct kasan_report_info *info) bug_type = "stack-out-of-bounds"; break; case KASAN_PAGE_FREE: + bug_type = "use-after-free"; + break; case KASAN_SLAB_FREE: case KASAN_SLAB_FREETRACK: - bug_type = "use-after-free"; + bug_type = "slab-use-after-free"; break; case KASAN_ALLOCA_LEFT: case KASAN_ALLOCA_RIGHT: diff --git a/mm/kasan/report_hw_tags.c b/mm/kasan/report_hw_tags.c index f3d3be614e4b0..32e80f78de7d0 100644 --- a/mm/kasan/report_hw_tags.c +++ b/mm/kasan/report_hw_tags.c @@ -17,10 +17,43 @@ void *kasan_find_first_bad_addr(void *addr, size_t size) { - /* Return the same value regardless of whether addr_has_metadata(). */ + /* + * Hardware Tag-Based KASAN only calls this function for normal memory + * accesses, and thus addr points precisely to the first bad address + * with an invalid (and present) memory tag. Therefore: + * 1. Return the address as is without walking memory tags. + * 2. Skip the addr_has_metadata check. + */ return kasan_reset_tag(addr); } +size_t kasan_get_alloc_size(void *object, struct kmem_cache *cache) +{ + size_t size = 0; + int i = 0; + u8 memory_tag; + + /* + * Skip the addr_has_metadata check, as this function only operates on + * slab memory, which must have metadata. + */ + + /* + * The loop below returns 0 for freed objects, for which KASAN cannot + * calculate the allocation size based on the metadata. + */ + while (size < cache->object_size) { + memory_tag = hw_get_mem_tag(object + i * KASAN_GRANULE_SIZE); + if (memory_tag != KASAN_TAG_INVALID) + size += KASAN_GRANULE_SIZE; + else + return size; + i++; + } + + return cache->object_size; +} + void kasan_metadata_fetch_row(char *buffer, void *row) { int i; diff --git a/mm/kasan/report_sw_tags.c b/mm/kasan/report_sw_tags.c index 7a26397297edb..8b1f5a73ee6d3 100644 --- a/mm/kasan/report_sw_tags.c +++ b/mm/kasan/report_sw_tags.c @@ -45,6 +45,32 @@ void *kasan_find_first_bad_addr(void *addr, size_t size) return p; } +size_t kasan_get_alloc_size(void *object, struct kmem_cache *cache) +{ + size_t size = 0; + u8 *shadow; + + /* + * Skip the addr_has_metadata check, as this function only operates on + * slab memory, which must have metadata. + */ + + /* + * The loop below returns 0 for freed objects, for which KASAN cannot + * calculate the allocation size based on the metadata. + */ + shadow = (u8 *)kasan_mem_to_shadow(object); + while (size < cache->object_size) { + if (*shadow != KASAN_TAG_INVALID) + size += KASAN_GRANULE_SIZE; + else + return size; + shadow++; + } + + return cache->object_size; +} + void kasan_metadata_fetch_row(char *buffer, void *row) { memcpy(buffer, kasan_mem_to_shadow(row), META_BYTES_PER_ROW); diff --git a/mm/kasan/report_tags.c b/mm/kasan/report_tags.c index ecede06ef374a..8b8bfdb3cfdb5 100644 --- a/mm/kasan/report_tags.c +++ b/mm/kasan/report_tags.c @@ -89,7 +89,7 @@ void kasan_complete_mode_report_info(struct kasan_report_info *info) * a use-after-free. */ if (!info->bug_type) - info->bug_type = "use-after-free"; + info->bug_type = "slab-use-after-free"; } else { /* Second alloc of the same object. Give up. */ if (alloc_found) diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c index a3afaf2ad1b11..30da65fa02a1e 100644 --- a/mm/kasan/sw_tags.c +++ b/mm/kasan/sw_tags.c @@ -106,10 +106,8 @@ bool kasan_check_range(unsigned long addr, size_t size, bool write, return true; untagged_addr = kasan_reset_tag((const void *)addr); - if (unlikely(untagged_addr < - kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { + if (unlikely(!addr_has_metadata(untagged_addr))) return !kasan_report(addr, size, write, ret_ip); - } shadow_first = kasan_mem_to_shadow(untagged_addr); shadow_last = kasan_mem_to_shadow(untagged_addr + size - 1); for (shadow = shadow_first; shadow <= shadow_last; shadow++) { @@ -127,7 +125,7 @@ bool kasan_byte_accessible(const void *addr) void *untagged_addr = kasan_reset_tag(addr); u8 shadow_byte; - if (untagged_addr < kasan_shadow_to_mem((void *)KASAN_SHADOW_START)) + if (!addr_has_metadata(untagged_addr)) return false; shadow_byte = READ_ONCE(*(u8 *)kasan_mem_to_shadow(untagged_addr)); From 9f9f88e926385018647972fade6b2f5ce1636116 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 2 Feb 2023 14:43:48 -0800 Subject: [PATCH 100/143] kasan-infer-allocation-size-by-scanning-metadata-fix fix printk warning Reported-by: kernel test robot Link: https://lkml.kernel.org/r/202302021325.700zGa0M-lkp@intel.com Cc: Kuan-Ying Lee Cc: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Signed-off-by: Andrew Morton --- mm/kasan/report.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index e0492124e90a7..89078f912827c 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -268,7 +268,7 @@ static void describe_object_addr(const void *addr, struct kasan_report_info *inf } pr_err("The buggy address is located %d bytes %s of\n" - " %s%lu-byte region [%px, %px)\n", + " %s%zu-byte region [%px, %px)\n", rel_bytes, rel_type, region_state, info->alloc_size, (void *)object_addr, (void *)(object_addr + info->alloc_size)); } From 001023d04ec3257b300f19252f1047f7ea4db34c Mon Sep 17 00:00:00 2001 From: Tong Tiangen Date: Sun, 29 Jan 2023 04:06:51 +0000 Subject: [PATCH 101/143] memory tier: release the new_memtier in find_create_memory_tier() In find_create_memory_tier(), if failed to register device, then we should release new_memtier from the tier list and put device instead of memtier. Link: https://lkml.kernel.org/r/20230129040651.1329208-1-tongtiangen@huawei.com Fixes: 9832fb87834e ("mm/demotion: expose memory tier details via sysfs") Signed-off-by: Tong Tiangen Cc: Aneesh Kumar K.V Cc: Hanjun Guo Cc: Kefeng Wang Cc: Guohanjun Cc: Signed-off-by: Andrew Morton --- mm/memory-tiers.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index c734658c62424..e593e56e530b7 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -211,8 +211,8 @@ static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memty ret = device_register(&new_memtier->dev); if (ret) { - list_del(&memtier->list); - put_device(&memtier->dev); + list_del(&new_memtier->list); + put_device(&new_memtier->dev); return ERR_PTR(ret); } memtier = new_memtier; From 7efc74e73b768f46aa1c43e277ac318e332f205e Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Sun, 29 Jan 2023 14:42:32 +0200 Subject: [PATCH 102/143] arm: include asm-generic/memory_model.h from page.h rather than memory.h Patch series "mm, arch: add generic implementation of pfn_valid() for FLATMEM", v2. Every architecture that supports FLATMEM memory model defines its own version of pfn_valid() that essentially compares a pfn to max_mapnr. Use mips/powerpc version implemented as static inline as a generic implementation of pfn_valid() and drop its per-architecture definitions This patch (of 4): Makes it consistent with other architectures and allows for generic definition of pfn_valid() in asm-generic/memory_model.h with clear override in arch/arm/include/asm/page.h Link: https://lkml.kernel.org/r/20230129124235.209895-1-rppt@kernel.org Link: https://lkml.kernel.org/r/20230129124235.209895-2-rppt@kernel.org Signed-off-by: Mike Rapoport (IBM) Reviewed-by: David Hildenbrand Cc: Arnd Bergmann Cc: Brian Cain Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Greg Ungerer Cc: Guo Ren Cc: Helge Deller Cc: Huacai Chen Cc: Huacai Chen Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Vineet Gupta Cc: WANG Xuerui Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/arm/include/asm/memory.h | 2 -- arch/arm/include/asm/page.h | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h index d8eef4bd8c711..62e9df0244457 100644 --- a/arch/arm/include/asm/memory.h +++ b/arch/arm/include/asm/memory.h @@ -386,6 +386,4 @@ static inline unsigned long __virt_to_idmap(unsigned long x) #endif -#include - #endif diff --git a/arch/arm/include/asm/page.h b/arch/arm/include/asm/page.h index 5fcc8a600e36d..74bb5947b387a 100644 --- a/arch/arm/include/asm/page.h +++ b/arch/arm/include/asm/page.h @@ -158,6 +158,7 @@ typedef struct page *pgtable_t; #ifdef CONFIG_HAVE_ARCH_PFN_VALID extern int pfn_valid(unsigned long); +#define pfn_valid pfn_valid #endif #include @@ -167,5 +168,6 @@ extern int pfn_valid(unsigned long); #define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC #include +#include #endif From acc171f1b3ebe31520c0f647137c5aafe11c59f2 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Sun, 29 Jan 2023 14:42:33 +0200 Subject: [PATCH 103/143] m68k: use asm-generic/memory_model.h for both MMU and !MMU The MMU variant uses generic definitions of page_to_pfn() and pfn_to_page(), but !MMU defines them in include/asm/page_no.h for no good reason. Include asm-generic/memory_model.h in the common include/asm/page.h and drop redundant definitions. Link: https://lkml.kernel.org/r/20230129124235.209895-3-rppt@kernel.org Signed-off-by: Mike Rapoport (IBM) Reviewed-by: Geert Uytterhoeven Acked-by: Geert Uytterhoeven Reviewed-by: David Hildenbrand Cc: Arnd Bergmann Cc: Brian Cain Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Greg Ungerer Cc: Guo Ren Cc: Helge Deller Cc: Huacai Chen Cc: Huacai Chen Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Vineet Gupta Cc: WANG Xuerui Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/m68k/include/asm/page.h | 6 +----- arch/m68k/include/asm/page_mm.h | 1 - arch/m68k/include/asm/page_no.h | 2 -- 3 files changed, 1 insertion(+), 8 deletions(-) diff --git a/arch/m68k/include/asm/page.h b/arch/m68k/include/asm/page.h index 2f1c54e4725d4..a5993ad83ed8a 100644 --- a/arch/m68k/include/asm/page.h +++ b/arch/m68k/include/asm/page.h @@ -62,11 +62,7 @@ extern unsigned long _ramend; #include #endif -#ifndef CONFIG_MMU -#define __phys_to_pfn(paddr) ((unsigned long)((paddr) >> PAGE_SHIFT)) -#define __pfn_to_phys(pfn) PFN_PHYS(pfn) -#endif - #include +#include #endif /* _M68K_PAGE_H */ diff --git a/arch/m68k/include/asm/page_mm.h b/arch/m68k/include/asm/page_mm.h index a5b459bcb7d81..3903db2e8da77 100644 --- a/arch/m68k/include/asm/page_mm.h +++ b/arch/m68k/include/asm/page_mm.h @@ -134,7 +134,6 @@ extern int m68k_virt_to_node_shift; }) #define ARCH_PFN_OFFSET (m68k_memory[0].addr >> PAGE_SHIFT) -#include #define virt_addr_valid(kaddr) ((unsigned long)(kaddr) >= PAGE_OFFSET && (unsigned long)(kaddr) < (unsigned long)high_memory) #define pfn_valid(pfn) virt_addr_valid(pfn_to_virt(pfn)) diff --git a/arch/m68k/include/asm/page_no.h b/arch/m68k/include/asm/page_no.h index abd2c3aeb0151..83d345f482bdd 100644 --- a/arch/m68k/include/asm/page_no.h +++ b/arch/m68k/include/asm/page_no.h @@ -25,8 +25,6 @@ extern unsigned long memory_end; #define virt_to_page(addr) (mem_map + (((unsigned long)(addr)-PAGE_OFFSET) >> PAGE_SHIFT)) #define page_to_virt(page) __va(((((page) - mem_map) << PAGE_SHIFT) + PAGE_OFFSET)) -#define pfn_to_page(pfn) virt_to_page(pfn_to_virt(pfn)) -#define page_to_pfn(page) virt_to_pfn(page_to_virt(page)) #define pfn_valid(pfn) ((pfn) < max_mapnr) #define virt_addr_valid(kaddr) (((unsigned long)(kaddr) >= PAGE_OFFSET) && \ From a23c440838c7d689d50a47918871f68d15d7edaf Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Sun, 29 Jan 2023 14:42:34 +0200 Subject: [PATCH 104/143] mips: drop definition of pfn_valid() for DISCONTIGMEM There is a stale definition of pfn_valid() for DISCONTINGMEM memory model guarded !FLATMEM && !SPARSEMEM && NUMA ifdefery. Remove everything but definition of pfn_valid() for FLATMEM. Link: https://lkml.kernel.org/r/20230129124235.209895-4-rppt@kernel.org Signed-off-by: Mike Rapoport (IBM) Reviewed-by: David Hildenbrand Cc: Arnd Bergmann Cc: Brian Cain Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Greg Ungerer Cc: Guo Ren Cc: Helge Deller Cc: Huacai Chen Cc: Huacai Chen Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Vineet Gupta Cc: WANG Xuerui Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/mips/include/asm/page.h | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/arch/mips/include/asm/page.h b/arch/mips/include/asm/page.h index 96bc798c1ec1c..9286f11ff6adf 100644 --- a/arch/mips/include/asm/page.h +++ b/arch/mips/include/asm/page.h @@ -235,21 +235,6 @@ static inline int pfn_valid(unsigned long pfn) return pfn >= pfn_offset && pfn < max_mapnr; } -#elif defined(CONFIG_SPARSEMEM) - -/* pfn_valid is defined in linux/mmzone.h */ - -#elif defined(CONFIG_NUMA) - -#define pfn_valid(pfn) \ -({ \ - unsigned long __pfn = (pfn); \ - int __n = pfn_to_nid(__pfn); \ - ((__n >= 0) ? (__pfn < NODE_DATA(__n)->node_start_pfn + \ - NODE_DATA(__n)->node_spanned_pages) \ - : 0); \ -}) - #endif #define virt_to_pfn(kaddr) PFN_DOWN(virt_to_phys((void *)(kaddr))) From bb2acf787168b72d53e9a289e820cd264bbe5c61 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Sun, 29 Jan 2023 14:42:35 +0200 Subject: [PATCH 105/143] mm, arch: add generic implementation of pfn_valid() for FLATMEM Every architecture that supports FLATMEM memory model defines its own version of pfn_valid() that essentially compares a pfn to max_mapnr. Use mips/powerpc version implemented as static inline as a generic implementation of pfn_valid() and drop its per-architecture definitions. Link: https://lkml.kernel.org/r/20230129124235.209895-5-rppt@kernel.org Signed-off-by: Mike Rapoport (IBM) Acked-by: Arnd Bergmann Acked-by: Guo Ren [csky] Acked-by: Huacai Chen [LoongArch] Acked-by: Stafford Horne [OpenRISC] Acked-by: Michael Ellerman [powerpc] Reviewed-by: David Hildenbrand Cc: Brian Cain Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Greg Ungerer Cc: Helge Deller Cc: Huacai Chen Cc: Matt Turner Cc: Max Filippov Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Thomas Bogendoerfer Cc: Vineet Gupta Cc: WANG Xuerui Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/alpha/include/asm/page.h | 4 ---- arch/arc/include/asm/page.h | 1 - arch/csky/include/asm/page.h | 1 - arch/hexagon/include/asm/page.h | 1 - arch/ia64/include/asm/page.h | 4 ---- arch/loongarch/include/asm/page.h | 13 ------------- arch/m68k/include/asm/page_no.h | 2 -- arch/microblaze/include/asm/page.h | 1 - arch/mips/include/asm/page.h | 13 ------------- arch/nios2/include/asm/page.h | 9 --------- arch/openrisc/include/asm/page.h | 2 -- arch/parisc/include/asm/page.h | 4 ---- arch/powerpc/include/asm/page.h | 9 --------- arch/riscv/include/asm/page.h | 5 ----- arch/sh/include/asm/page.h | 3 --- arch/sparc/include/asm/page_32.h | 1 - arch/um/include/asm/page.h | 1 - arch/x86/include/asm/page_32.h | 4 ---- arch/x86/include/asm/page_64.h | 4 ---- arch/xtensa/include/asm/page.h | 4 ++-- include/asm-generic/memory_model.h | 12 ++++++++++++ include/asm-generic/page.h | 2 -- 22 files changed, 14 insertions(+), 86 deletions(-) diff --git a/arch/alpha/include/asm/page.h b/arch/alpha/include/asm/page.h index bc5256fba8f04..4db1ebc0ed99c 100644 --- a/arch/alpha/include/asm/page.h +++ b/arch/alpha/include/asm/page.h @@ -86,10 +86,6 @@ typedef struct page *pgtable_t; #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) #define virt_addr_valid(kaddr) pfn_valid((__pa(kaddr) >> PAGE_SHIFT)) -#ifdef CONFIG_FLATMEM -#define pfn_valid(pfn) ((pfn) < max_mapnr) -#endif /* CONFIG_FLATMEM */ - #include #include diff --git a/arch/arc/include/asm/page.h b/arch/arc/include/asm/page.h index 9a62e1d87967f..e43fe27ec54d5 100644 --- a/arch/arc/include/asm/page.h +++ b/arch/arc/include/asm/page.h @@ -109,7 +109,6 @@ extern int pfn_valid(unsigned long pfn); #else /* CONFIG_HIGHMEM */ #define ARCH_PFN_OFFSET virt_to_pfn(CONFIG_LINUX_RAM_BASE) -#define pfn_valid(pfn) (((pfn) - ARCH_PFN_OFFSET) < max_mapnr) #endif /* CONFIG_HIGHMEM */ diff --git a/arch/csky/include/asm/page.h b/arch/csky/include/asm/page.h index ed7451478b1b0..b23e3006a9e06 100644 --- a/arch/csky/include/asm/page.h +++ b/arch/csky/include/asm/page.h @@ -39,7 +39,6 @@ #define virt_addr_valid(kaddr) ((void *)(kaddr) >= (void *)PAGE_OFFSET && \ (void *)(kaddr) < high_memory) -#define pfn_valid(pfn) ((pfn) >= ARCH_PFN_OFFSET && ((pfn) - ARCH_PFN_OFFSET) < max_mapnr) extern void *memset(void *dest, int c, size_t l); extern void *memcpy(void *to, const void *from, size_t l); diff --git a/arch/hexagon/include/asm/page.h b/arch/hexagon/include/asm/page.h index d7d4f9fca3279..9c03b9965f07a 100644 --- a/arch/hexagon/include/asm/page.h +++ b/arch/hexagon/include/asm/page.h @@ -95,7 +95,6 @@ struct page; /* Default vm area behavior is non-executable. */ #define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_NON_EXEC -#define pfn_valid(pfn) ((pfn) < max_mapnr) #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) /* Need to not use a define for linesize; may move this to another file. */ diff --git a/arch/ia64/include/asm/page.h b/arch/ia64/include/asm/page.h index ba0b365cf2b2b..310b09c3342d6 100644 --- a/arch/ia64/include/asm/page.h +++ b/arch/ia64/include/asm/page.h @@ -95,10 +95,6 @@ do { \ #include -#ifdef CONFIG_FLATMEM -# define pfn_valid(pfn) ((pfn) < max_mapnr) -#endif - #define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT) #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) diff --git a/arch/loongarch/include/asm/page.h b/arch/loongarch/include/asm/page.h index 53f284a961823..fb5338b352e65 100644 --- a/arch/loongarch/include/asm/page.h +++ b/arch/loongarch/include/asm/page.h @@ -82,19 +82,6 @@ typedef struct { unsigned long pgprot; } pgprot_t; #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) -#ifdef CONFIG_FLATMEM - -static inline int pfn_valid(unsigned long pfn) -{ - /* avoid include hell */ - extern unsigned long max_mapnr; - unsigned long pfn_offset = ARCH_PFN_OFFSET; - - return pfn >= pfn_offset && pfn < max_mapnr; -} - -#endif - #define virt_to_pfn(kaddr) PFN_DOWN(PHYSADDR(kaddr)) #define virt_to_page(kaddr) pfn_to_page(virt_to_pfn(kaddr)) diff --git a/arch/m68k/include/asm/page_no.h b/arch/m68k/include/asm/page_no.h index 83d345f482bdd..43ff6b109ebbb 100644 --- a/arch/m68k/include/asm/page_no.h +++ b/arch/m68k/include/asm/page_no.h @@ -25,8 +25,6 @@ extern unsigned long memory_end; #define virt_to_page(addr) (mem_map + (((unsigned long)(addr)-PAGE_OFFSET) >> PAGE_SHIFT)) #define page_to_virt(page) __va(((((page) - mem_map) << PAGE_SHIFT) + PAGE_OFFSET)) -#define pfn_valid(pfn) ((pfn) < max_mapnr) - #define virt_addr_valid(kaddr) (((unsigned long)(kaddr) >= PAGE_OFFSET) && \ ((unsigned long)(kaddr) < memory_end)) diff --git a/arch/microblaze/include/asm/page.h b/arch/microblaze/include/asm/page.h index 4b8b2fa78fc5f..7b9861bcd4581 100644 --- a/arch/microblaze/include/asm/page.h +++ b/arch/microblaze/include/asm/page.h @@ -112,7 +112,6 @@ extern int page_is_ram(unsigned long pfn); # define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT) # define ARCH_PFN_OFFSET (memory_start >> PAGE_SHIFT) -# define pfn_valid(pfn) ((pfn) >= ARCH_PFN_OFFSET && (pfn) < (max_mapnr + ARCH_PFN_OFFSET)) # endif /* __ASSEMBLY__ */ #define virt_addr_valid(vaddr) (pfn_valid(virt_to_pfn(vaddr))) diff --git a/arch/mips/include/asm/page.h b/arch/mips/include/asm/page.h index 9286f11ff6adf..5978a8dfb917b 100644 --- a/arch/mips/include/asm/page.h +++ b/arch/mips/include/asm/page.h @@ -224,19 +224,6 @@ extern phys_addr_t __phys_addr_symbol(unsigned long x); #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) -#ifdef CONFIG_FLATMEM - -static inline int pfn_valid(unsigned long pfn) -{ - /* avoid include hell */ - extern unsigned long max_mapnr; - unsigned long pfn_offset = ARCH_PFN_OFFSET; - - return pfn >= pfn_offset && pfn < max_mapnr; -} - -#endif - #define virt_to_pfn(kaddr) PFN_DOWN(virt_to_phys((void *)(kaddr))) #define virt_to_page(kaddr) pfn_to_page(virt_to_pfn(kaddr)) diff --git a/arch/nios2/include/asm/page.h b/arch/nios2/include/asm/page.h index 6a989819a7c1d..0ae7d9ce369b9 100644 --- a/arch/nios2/include/asm/page.h +++ b/arch/nios2/include/asm/page.h @@ -86,15 +86,6 @@ extern struct page *mem_map; # define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) -static inline bool pfn_valid(unsigned long pfn) -{ - /* avoid include hell */ - extern unsigned long max_mapnr; - unsigned long pfn_offset = ARCH_PFN_OFFSET; - - return pfn >= pfn_offset && pfn < max_mapnr; -} - # define virt_to_page(vaddr) pfn_to_page(PFN_DOWN(virt_to_phys(vaddr))) # define virt_addr_valid(vaddr) pfn_valid(PFN_DOWN(virt_to_phys(vaddr))) diff --git a/arch/openrisc/include/asm/page.h b/arch/openrisc/include/asm/page.h index aab6e64d6db42..52b0d7e764461 100644 --- a/arch/openrisc/include/asm/page.h +++ b/arch/openrisc/include/asm/page.h @@ -80,8 +80,6 @@ typedef struct page *pgtable_t; #define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT) -#define pfn_valid(pfn) ((pfn) < max_mapnr) - #define virt_addr_valid(kaddr) (pfn_valid(virt_to_pfn(kaddr))) #endif /* __ASSEMBLY__ */ diff --git a/arch/parisc/include/asm/page.h b/arch/parisc/include/asm/page.h index 6faaaa3ebe9b8..667e703c0e8f6 100644 --- a/arch/parisc/include/asm/page.h +++ b/arch/parisc/include/asm/page.h @@ -155,10 +155,6 @@ extern int npmem_ranges; #define __pa(x) ((unsigned long)(x)-PAGE_OFFSET) #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) -#ifndef CONFIG_SPARSEMEM -#define pfn_valid(pfn) ((pfn) < max_mapnr) -#endif - #ifdef CONFIG_HUGETLB_PAGE #define HPAGE_SHIFT PMD_SHIFT /* fixed for transparent huge pages */ #define HPAGE_SIZE ((1UL) << HPAGE_SHIFT) diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index edf1dd1b0ca99..f2b6bf5687d0e 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -117,15 +117,6 @@ extern long long virt_phys_offset; #ifdef CONFIG_FLATMEM #define ARCH_PFN_OFFSET ((unsigned long)(MEMORY_START >> PAGE_SHIFT)) -#ifndef __ASSEMBLY__ -extern unsigned long max_mapnr; -static inline bool pfn_valid(unsigned long pfn) -{ - unsigned long min_pfn = ARCH_PFN_OFFSET; - - return pfn >= min_pfn && pfn < max_mapnr; -} -#endif #endif #define virt_to_pfn(kaddr) (__pa(kaddr) >> PAGE_SHIFT) diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index 9f432c1b52899..7fed7c431928b 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -171,11 +171,6 @@ extern phys_addr_t __phys_addr_symbol(unsigned long x); #define sym_to_pfn(x) __phys_to_pfn(__pa_symbol(x)) -#ifdef CONFIG_FLATMEM -#define pfn_valid(pfn) \ - (((pfn) >= ARCH_PFN_OFFSET) && (((pfn) - ARCH_PFN_OFFSET) < max_mapnr)) -#endif - #endif /* __ASSEMBLY__ */ #define virt_addr_valid(vaddr) ({ \ diff --git a/arch/sh/include/asm/page.h b/arch/sh/include/asm/page.h index eca5daa43b93d..09ac6c7faee0f 100644 --- a/arch/sh/include/asm/page.h +++ b/arch/sh/include/asm/page.h @@ -169,9 +169,6 @@ typedef struct page *pgtable_t; #define PFN_START (__MEMORY_START >> PAGE_SHIFT) #define ARCH_PFN_OFFSET (PFN_START) #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) -#ifdef CONFIG_FLATMEM -#define pfn_valid(pfn) ((pfn) >= min_low_pfn && (pfn) < max_low_pfn) -#endif #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) #include diff --git a/arch/sparc/include/asm/page_32.h b/arch/sparc/include/asm/page_32.h index fff8861df107a..6be6f683f98f2 100644 --- a/arch/sparc/include/asm/page_32.h +++ b/arch/sparc/include/asm/page_32.h @@ -130,7 +130,6 @@ extern unsigned long pfn_base; #define ARCH_PFN_OFFSET (pfn_base) #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) -#define pfn_valid(pfn) (((pfn) >= (pfn_base)) && (((pfn)-(pfn_base)) < max_mapnr)) #define virt_addr_valid(kaddr) ((((unsigned long)(kaddr)-PAGE_OFFSET)>>PAGE_SHIFT) < max_mapnr) #include diff --git a/arch/um/include/asm/page.h b/arch/um/include/asm/page.h index cdbd9653aa14e..84866127d0747 100644 --- a/arch/um/include/asm/page.h +++ b/arch/um/include/asm/page.h @@ -108,7 +108,6 @@ extern unsigned long uml_physmem; #define phys_to_pfn(p) ((p) >> PAGE_SHIFT) #define pfn_to_phys(pfn) PFN_PHYS(pfn) -#define pfn_valid(pfn) ((pfn) < max_mapnr) #define virt_addr_valid(v) pfn_valid(phys_to_pfn(__pa(v))) #include diff --git a/arch/x86/include/asm/page_32.h b/arch/x86/include/asm/page_32.h index df42f8aa99e41..580d71aca65a4 100644 --- a/arch/x86/include/asm/page_32.h +++ b/arch/x86/include/asm/page_32.h @@ -15,10 +15,6 @@ extern unsigned long __phys_addr(unsigned long); #define __phys_addr_symbol(x) __phys_addr(x) #define __phys_reloc_hide(x) RELOC_HIDE((x), 0) -#ifdef CONFIG_FLATMEM -#define pfn_valid(pfn) ((pfn) < max_mapnr) -#endif /* CONFIG_FLATMEM */ - #include static inline void clear_page(void *page) diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index 198e03e59ca19..cc6b8e087192e 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -39,10 +39,6 @@ extern unsigned long __phys_addr_symbol(unsigned long); #define __phys_reloc_hide(x) (x) -#ifdef CONFIG_FLATMEM -#define pfn_valid(pfn) ((pfn) < max_pfn) -#endif - void clear_page_orig(void *page); void clear_page_rep(void *page); void clear_page_erms(void *page); diff --git a/arch/xtensa/include/asm/page.h b/arch/xtensa/include/asm/page.h index 493eb7083b1ae..a77d04972eb92 100644 --- a/arch/xtensa/include/asm/page.h +++ b/arch/xtensa/include/asm/page.h @@ -11,6 +11,8 @@ #ifndef _XTENSA_PAGE_H #define _XTENSA_PAGE_H +#include + #include #include #include @@ -189,8 +191,6 @@ static inline unsigned long ___pa(unsigned long va) #endif #define __va(x) \ ((void *)((unsigned long) (x) - PHYS_OFFSET + PAGE_OFFSET)) -#define pfn_valid(pfn) \ - ((pfn) >= ARCH_PFN_OFFSET && ((pfn) - ARCH_PFN_OFFSET) < max_mapnr) #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) #define page_to_virt(page) __va(page_to_pfn(page) << PAGE_SHIFT) diff --git a/include/asm-generic/memory_model.h b/include/asm-generic/memory_model.h index a2c8ed60233a4..13d2a844d9287 100644 --- a/include/asm-generic/memory_model.h +++ b/include/asm-generic/memory_model.h @@ -19,6 +19,18 @@ #define __page_to_pfn(page) ((unsigned long)((page) - mem_map) + \ ARCH_PFN_OFFSET) +#ifndef pfn_valid +static inline int pfn_valid(unsigned long pfn) +{ + /* avoid include hell */ + extern unsigned long max_mapnr; + unsigned long pfn_offset = ARCH_PFN_OFFSET; + + return pfn >= pfn_offset && pfn < max_mapnr; +} +#define pfn_valid pfn_valid +#endif + #elif defined(CONFIG_SPARSEMEM_VMEMMAP) /* memmap is virtually contiguous. */ diff --git a/include/asm-generic/page.h b/include/asm-generic/page.h index 6fc47561814c6..c0be2edeb4840 100644 --- a/include/asm-generic/page.h +++ b/include/asm-generic/page.h @@ -84,8 +84,6 @@ extern unsigned long memory_end; #define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT) #endif -#define pfn_valid(pfn) ((pfn) >= ARCH_PFN_OFFSET && ((pfn) - ARCH_PFN_OFFSET) < max_mapnr) - #define virt_addr_valid(kaddr) (((void *)(kaddr) >= (void *)PAGE_OFFSET) && \ ((void *)(kaddr) < (void *)memory_end)) From 87398c226c37f4f240237bcb887d1a7216e4cd4e Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 31 Jan 2023 20:41:49 +0200 Subject: [PATCH 106/143] mm-arch-add-generic-implementation-of-pfn_valid-for-flatmem-fix The generic pfn_valid() does not take into account pfn_offset when it compares it with max_mapnr. Link: https://lkml.kernel.org/r/Y9lg7R1Yd931C+y5@kernel.org Signed-off-by: Mike Rapoport Reported-by: Conor Dooley Tested-by: Conor Dooley Signed-off-by: Andrew Morton --- include/asm-generic/memory_model.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-generic/memory_model.h b/include/asm-generic/memory_model.h index 13d2a844d9287..6796abe1900e3 100644 --- a/include/asm-generic/memory_model.h +++ b/include/asm-generic/memory_model.h @@ -26,7 +26,7 @@ static inline int pfn_valid(unsigned long pfn) extern unsigned long max_mapnr; unsigned long pfn_offset = ARCH_PFN_OFFSET; - return pfn >= pfn_offset && pfn < max_mapnr; + return pfn >= pfn_offset && (pfn - pfn_offset) < max_mapnr; } #define pfn_valid pfn_valid #endif From 33fa9091df0d16031415ed02f4dbdd77ecc26018 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 30 Jan 2023 12:18:28 -0800 Subject: [PATCH 107/143] mm: add folio_estimated_sharers() Patch series "Convert various mempolicy.c functions to use folios", v4. This patch series converts migrate_page_add() and queue_pages_required() to migrate_folio_add() and queue_page_required(). It also converts the callers of the functions to use folios as well, and introduces a helper function to estimate the number of sharers of a folio. This patch (of 6): folio_estimated_sharers() takes in a folio and returns the precise number of times the first subpage of the folio is mapped. This function aims to provide an estimate for the number of sharers of a folio. This is necessary for folio conversions where we care about the number of processes that share a folio, but don't necessarily want to check every single page within that folio. This is in contrast to folio_mapcount() which calculates the total number of the times a folio and all its subpages are mapped. Link: https://lkml.kernel.org/r/20230130201833.27042-1-vishal.moola@gmail.com Link: https://lkml.kernel.org/r/20230130201833.27042-2-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Yin Fengwei Acked-by: David Hildenbrand Cc: Jane Chu Signed-off-by: Andrew Morton --- include/linux/mm.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index cab10a084e03b..eae6034e7f481 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1911,6 +1911,24 @@ static inline size_t folio_size(struct folio *folio) return PAGE_SIZE << folio_order(folio); } +/** + * folio_estimated_sharers - Estimate the number of sharers of a folio. + * @folio: The folio. + * + * folio_estimated_sharers() aims to serve as a function to efficiently + * estimate the number of processes sharing a folio. This is done by + * looking at the precise mapcount of the first subpage in the folio, and + * assuming the other subpages are the same. This may not be true for large + * folios. If you want exact mapcounts for exact calculations, look at + * page_mapcount() or folio_total_mapcount(). + * + * Return: The estimated number of processes sharing a folio. + */ +static inline int folio_estimated_sharers(struct folio *folio) +{ + return page_mapcount(folio_page(folio, 0)); +} + #ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE static inline int arch_make_page_accessible(struct page *page) { From 12940293acc583028cf8e5af9b74d8cb74ae13f6 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 30 Jan 2023 12:18:29 -0800 Subject: [PATCH 108/143] mm/mempolicy: convert queue_pages_pmd() to queue_folios_pmd() The function now operates on a folio instead of the page associated with a pmd. This change is in preparation for the conversion of queue_pages_required() to queue_folio_required() and migrate_page_add() to migrate_folio_add(). Link: https://lkml.kernel.org/r/20230130201833.27042-3-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: David Hildenbrand Cc: Jane Chu Cc: "Yin, Fengwei" Signed-off-by: Andrew Morton --- mm/mempolicy.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 7686f40c97500..fc754dbcbbcd6 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -442,21 +442,21 @@ static inline bool queue_pages_required(struct page *page, } /* - * queue_pages_pmd() has three possible return values: - * 0 - pages are placed on the right node or queued successfully, or + * queue_folios_pmd() has three possible return values: + * 0 - folios are placed on the right node or queued successfully, or * special page is met, i.e. huge zero page. - * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were + * 1 - there is unmovable folio, and MPOL_MF_MOVE* & MPOL_MF_STRICT were * specified. * -EIO - is migration entry or only MPOL_MF_STRICT was specified and an - * existing page was already on a node that does not follow the + * existing folio was already on a node that does not follow the * policy. */ -static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, +static int queue_folios_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, unsigned long end, struct mm_walk *walk) __releases(ptl) { int ret = 0; - struct page *page; + struct folio *folio; struct queue_pages *qp = walk->private; unsigned long flags; @@ -464,19 +464,19 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, ret = -EIO; goto unlock; } - page = pmd_page(*pmd); - if (is_huge_zero_page(page)) { + folio = pfn_folio(pmd_pfn(*pmd)); + if (is_huge_zero_page(&folio->page)) { walk->action = ACTION_CONTINUE; goto unlock; } - if (!queue_pages_required(page, qp)) + if (!queue_pages_required(&folio->page, qp)) goto unlock; flags = qp->flags; - /* go to thp migration */ + /* go to folio migration */ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { if (!vma_migratable(walk->vma) || - migrate_page_add(page, qp->pagelist, flags)) { + migrate_page_add(&folio->page, qp->pagelist, flags)) { ret = 1; goto unlock; } @@ -512,7 +512,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) - return queue_pages_pmd(pmd, ptl, addr, end, walk); + return queue_folios_pmd(pmd, ptl, addr, end, walk); if (pmd_trans_unstable(pmd)) return 0; From 5796e78b5ee5be0ce52554c2b2e90cf3b6e51717 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 30 Jan 2023 12:18:30 -0800 Subject: [PATCH 109/143] mm/mempolicy: convert queue_pages_pte_range() to queue_folios_pte_range() This function now operates on folios associated with ptes instead of pages. This change is in preparation for the conversion of queue_pages_required() to queue_folio_required() and migrate_page_add() to migrate_folio_add(). Link: https://lkml.kernel.org/r/20230130201833.27042-4-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: David Hildenbrand Cc: Jane Chu Cc: "Yin, Fengwei" Signed-off-by: Andrew Morton --- mm/mempolicy.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index fc754dbcbbcd6..b0805bb87655f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -491,19 +491,19 @@ static int queue_folios_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, * Scan through pages checking if pages follow certain conditions, * and move them to the pagelist if they do. * - * queue_pages_pte_range() has three possible return values: - * 0 - pages are placed on the right node or queued successfully, or + * queue_folios_pte_range() has three possible return values: + * 0 - folios are placed on the right node or queued successfully, or * special page is met, i.e. zero page. - * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were + * 1 - there is unmovable folio, and MPOL_MF_MOVE* & MPOL_MF_STRICT were * specified. - * -EIO - only MPOL_MF_STRICT was specified and an existing page was already + * -EIO - only MPOL_MF_STRICT was specified and an existing folio was already * on a node that does not follow the policy. */ -static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, +static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; - struct page *page; + struct folio *folio; struct queue_pages *qp = walk->private; unsigned long flags = qp->flags; bool has_unmovable = false; @@ -521,16 +521,16 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, for (; addr != end; pte++, addr += PAGE_SIZE) { if (!pte_present(*pte)) continue; - page = vm_normal_page(vma, addr, *pte); - if (!page || is_zone_device_page(page)) + folio = vm_normal_folio(vma, addr, *pte); + if (!folio || folio_is_zone_device(folio)) continue; /* - * vm_normal_page() filters out zero pages, but there might - * still be PageReserved pages to skip, perhaps in a VDSO. + * vm_normal_folio() filters out zero pages, but there might + * still be reserved folios to skip, perhaps in a VDSO. */ - if (PageReserved(page)) + if (folio_test_reserved(folio)) continue; - if (!queue_pages_required(page, qp)) + if (!queue_pages_required(&folio->page, qp)) continue; if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { /* MPOL_MF_STRICT must be specified if we get here */ @@ -544,7 +544,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, * temporary off LRU pages in the range. Still * need migrate other LRU pages. */ - if (migrate_page_add(page, qp->pagelist, flags)) + if (migrate_page_add(&folio->page, qp->pagelist, flags)) has_unmovable = true; } else break; @@ -704,7 +704,7 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, static const struct mm_walk_ops queue_pages_walk_ops = { .hugetlb_entry = queue_pages_hugetlb, - .pmd_entry = queue_pages_pte_range, + .pmd_entry = queue_folios_pte_range, .test_walk = queue_pages_test_walk, }; From d4306f3b647a495a3184fe9462b89b5033f6a1b5 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 30 Jan 2023 12:18:31 -0800 Subject: [PATCH 110/143] mm/mempolicy: convert queue_pages_hugetlb() to queue_folios_hugetlb() This change is in preparation for the conversion of queue_pages_required() to queue_folio_required() and migrate_page_add() to migrate_folio_add(). Link: https://lkml.kernel.org/r/20230130201833.27042-5-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: David Hildenbrand Cc: Jane Chu Cc: "Yin, Fengwei" Signed-off-by: Andrew Morton --- mm/mempolicy.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b0805bb87655f..6683924935000 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -558,7 +558,7 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, return addr != end ? -EIO : 0; } -static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, +static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { @@ -566,7 +566,7 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, #ifdef CONFIG_HUGETLB_PAGE struct queue_pages *qp = walk->private; unsigned long flags = (qp->flags & MPOL_MF_VALID); - struct page *page; + struct folio *folio; spinlock_t *ptl; pte_t entry; @@ -574,13 +574,13 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, entry = huge_ptep_get(pte); if (!pte_present(entry)) goto unlock; - page = pte_page(entry); - if (!queue_pages_required(page, qp)) + folio = pfn_folio(pte_pfn(entry)); + if (!queue_pages_required(&folio->page, qp)) goto unlock; if (flags == MPOL_MF_STRICT) { /* - * STRICT alone means only detecting misplaced page and no + * STRICT alone means only detecting misplaced folio and no * need to further check other vma. */ ret = -EIO; @@ -591,21 +591,28 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, /* * Must be STRICT with MOVE*, otherwise .test_walk() have * stopped walking current vma. - * Detecting misplaced page but allow migrating pages which + * Detecting misplaced folio but allow migrating folios which * have been queued. */ ret = 1; goto unlock; } - /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */ + /* + * With MPOL_MF_MOVE, we try to migrate only unshared folios. If it + * is shared it is likely not worth migrating. + * + * To check if the folio is shared, ideally we want to make sure + * every page is mapped to the same process. Doing that is very + * expensive, so check the estimated mapcount of the folio instead. + */ if (flags & (MPOL_MF_MOVE_ALL) || - (flags & MPOL_MF_MOVE && page_mapcount(page) == 1 && + (flags & MPOL_MF_MOVE && folio_estimated_sharers(folio) == 1 && !hugetlb_pmd_shared(pte))) { - if (isolate_hugetlb(page_folio(page), qp->pagelist) && + if (isolate_hugetlb(folio, qp->pagelist) && (flags & MPOL_MF_STRICT)) /* - * Failed to isolate page but allow migrating pages + * Failed to isolate folio but allow migrating pages * which have been queued. */ ret = 1; @@ -703,7 +710,7 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, } static const struct mm_walk_ops queue_pages_walk_ops = { - .hugetlb_entry = queue_pages_hugetlb, + .hugetlb_entry = queue_folios_hugetlb, .pmd_entry = queue_folios_pte_range, .test_walk = queue_pages_test_walk, }; From 54aa8fb5be9b9bc4237b0c1414aaebe24302a662 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 30 Jan 2023 12:18:32 -0800 Subject: [PATCH 111/143] mm/mempolicy: convert queue_pages_required() to queue_folio_required() Replace queue_pages_required() with queue_folio_required(). queue_folio_required() does the same as queue_pages_required(), except takes in a folio instead of a page. Link: https://lkml.kernel.org/r/20230130201833.27042-6-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: David Hildenbrand Cc: Jane Chu Cc: "Yin, Fengwei" Signed-off-by: Andrew Morton --- mm/mempolicy.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 6683924935000..6a68dbce3b703 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -427,15 +427,15 @@ struct queue_pages { }; /* - * Check if the page's nid is in qp->nmask. + * Check if the folio's nid is in qp->nmask. * * If MPOL_MF_INVERT is set in qp->flags, check if the nid is * in the invert of qp->nmask. */ -static inline bool queue_pages_required(struct page *page, +static inline bool queue_folio_required(struct folio *folio, struct queue_pages *qp) { - int nid = page_to_nid(page); + int nid = folio_nid(folio); unsigned long flags = qp->flags; return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT); @@ -469,7 +469,7 @@ static int queue_folios_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, walk->action = ACTION_CONTINUE; goto unlock; } - if (!queue_pages_required(&folio->page, qp)) + if (!queue_folio_required(folio, qp)) goto unlock; flags = qp->flags; @@ -530,7 +530,7 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, */ if (folio_test_reserved(folio)) continue; - if (!queue_pages_required(&folio->page, qp)) + if (!queue_folio_required(folio, qp)) continue; if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { /* MPOL_MF_STRICT must be specified if we get here */ @@ -575,7 +575,7 @@ static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, if (!pte_present(entry)) goto unlock; folio = pfn_folio(pte_pfn(entry)); - if (!queue_pages_required(&folio->page, qp)) + if (!queue_folio_required(folio, qp)) goto unlock; if (flags == MPOL_MF_STRICT) { From f41be1182ff36e40aefb784cc926df69a8257405 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 30 Jan 2023 12:18:33 -0800 Subject: [PATCH 112/143] mm/mempolicy: convert migrate_page_add() to migrate_folio_add() Replace migrate_page_add() with migrate_folio_add(). migrate_folio_add() does the same a migrate_page_add() but takes in a folio instead of a page. This removes a couple of calls to compound_head(). Link: https://lkml.kernel.org/r/20230130201833.27042-7-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Yin Fengwei Cc: David Hildenbrand Cc: Jane Chu Signed-off-by: Andrew Morton --- mm/mempolicy.c | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 6a68dbce3b703..0919c7a719d41 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -414,7 +414,7 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { }, }; -static int migrate_page_add(struct page *page, struct list_head *pagelist, +static int migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags); struct queue_pages { @@ -476,7 +476,7 @@ static int queue_folios_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, /* go to folio migration */ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { if (!vma_migratable(walk->vma) || - migrate_page_add(&folio->page, qp->pagelist, flags)) { + migrate_folio_add(folio, qp->pagelist, flags)) { ret = 1; goto unlock; } @@ -544,7 +544,7 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, * temporary off LRU pages in the range. Still * need migrate other LRU pages. */ - if (migrate_page_add(&folio->page, qp->pagelist, flags)) + if (migrate_folio_add(folio, qp->pagelist, flags)) has_unmovable = true; } else break; @@ -1021,27 +1021,28 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, } #ifdef CONFIG_MIGRATION -/* - * page migration, thp tail pages can be passed. - */ -static int migrate_page_add(struct page *page, struct list_head *pagelist, +static int migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags) { - struct page *head = compound_head(page); /* - * Avoid migrating a page that is shared with others. + * We try to migrate only unshared folios. If it is shared it + * is likely not worth migrating. + * + * To check if the folio is shared, ideally we want to make sure + * every page is mapped to the same process. Doing that is very + * expensive, so check the estimated mapcount of the folio instead. */ - if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) { - if (!isolate_lru_page(head)) { - list_add_tail(&head->lru, pagelist); - mod_node_page_state(page_pgdat(head), - NR_ISOLATED_ANON + page_is_file_lru(head), - thp_nr_pages(head)); + if ((flags & MPOL_MF_MOVE_ALL) || folio_estimated_sharers(folio) == 1) { + if (!folio_isolate_lru(folio)) { + list_add_tail(&folio->lru, foliolist); + node_stat_mod_folio(folio, + NR_ISOLATED_ANON + folio_is_file_lru(folio), + folio_nr_pages(folio)); } else if (flags & MPOL_MF_STRICT) { /* - * Non-movable page may reach here. And, there may be - * temporary off LRU pages or non-LRU movable pages. - * Treat them as unmovable pages since they can't be + * Non-movable folio may reach here. And, there may be + * temporary off LRU folios or non-LRU movable folios. + * Treat them as unmovable folios since they can't be * isolated, so they can't be moved at the moment. It * should return -EIO for this case too. */ @@ -1235,7 +1236,7 @@ static struct page *new_page(struct page *page, unsigned long start) } #else -static int migrate_page_add(struct page *page, struct list_head *pagelist, +static int migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags) { return -EIO; From edfbb4385e88e441c074c2199f8314003e90a61f Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 30 Jan 2023 13:43:49 -0800 Subject: [PATCH 113/143] mm: add folio_get_nontail_page() Patch series "Convert a couple migrate functions to use folios", v2. This patchset introduces folio_movable_ops() and converts 3 functions in mm/migrate.c to use folios. It also introduces folio_get_nontail_page() for folio conversions which may want to distinguish between head and tail pages. This patch (of 4): folio_get_nontail_page() returns the folio associated with a head page. This is necessary for folio conversions where the behavior of that function differs between head pages and tail pages. Link: https://lkml.kernel.org/r/20230130214352.40538-1-vishal.moola@gmail.com Link: https://lkml.kernel.org/r/20230130214352.40538-2-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/mm.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index eae6034e7f481..5bf0ad48faaa2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -885,6 +885,13 @@ static inline bool get_page_unless_zero(struct page *page) return page_ref_add_unless(page, 1, 0); } +static inline struct folio *folio_get_nontail_page(struct page *page) +{ + if (unlikely(!get_page_unless_zero(page))) + return NULL; + return (struct folio *)page; +} + extern int page_is_ram(unsigned long pfn); enum { From ed9c3c1fdf3984bb80be05c206beabbba01335aa Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 30 Jan 2023 13:43:50 -0800 Subject: [PATCH 114/143] mm/migrate: add folio_movable_ops() folio_movable_ops() does the same as page_movable_ops() except uses folios instead of pages. This function will help make folio conversions in migrate.c more readable. Link: https://lkml.kernel.org/r/20230130214352.40538-3-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/migrate.h | 9 +++++++++ mm/migrate.c | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 3ef77f52a4f04..bdff950a8bb45 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -122,6 +122,15 @@ static inline bool folio_test_movable(struct folio *folio) return PageMovable(&folio->page); } +static inline +const struct movable_operations *folio_movable_ops(struct folio *folio) +{ + VM_BUG_ON(!__folio_test_movable(folio)); + + return (const struct movable_operations *) + ((unsigned long)folio->mapping - PAGE_MAPPING_MOVABLE); +} + static inline const struct movable_operations *page_movable_ops(struct page *page) { diff --git a/mm/migrate.c b/mm/migrate.c index c09872cf41b76..d2b1167329b93 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -990,7 +990,7 @@ static int move_to_new_folio(struct folio *dst, struct folio *src, goto out; } - mops = page_movable_ops(&src->page); + mops = folio_movable_ops(src); rc = mops->migrate_page(&dst->page, &src->page, mode); WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS && !folio_test_isolated(src)); From 0c838026cbdb5752ebe16a86b2b5e70ce424dff3 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 30 Jan 2023 13:43:51 -0800 Subject: [PATCH 115/143] mm/migrate: convert isolate_movable_page() to use folios Removes 6 calls to compound_head() and prepares the function to take in a folio instead of page argument. Link: https://lkml.kernel.org/r/20230130214352.40538-4-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/migrate.c | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index d2b1167329b93..3cdb76e44ef52 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -60,6 +60,7 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode) { + struct folio *folio = folio_get_nontail_page(page); const struct movable_operations *mops; /* @@ -71,11 +72,11 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode) * the put_page() at the end of this block will take care of * release this page, thus avoiding a nasty leakage. */ - if (unlikely(!get_page_unless_zero(page))) + if (!folio) goto out; - if (unlikely(PageSlab(page))) - goto out_putpage; + if (unlikely(folio_test_slab(folio))) + goto out_putfolio; /* Pairs with smp_wmb() in slab freeing, e.g. SLUB's __free_slab() */ smp_rmb(); /* @@ -83,12 +84,12 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode) * we use non-atomic bitops on newly allocated page flags so * unconditionally grabbing the lock ruins page's owner side. */ - if (unlikely(!__PageMovable(page))) - goto out_putpage; + if (unlikely(!__folio_test_movable(folio))) + goto out_putfolio; /* Pairs with smp_wmb() in slab allocation, e.g. SLUB's alloc_slab_page() */ smp_rmb(); - if (unlikely(PageSlab(page))) - goto out_putpage; + if (unlikely(folio_test_slab(folio))) + goto out_putfolio; /* * As movable pages are not isolated from LRU lists, concurrent @@ -101,29 +102,29 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode) * lets be sure we have the page lock * before proceeding with the movable page isolation steps. */ - if (unlikely(!trylock_page(page))) - goto out_putpage; + if (unlikely(!folio_trylock(folio))) + goto out_putfolio; - if (!PageMovable(page) || PageIsolated(page)) + if (!folio_test_movable(folio) || folio_test_isolated(folio)) goto out_no_isolated; - mops = page_movable_ops(page); - VM_BUG_ON_PAGE(!mops, page); + mops = folio_movable_ops(folio); + VM_BUG_ON_FOLIO(!mops, folio); - if (!mops->isolate_page(page, mode)) + if (!mops->isolate_page(&folio->page, mode)) goto out_no_isolated; /* Driver shouldn't use PG_isolated bit of page->flags */ - WARN_ON_ONCE(PageIsolated(page)); - SetPageIsolated(page); - unlock_page(page); + WARN_ON_ONCE(folio_test_isolated(folio)); + folio_set_isolated(folio); + folio_unlock(folio); return 0; out_no_isolated: - unlock_page(page); -out_putpage: - put_page(page); + folio_unlock(folio); +out_putfolio: + folio_put(folio); out: return -EBUSY; } From 8ea0cd0acc30d77851d22394e9de96401b1d18ab Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 30 Jan 2023 13:43:52 -0800 Subject: [PATCH 116/143] mm/migrate: convert putback_movable_pages() to use folios Removes 6 calls to compound_head(), and replaces putback_movable_page() with putback_movable_folio() as well. Link: https://lkml.kernel.org/r/20230130214352.40538-5-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/migrate.c | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 3cdb76e44ef52..5b40b9040ba60 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -129,12 +129,12 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode) return -EBUSY; } -static void putback_movable_page(struct page *page) +static void putback_movable_folio(struct folio *folio) { - const struct movable_operations *mops = page_movable_ops(page); + const struct movable_operations *mops = folio_movable_ops(folio); - mops->putback_page(page); - ClearPageIsolated(page); + mops->putback_page(&folio->page); + folio_clear_isolated(folio); } /* @@ -147,33 +147,33 @@ static void putback_movable_page(struct page *page) */ void putback_movable_pages(struct list_head *l) { - struct page *page; - struct page *page2; + struct folio *folio; + struct folio *folio2; - list_for_each_entry_safe(page, page2, l, lru) { - if (unlikely(PageHuge(page))) { - folio_putback_active_hugetlb(page_folio(page)); + list_for_each_entry_safe(folio, folio2, l, lru) { + if (unlikely(folio_test_hugetlb(folio))) { + folio_putback_active_hugetlb(folio); continue; } - list_del(&page->lru); + list_del(&folio->lru); /* - * We isolated non-lru movable page so here we can use - * __PageMovable because LRU page's mapping cannot have + * We isolated non-lru movable folio so here we can use + * __PageMovable because LRU folio's mapping cannot have * PAGE_MAPPING_MOVABLE. */ - if (unlikely(__PageMovable(page))) { - VM_BUG_ON_PAGE(!PageIsolated(page), page); - lock_page(page); - if (PageMovable(page)) - putback_movable_page(page); + if (unlikely(__folio_test_movable(folio))) { + VM_BUG_ON_FOLIO(!folio_test_isolated(folio), folio); + folio_lock(folio); + if (folio_test_movable(folio)) + putback_movable_folio(folio); else - ClearPageIsolated(page); - unlock_page(page); - put_page(page); + folio_clear_isolated(folio); + folio_unlock(folio); + folio_put(folio); } else { - mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + - page_is_file_lru(page), -thp_nr_pages(page)); - putback_lru_page(page); + node_stat_mod_folio(folio, NR_ISOLATED_ANON + + folio_is_file_lru(folio), -folio_nr_pages(folio)); + folio_putback_lru(folio); } } } From 3d7b46dfe6926d758d5f40271a9a588a39102401 Mon Sep 17 00:00:00 2001 From: Longlong Xia Date: Tue, 31 Jan 2023 07:10:35 +0000 Subject: [PATCH 117/143] mm/swapfile: remove pr_debug in get_swap_pages() It's known that get_swap_pages() may fail to find available space under some extreme case, but pr_debug() provides useless information. Let's remove it. Link: https://lkml.kernel.org/r/20230131071035.1085968-1-xialonglong1@huawei.com Signed-off-by: Longlong Xia Reviewed-by: "Huang, Ying" Cc: Chen Wandun Cc: Kefeng Wang Cc: Nanyong Sun Signed-off-by: Andrew Morton --- mm/swapfile.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 08af572c0c4e1..da8b8e1f97f14 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1098,8 +1098,6 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size) spin_unlock(&si->lock); if (n_ret || size == SWAPFILE_CLUSTER) goto check_out; - pr_debug("scan_swap_map of si %d failed to find offset\n", - si->type); cond_resched(); spin_lock(&swap_avail_lock); From 6aa4b8706b843db405cc063b8c699c35ca840a20 Mon Sep 17 00:00:00 2001 From: Alexander Halbuer Date: Wed, 1 Feb 2023 17:25:49 +0100 Subject: [PATCH 118/143] mm: reduce lock contention of pcp buffer refill rmqueue_bulk() batches the allocation of multiple elements to refill the per-CPU buffers into a single hold of the zone lock. Each element is allocated and checked using check_pcp_refill(). The check touches every related struct page which is especially expensive for higher order allocations (huge pages). This patch reduces the time holding the lock by moving the check out of the critical section similar to rmqueue_buddy() which allocates a single element. Measurements of parallel allocation-heavy workloads show a reduction of the average huge page allocation latency of 50 percent for two cores and nearly 90 percent for 24 cores. Link: https://lkml.kernel.org/r/20230201162549.68384-1-halbuer@sra.uni-hannover.de Signed-off-by: Alexander Halbuer Cc: Mel Gorman Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/page_alloc.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5ebce58026f12..cd52b45565a83 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3140,6 +3140,8 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, { unsigned long flags; int i, allocated = 0; + struct list_head *prev_tail = list->prev; + struct page *pos, *n; spin_lock_irqsave(&zone->lock, flags); for (i = 0; i < count; ++i) { @@ -3148,9 +3150,6 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, if (unlikely(page == NULL)) break; - if (unlikely(check_pcp_refill(page, order))) - continue; - /* * Split buddy pages returned by expand() are received here in * physical page order. The page is added to the tail of @@ -3162,7 +3161,6 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * pages are ordered properly. */ list_add_tail(&page->pcp_list, list); - allocated++; if (is_migrate_cma(get_pcppage_migratetype(page))) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, -(1 << order)); @@ -3176,6 +3174,22 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, */ __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); spin_unlock_irqrestore(&zone->lock, flags); + + /* + * Pages are appended to the pcp list without checking to reduce the + * time holding the zone lock. Checking the appended pages happens right + * after the critical section while still holding the pcp lock. + */ + pos = list_first_entry(prev_tail, struct page, pcp_list); + list_for_each_entry_safe_from(pos, n, list, pcp_list) { + if (unlikely(check_pcp_refill(pos, order))) { + list_del(&pos->pcp_list); + continue; + } + + allocated++; + } + return allocated; } From fb66df5ad426c4c385238118dd59327dcb2f39a6 Mon Sep 17 00:00:00 2001 From: Hyunmin Lee Date: Wed, 1 Feb 2023 20:51:42 +0900 Subject: [PATCH 119/143] mm/vmalloc: replace BUG_ON with a simple if statement As per the coding standards, in the event of an abnormal condition that should not occur under normal circumstances, the kernel should attempt recovery and proceed with execution, rather than halting the machine. Specifically, in the alloc_vmap_area() function, use a simple if() instead of using BUG_ON() halting the machine. Link: https://lkml.kernel.org/r/20230201115142.GA7772@min-iamroot Co-developed-by: Gwan-gyeong Mun Signed-off-by: Gwan-gyeong Mun Co-developed-by: Jeungwoo Yoo Signed-off-by: Jeungwoo Yoo Co-developed-by: Sangyun Kim Signed-off-by: Sangyun Kim Signed-off-by: Hyunmin Lee Reviewed-by: Christophe Leroy Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Mike Rapoport (IBM) Reviewed-by: Uladzislau Rezki (Sony) Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Andrew Morton --- mm/vmalloc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ff4d7dfdf84ad..1dd7ca258a765 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1586,9 +1586,8 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, int purged = 0; int ret; - BUG_ON(!size); - BUG_ON(offset_in_page(size)); - BUG_ON(!is_power_of_2(align)); + if (unlikely(!size || offset_in_page(size) || !is_power_of_2(align))) + return ERR_PTR(-EINVAL); if (unlikely(!vmap_initialized)) return ERR_PTR(-EBUSY); From 487ef1c5caebca36ea4e953631c99ea3066e0687 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 31 Jan 2023 16:01:16 -0800 Subject: [PATCH 120/143] mm: introduce vm_flags_reset_once to replace WRITE_ONCE vm_flags updates Provide vm_flags_reset_once() and replace the vm_flags updates which used WRITE_ONCE() to prevent compiler optimizations. Link: https://lkml.kernel.org/r/20230201000116.1333160-1-surenb@google.com Fixes: 0cce31a0aa0e ("mm: replace vma->vm_flags direct modifications with modifier calls") Signed-off-by: Suren Baghdasaryan Reported-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Acked-by: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/mm.h | 7 +++++++ mm/mlock.c | 4 ++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 5bf0ad48faaa2..23ce04f6e91ee 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -648,6 +648,13 @@ static inline void vm_flags_reset(struct vm_area_struct *vma, vm_flags_init(vma, flags); } +static inline void vm_flags_reset_once(struct vm_area_struct *vma, + vm_flags_t flags) +{ + mmap_assert_write_locked(vma->vm_mm); + WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags); +} + static inline void vm_flags_set(struct vm_area_struct *vma, vm_flags_t flags) { diff --git a/mm/mlock.c b/mm/mlock.c index ed49459e343e5..617469fce96d2 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -380,7 +380,7 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma, */ if (newflags & VM_LOCKED) newflags |= VM_IO; - vm_flags_reset(vma, newflags); + vm_flags_reset_once(vma, newflags); lru_add_drain(); walk_page_range(vma->vm_mm, start, end, &mlock_walk_ops, NULL); @@ -388,7 +388,7 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma, if (newflags & VM_IO) { newflags &= ~VM_IO; - vm_flags_reset(vma, newflags); + vm_flags_reset_once(vma, newflags); } } From f76e1deacdcb6b53f2ae1697a54db713d819e2ee Mon Sep 17 00:00:00 2001 From: Yajun Deng Date: Fri, 3 Feb 2023 18:01:32 +0800 Subject: [PATCH 121/143] mm/page_alloc: reduce fallbacks to (MIGRATE_PCPTYPES - 1) The commit 1dd214b8f21c ("mm: page_alloc: avoid merging non-fallbackable pageblocks with others") has removed MIGRATE_CMA and MIGRATE_ISOLATE from fallbacks list. so there is no need to add an element at the end of every type. Reduce fallbacks to (MIGRATE_PCPTYPES - 1). Link: https://lkml.kernel.org/r/20230203100132.1627787-1-yajun.deng@linux.dev Signed-off-by: Yajun Deng Cc: Zi Yan Cc: Mel Gorman Cc: David Hildenbrand Cc: Vlastimil Babka Cc: Mike Rapoport Cc: Oscar Salvador Cc: Mike Rapoport Signed-off-by: Andrew Morton --- mm/page_alloc.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cd52b45565a83..1113483fa6c5e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2603,10 +2603,10 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, * * The other migratetypes do not have fallbacks. */ -static int fallbacks[MIGRATE_TYPES][3] = { - [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, - [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES }, - [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, +static int fallbacks[MIGRATE_TYPES][MIGRATE_PCPTYPES - 1] = { + [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE }, + [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE }, + [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE }, }; #ifdef CONFIG_CMA @@ -2865,11 +2865,8 @@ int find_suitable_fallback(struct free_area *area, unsigned int order, return -1; *can_steal = false; - for (i = 0;; i++) { + for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) { fallback_mt = fallbacks[migratetype][i]; - if (fallback_mt == MIGRATE_TYPES) - break; - if (free_area_empty(area, fallback_mt)) continue; From c6433757451971dafd470f947b9c8d72a0972ed8 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 30 Jan 2023 21:49:25 +0100 Subject: [PATCH 122/143] lib/stackdepot: fix setting next_slab_inited in init_stack_slab Patch series "lib/stackdepot: fixes and clean-ups". A set of fixes, comments, and clean-ups I came up with while reading the stack depot code. This patch (of 18): In commit 305e519ce48e ("lib/stackdepot.c: fix global out-of-bounds in stack_slabs"), init_stack_slab was changed to only use preallocated memory for the next slab if the slab number limit is not reached. However, setting next_slab_inited was not moved together with updating stack_slabs. Set next_slab_inited only if the preallocated memory was used for the next slab. Link: https://lkml.kernel.org/r/cover.1675111415.git.andreyknvl@google.com Link: https://lkml.kernel.org/r/9fbb4d2bf9b2676a29b120980b5ffbda8e2304ee.1675111415.git.andreyknvl@google.com Fixes: 305e519ce48e ("lib/stackdepot.c: fix global out-of-bounds in stack_slabs") Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Cc: Evgenii Stepanov Cc: Marco Elver Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/stackdepot.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 79e894cf84064..0eed9bbcf23e1 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -105,12 +105,13 @@ static bool init_stack_slab(void **prealloc) if (depot_index + 1 < STACK_ALLOC_MAX_SLABS) { stack_slabs[depot_index + 1] = *prealloc; *prealloc = NULL; + /* + * This smp_store_release pairs with smp_load_acquire() + * from |next_slab_inited| above and in + * stack_depot_save(). + */ + smp_store_release(&next_slab_inited, 1); } - /* - * This smp_store_release pairs with smp_load_acquire() from - * |next_slab_inited| above and in stack_depot_save(). - */ - smp_store_release(&next_slab_inited, 1); } return true; } From f70f6e790a3d71815ecc8549d5ab17526d58fffb Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 30 Jan 2023 21:49:26 +0100 Subject: [PATCH 123/143] lib/stackdepot: put functions in logical order Put stack depot functions' declarations and definitions in a more logical order: 1. Functions that save stack traces into stack depot. 2. Functions that fetch and print stack traces. 3. stack_depot_get_extra_bits that operates on stack depot handles and does not interact with the stack depot storage. No functional changes. Link: https://lkml.kernel.org/r/632393332c364171c69b7c054b3b2233acbfa996.1675111415.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Cc: Evgenii Stepanov Cc: Marco Elver Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/stackdepot.h | 15 +- lib/stackdepot.c | 316 ++++++++++++++++++------------------- 2 files changed, 166 insertions(+), 165 deletions(-) diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index 9ca7798d7a318..1296a6eeaec0a 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -14,17 +14,13 @@ #include typedef u32 depot_stack_handle_t; + /* * Number of bits in the handle that stack depot doesn't use. Users may store * information in them. */ #define STACK_DEPOT_EXTRA_BITS 5 -depot_stack_handle_t __stack_depot_save(unsigned long *entries, - unsigned int nr_entries, - unsigned int extra_bits, - gfp_t gfp_flags, bool can_alloc); - /* * Every user of stack depot has to call stack_depot_init() during its own init * when it's decided that it will be calling stack_depot_save() later. This is @@ -59,17 +55,22 @@ static inline void stack_depot_want_early_init(void) { } static inline int stack_depot_early_init(void) { return 0; } #endif +depot_stack_handle_t __stack_depot_save(unsigned long *entries, + unsigned int nr_entries, + unsigned int extra_bits, + gfp_t gfp_flags, bool can_alloc); + depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int nr_entries, gfp_t gfp_flags); unsigned int stack_depot_fetch(depot_stack_handle_t handle, unsigned long **entries); -unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle); +void stack_depot_print(depot_stack_handle_t stack); int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size, int spaces); -void stack_depot_print(depot_stack_handle_t stack); +unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle); #endif diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 0eed9bbcf23e1..23d2a68a587b0 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -79,85 +79,6 @@ static int next_slab_inited; static size_t depot_offset; static DEFINE_RAW_SPINLOCK(depot_lock); -unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle) -{ - union handle_parts parts = { .handle = handle }; - - return parts.extra; -} -EXPORT_SYMBOL(stack_depot_get_extra_bits); - -static bool init_stack_slab(void **prealloc) -{ - if (!*prealloc) - return false; - /* - * This smp_load_acquire() pairs with smp_store_release() to - * |next_slab_inited| below and in depot_alloc_stack(). - */ - if (smp_load_acquire(&next_slab_inited)) - return true; - if (stack_slabs[depot_index] == NULL) { - stack_slabs[depot_index] = *prealloc; - *prealloc = NULL; - } else { - /* If this is the last depot slab, do not touch the next one. */ - if (depot_index + 1 < STACK_ALLOC_MAX_SLABS) { - stack_slabs[depot_index + 1] = *prealloc; - *prealloc = NULL; - /* - * This smp_store_release pairs with smp_load_acquire() - * from |next_slab_inited| above and in - * stack_depot_save(). - */ - smp_store_release(&next_slab_inited, 1); - } - } - return true; -} - -/* Allocation of a new stack in raw storage */ -static struct stack_record * -depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) -{ - struct stack_record *stack; - size_t required_size = struct_size(stack, entries, size); - - required_size = ALIGN(required_size, 1 << STACK_ALLOC_ALIGN); - - if (unlikely(depot_offset + required_size > STACK_ALLOC_SIZE)) { - if (unlikely(depot_index + 1 >= STACK_ALLOC_MAX_SLABS)) { - WARN_ONCE(1, "Stack depot reached limit capacity"); - return NULL; - } - depot_index++; - depot_offset = 0; - /* - * smp_store_release() here pairs with smp_load_acquire() from - * |next_slab_inited| in stack_depot_save() and - * init_stack_slab(). - */ - if (depot_index + 1 < STACK_ALLOC_MAX_SLABS) - smp_store_release(&next_slab_inited, 0); - } - init_stack_slab(prealloc); - if (stack_slabs[depot_index] == NULL) - return NULL; - - stack = stack_slabs[depot_index] + depot_offset; - - stack->hash = hash; - stack->size = size; - stack->handle.slabindex = depot_index; - stack->handle.offset = depot_offset >> STACK_ALLOC_ALIGN; - stack->handle.valid = 1; - stack->handle.extra = 0; - memcpy(stack->entries, entries, flex_array_size(stack, entries, size)); - depot_offset += required_size; - - return stack; -} - /* one hash table bucket entry per 16kB of memory */ #define STACK_HASH_SCALE 14 /* limited between 4k and 1M buckets */ @@ -271,6 +192,77 @@ int stack_depot_init(void) } EXPORT_SYMBOL_GPL(stack_depot_init); +static bool init_stack_slab(void **prealloc) +{ + if (!*prealloc) + return false; + /* + * This smp_load_acquire() pairs with smp_store_release() to + * |next_slab_inited| below and in depot_alloc_stack(). + */ + if (smp_load_acquire(&next_slab_inited)) + return true; + if (stack_slabs[depot_index] == NULL) { + stack_slabs[depot_index] = *prealloc; + *prealloc = NULL; + } else { + /* If this is the last depot slab, do not touch the next one. */ + if (depot_index + 1 < STACK_ALLOC_MAX_SLABS) { + stack_slabs[depot_index + 1] = *prealloc; + *prealloc = NULL; + /* + * This smp_store_release pairs with smp_load_acquire() + * from |next_slab_inited| above and in + * stack_depot_save(). + */ + smp_store_release(&next_slab_inited, 1); + } + } + return true; +} + +/* Allocation of a new stack in raw storage */ +static struct stack_record * +depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) +{ + struct stack_record *stack; + size_t required_size = struct_size(stack, entries, size); + + required_size = ALIGN(required_size, 1 << STACK_ALLOC_ALIGN); + + if (unlikely(depot_offset + required_size > STACK_ALLOC_SIZE)) { + if (unlikely(depot_index + 1 >= STACK_ALLOC_MAX_SLABS)) { + WARN_ONCE(1, "Stack depot reached limit capacity"); + return NULL; + } + depot_index++; + depot_offset = 0; + /* + * smp_store_release() here pairs with smp_load_acquire() from + * |next_slab_inited| in stack_depot_save() and + * init_stack_slab(). + */ + if (depot_index + 1 < STACK_ALLOC_MAX_SLABS) + smp_store_release(&next_slab_inited, 0); + } + init_stack_slab(prealloc); + if (stack_slabs[depot_index] == NULL) + return NULL; + + stack = stack_slabs[depot_index] + depot_offset; + + stack->hash = hash; + stack->size = size; + stack->handle.slabindex = depot_index; + stack->handle.offset = depot_offset >> STACK_ALLOC_ALIGN; + stack->handle.valid = 1; + stack->handle.extra = 0; + memcpy(stack->entries, entries, flex_array_size(stack, entries, size)); + depot_offset += required_size; + + return stack; +} + /* Calculate hash for a stack */ static inline u32 hash_stack(unsigned long *entries, unsigned int size) { @@ -310,85 +302,6 @@ static inline struct stack_record *find_stack(struct stack_record *bucket, return NULL; } -/** - * stack_depot_snprint - print stack entries from a depot into a buffer - * - * @handle: Stack depot handle which was returned from - * stack_depot_save(). - * @buf: Pointer to the print buffer - * - * @size: Size of the print buffer - * - * @spaces: Number of leading spaces to print - * - * Return: Number of bytes printed. - */ -int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size, - int spaces) -{ - unsigned long *entries; - unsigned int nr_entries; - - nr_entries = stack_depot_fetch(handle, &entries); - return nr_entries ? stack_trace_snprint(buf, size, entries, nr_entries, - spaces) : 0; -} -EXPORT_SYMBOL_GPL(stack_depot_snprint); - -/** - * stack_depot_print - print stack entries from a depot - * - * @stack: Stack depot handle which was returned from - * stack_depot_save(). - * - */ -void stack_depot_print(depot_stack_handle_t stack) -{ - unsigned long *entries; - unsigned int nr_entries; - - nr_entries = stack_depot_fetch(stack, &entries); - if (nr_entries > 0) - stack_trace_print(entries, nr_entries, 0); -} -EXPORT_SYMBOL_GPL(stack_depot_print); - -/** - * stack_depot_fetch - Fetch stack entries from a depot - * - * @handle: Stack depot handle which was returned from - * stack_depot_save(). - * @entries: Pointer to store the entries address - * - * Return: The number of trace entries for this depot. - */ -unsigned int stack_depot_fetch(depot_stack_handle_t handle, - unsigned long **entries) -{ - union handle_parts parts = { .handle = handle }; - void *slab; - size_t offset = parts.offset << STACK_ALLOC_ALIGN; - struct stack_record *stack; - - *entries = NULL; - if (!handle) - return 0; - - if (parts.slabindex > depot_index) { - WARN(1, "slab index %d out of bounds (%d) for stack id %08x\n", - parts.slabindex, depot_index, handle); - return 0; - } - slab = stack_slabs[parts.slabindex]; - if (!slab) - return 0; - stack = slab + offset; - - *entries = stack->entries; - return stack->size; -} -EXPORT_SYMBOL_GPL(stack_depot_fetch); - /** * __stack_depot_save - Save a stack trace from an array * @@ -534,3 +447,90 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries, return __stack_depot_save(entries, nr_entries, 0, alloc_flags, true); } EXPORT_SYMBOL_GPL(stack_depot_save); + +/** + * stack_depot_fetch - Fetch stack entries from a depot + * + * @handle: Stack depot handle which was returned from + * stack_depot_save(). + * @entries: Pointer to store the entries address + * + * Return: The number of trace entries for this depot. + */ +unsigned int stack_depot_fetch(depot_stack_handle_t handle, + unsigned long **entries) +{ + union handle_parts parts = { .handle = handle }; + void *slab; + size_t offset = parts.offset << STACK_ALLOC_ALIGN; + struct stack_record *stack; + + *entries = NULL; + if (!handle) + return 0; + + if (parts.slabindex > depot_index) { + WARN(1, "slab index %d out of bounds (%d) for stack id %08x\n", + parts.slabindex, depot_index, handle); + return 0; + } + slab = stack_slabs[parts.slabindex]; + if (!slab) + return 0; + stack = slab + offset; + + *entries = stack->entries; + return stack->size; +} +EXPORT_SYMBOL_GPL(stack_depot_fetch); + +/** + * stack_depot_print - print stack entries from a depot + * + * @stack: Stack depot handle which was returned from + * stack_depot_save(). + * + */ +void stack_depot_print(depot_stack_handle_t stack) +{ + unsigned long *entries; + unsigned int nr_entries; + + nr_entries = stack_depot_fetch(stack, &entries); + if (nr_entries > 0) + stack_trace_print(entries, nr_entries, 0); +} +EXPORT_SYMBOL_GPL(stack_depot_print); + +/** + * stack_depot_snprint - print stack entries from a depot into a buffer + * + * @handle: Stack depot handle which was returned from + * stack_depot_save(). + * @buf: Pointer to the print buffer + * + * @size: Size of the print buffer + * + * @spaces: Number of leading spaces to print + * + * Return: Number of bytes printed. + */ +int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size, + int spaces) +{ + unsigned long *entries; + unsigned int nr_entries; + + nr_entries = stack_depot_fetch(handle, &entries); + return nr_entries ? stack_trace_snprint(buf, size, entries, nr_entries, + spaces) : 0; +} +EXPORT_SYMBOL_GPL(stack_depot_snprint); + +unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle) +{ + union handle_parts parts = { .handle = handle }; + + return parts.extra; +} +EXPORT_SYMBOL(stack_depot_get_extra_bits); From f43641a81ba80cb5d49bd5c6a80c65878dbf7b4b Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 30 Jan 2023 21:49:27 +0100 Subject: [PATCH 124/143] lib/stackdepot: use pr_fmt to define message format Use pr_fmt to define the format for printing stack depot messages instead of duplicating the "Stack Depot" prefix in each message. Link: https://lkml.kernel.org/r/3600069e0b0b3df602999ec8a2d4fc14fcc56a01.1675111415.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Cc: Evgenii Stepanov Cc: Marco Elver Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/stackdepot.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 23d2a68a587b0..90c4dd48d75e2 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -19,6 +19,8 @@ * Based on code by Dmitry Chernenkov. */ +#define pr_fmt(fmt) "stackdepot: " fmt + #include #include #include @@ -98,7 +100,7 @@ static int __init is_stack_depot_disabled(char *str) ret = kstrtobool(str, &stack_depot_disable); if (!ret && stack_depot_disable) { - pr_info("Stack Depot is disabled\n"); + pr_info("disabled\n"); stack_table = NULL; } return 0; @@ -142,7 +144,7 @@ int __init stack_depot_early_init(void) 1UL << STACK_HASH_ORDER_MAX); if (!stack_table) { - pr_err("Stack Depot hash table allocation failed, disabling\n"); + pr_err("hash table allocation failed, disabling\n"); stack_depot_disable = true; return -ENOMEM; } @@ -177,11 +179,11 @@ int stack_depot_init(void) if (entries > 1UL << STACK_HASH_ORDER_MAX) entries = 1UL << STACK_HASH_ORDER_MAX; - pr_info("Stack Depot allocating hash table of %lu entries with kvcalloc\n", + pr_info("allocating hash table of %lu entries with kvcalloc\n", entries); stack_table = kvcalloc(entries, sizeof(struct stack_record *), GFP_KERNEL); if (!stack_table) { - pr_err("Stack Depot hash table allocation failed, disabling\n"); + pr_err("hash table allocation failed, disabling\n"); stack_depot_disable = true; ret = -ENOMEM; } From 9da8ad32a1dcde5c58372c763a0ce9f8b4b726f8 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 30 Jan 2023 21:49:28 +0100 Subject: [PATCH 125/143] lib/stackdepot, mm: rename stack_depot_want_early_init Rename stack_depot_want_early_init to stack_depot_request_early_init. The old name is confusing, as it hints at returning some kind of intention of stack depot. The new name reflects that this function requests an action from stack depot instead. No functional changes. Link: https://lkml.kernel.org/r/cb34925852c81be2ec6aac75766292e4e590523e.1675111415.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Cc: Evgenii Stepanov Cc: Marco Elver Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/stackdepot.h | 14 +++++++------- lib/stackdepot.c | 10 +++++----- mm/page_owner.c | 2 +- mm/slub.c | 4 ++-- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index 1296a6eeaec0a..c4e3abc16b16a 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -31,26 +31,26 @@ typedef u32 depot_stack_handle_t; * enabled as part of mm_init(), for subsystems where it's known at compile time * that stack depot will be used. * - * Another alternative is to call stack_depot_want_early_init(), when the + * Another alternative is to call stack_depot_request_early_init(), when the * decision to use stack depot is taken e.g. when evaluating kernel boot * parameters, which precedes the enablement point in mm_init(). * - * stack_depot_init() and stack_depot_want_early_init() can be called regardless - * of CONFIG_STACKDEPOT and are no-op when disabled. The actual save/fetch/print - * functions should only be called from code that makes sure CONFIG_STACKDEPOT - * is enabled. + * stack_depot_init() and stack_depot_request_early_init() can be called + * regardless of CONFIG_STACKDEPOT and are no-op when disabled. The actual + * save/fetch/print functions should only be called from code that makes sure + * CONFIG_STACKDEPOT is enabled. */ #ifdef CONFIG_STACKDEPOT int stack_depot_init(void); -void __init stack_depot_want_early_init(void); +void __init stack_depot_request_early_init(void); /* This is supposed to be called only from mm_init() */ int __init stack_depot_early_init(void); #else static inline int stack_depot_init(void) { return 0; } -static inline void stack_depot_want_early_init(void) { } +static inline void stack_depot_request_early_init(void) { } static inline int stack_depot_early_init(void) { return 0; } #endif diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 90c4dd48d75e2..8743fad1485f0 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -71,7 +71,7 @@ struct stack_record { unsigned long entries[]; /* Variable-sized array of entries. */ }; -static bool __stack_depot_want_early_init __initdata = IS_ENABLED(CONFIG_STACKDEPOT_ALWAYS_INIT); +static bool __stack_depot_early_init_requested __initdata = IS_ENABLED(CONFIG_STACKDEPOT_ALWAYS_INIT); static bool __stack_depot_early_init_passed __initdata; static void *stack_slabs[STACK_ALLOC_MAX_SLABS]; @@ -107,12 +107,12 @@ static int __init is_stack_depot_disabled(char *str) } early_param("stack_depot_disable", is_stack_depot_disabled); -void __init stack_depot_want_early_init(void) +void __init stack_depot_request_early_init(void) { - /* Too late to request early init now */ + /* Too late to request early init now. */ WARN_ON(__stack_depot_early_init_passed); - __stack_depot_want_early_init = true; + __stack_depot_early_init_requested = true; } int __init stack_depot_early_init(void) @@ -128,7 +128,7 @@ int __init stack_depot_early_init(void) if (kasan_enabled() && !stack_hash_order) stack_hash_order = STACK_HASH_ORDER_MAX; - if (!__stack_depot_want_early_init || stack_depot_disable) + if (!__stack_depot_early_init_requested || stack_depot_disable) return 0; if (stack_hash_order) diff --git a/mm/page_owner.c b/mm/page_owner.c index 2d27f532df4c1..90a4a087e6c7e 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -48,7 +48,7 @@ static int __init early_page_owner_param(char *buf) int ret = kstrtobool(buf, &page_owner_enabled); if (page_owner_enabled) - stack_depot_want_early_init(); + stack_depot_request_early_init(); return ret; } diff --git a/mm/slub.c b/mm/slub.c index 13459c69095a2..f2c6c356bc360 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1592,7 +1592,7 @@ static int __init setup_slub_debug(char *str) } else { slab_list_specified = true; if (flags & SLAB_STORE_USER) - stack_depot_want_early_init(); + stack_depot_request_early_init(); } } @@ -1611,7 +1611,7 @@ static int __init setup_slub_debug(char *str) out: slub_debug = global_flags; if (slub_debug & SLAB_STORE_USER) - stack_depot_want_early_init(); + stack_depot_request_early_init(); if (slub_debug != 0 || slub_debug_string) static_branch_enable(&slub_debug_enabled); else From 25290a241a07163bc9b9900362e2706fb1a1c53f Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 30 Jan 2023 17:43:08 -0800 Subject: [PATCH 126/143] lib-stackdepot-mm-rename-stack_depot_want_early_init-fix fix it for "mm: use stack_depot_early_init for kmemleak" Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Evgenii Stepanov Cc: Marco Elver Cc: Vlastimil Babka Cc: Zhaoyang Huang Signed-off-by: Andrew Morton --- mm/kmemleak.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 92f670edbf518..2d4f685f77ea3 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -104,6 +104,9 @@ #include #include +/* akpm: stupid temporary hack */ +#define stack_depot_want_early_init stack_depot_request_early_init + /* * Kmemleak configuration and common defines. */ From 1231d34b4c8d2d4fd80044d9e64e69808b1810a7 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 30 Jan 2023 21:49:29 +0100 Subject: [PATCH 127/143] lib/stackdepot: rename stack_depot_disable Rename stack_depot_disable to stack_depot_disabled to make its name look similar to the names of other stack depot flags. Also put stack_depot_disabled's definition together with the other flags. Also rename is_stack_depot_disabled to disable_stack_depot: this name looks more conventional for a function that processes a boot parameter. No functional changes. Link: https://lkml.kernel.org/r/293567627b0d59f1ae5a27ac9537c027a5ff729d.1675111415.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Cc: Evgenii Stepanov Cc: Marco Elver Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/stackdepot.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 8743fad1485f0..6e8aef12cf890 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -71,6 +71,7 @@ struct stack_record { unsigned long entries[]; /* Variable-sized array of entries. */ }; +static bool stack_depot_disabled; static bool __stack_depot_early_init_requested __initdata = IS_ENABLED(CONFIG_STACKDEPOT_ALWAYS_INIT); static bool __stack_depot_early_init_passed __initdata; @@ -91,21 +92,20 @@ static DEFINE_RAW_SPINLOCK(depot_lock); static unsigned int stack_hash_order; static unsigned int stack_hash_mask; -static bool stack_depot_disable; static struct stack_record **stack_table; -static int __init is_stack_depot_disabled(char *str) +static int __init disable_stack_depot(char *str) { int ret; - ret = kstrtobool(str, &stack_depot_disable); - if (!ret && stack_depot_disable) { + ret = kstrtobool(str, &stack_depot_disabled); + if (!ret && stack_depot_disabled) { pr_info("disabled\n"); stack_table = NULL; } return 0; } -early_param("stack_depot_disable", is_stack_depot_disabled); +early_param("stack_depot_disable", disable_stack_depot); void __init stack_depot_request_early_init(void) { @@ -128,7 +128,7 @@ int __init stack_depot_early_init(void) if (kasan_enabled() && !stack_hash_order) stack_hash_order = STACK_HASH_ORDER_MAX; - if (!__stack_depot_early_init_requested || stack_depot_disable) + if (!__stack_depot_early_init_requested || stack_depot_disabled) return 0; if (stack_hash_order) @@ -145,7 +145,7 @@ int __init stack_depot_early_init(void) if (!stack_table) { pr_err("hash table allocation failed, disabling\n"); - stack_depot_disable = true; + stack_depot_disabled = true; return -ENOMEM; } @@ -158,7 +158,7 @@ int stack_depot_init(void) int ret = 0; mutex_lock(&stack_depot_init_mutex); - if (!stack_depot_disable && !stack_table) { + if (!stack_depot_disabled && !stack_table) { unsigned long entries; int scale = STACK_HASH_SCALE; @@ -184,7 +184,7 @@ int stack_depot_init(void) stack_table = kvcalloc(entries, sizeof(struct stack_record *), GFP_KERNEL); if (!stack_table) { pr_err("hash table allocation failed, disabling\n"); - stack_depot_disable = true; + stack_depot_disabled = true; ret = -ENOMEM; } stack_hash_mask = entries - 1; @@ -354,7 +354,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, */ nr_entries = filter_irq_stacks(entries, nr_entries); - if (unlikely(nr_entries == 0) || stack_depot_disable) + if (unlikely(nr_entries == 0) || stack_depot_disabled) goto fast_exit; hash = hash_stack(entries, nr_entries); From 5dbe11ac31cc23ec46c0c6791f6eb15bba8ef3b7 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 30 Jan 2023 21:49:30 +0100 Subject: [PATCH 128/143] lib/stackdepot: annotate init and early init functions Add comments to stack_depot_early_init and stack_depot_init to explain certain parts of their implementation. Also add a pr_info message to stack_depot_early_init similar to the one in stack_depot_init. Also move the scale variable in stack_depot_init to the scope where it is being used. Link: https://lkml.kernel.org/r/be09b64fb196ffe0c19ce7afc4130efba5425df9.1675111415.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Cc: Evgenii Stepanov Cc: Marco Elver Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/stackdepot.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 6e8aef12cf890..b06f6a5caa833 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -115,24 +115,34 @@ void __init stack_depot_request_early_init(void) __stack_depot_early_init_requested = true; } +/* Allocates a hash table via memblock. Can only be used during early boot. */ int __init stack_depot_early_init(void) { unsigned long entries = 0; - /* This is supposed to be called only once, from mm_init() */ + /* This function must be called only once, from mm_init(). */ if (WARN_ON(__stack_depot_early_init_passed)) return 0; - __stack_depot_early_init_passed = true; + /* + * If KASAN is enabled, use the maximum order: KASAN is frequently used + * in fuzzing scenarios, which leads to a large number of different + * stack traces being stored in stack depot. + */ if (kasan_enabled() && !stack_hash_order) stack_hash_order = STACK_HASH_ORDER_MAX; if (!__stack_depot_early_init_requested || stack_depot_disabled) return 0; + /* + * If stack_hash_order is not set, leave entries as 0 to rely on the + * automatic calculations performed by alloc_large_system_hash. + */ if (stack_hash_order) - entries = 1UL << stack_hash_order; + entries = 1UL << stack_hash_order; + pr_info("allocating hash table via alloc_large_system_hash\n"); stack_table = alloc_large_system_hash("stackdepot", sizeof(struct stack_record *), entries, @@ -142,7 +152,6 @@ int __init stack_depot_early_init(void) &stack_hash_mask, 1UL << STACK_HASH_ORDER_MIN, 1UL << STACK_HASH_ORDER_MAX); - if (!stack_table) { pr_err("hash table allocation failed, disabling\n"); stack_depot_disabled = true; @@ -152,6 +161,7 @@ int __init stack_depot_early_init(void) return 0; } +/* Allocates a hash table via kvmalloc. Can be used after boot. */ int stack_depot_init(void) { static DEFINE_MUTEX(stack_depot_init_mutex); @@ -160,11 +170,16 @@ int stack_depot_init(void) mutex_lock(&stack_depot_init_mutex); if (!stack_depot_disabled && !stack_table) { unsigned long entries; - int scale = STACK_HASH_SCALE; + /* + * Similarly to stack_depot_early_init, use stack_hash_order + * if assigned, and rely on automatic scaling otherwise. + */ if (stack_hash_order) { entries = 1UL << stack_hash_order; } else { + int scale = STACK_HASH_SCALE; + entries = nr_free_buffer_pages(); entries = roundup_pow_of_two(entries); @@ -179,7 +194,7 @@ int stack_depot_init(void) if (entries > 1UL << STACK_HASH_ORDER_MAX) entries = 1UL << STACK_HASH_ORDER_MAX; - pr_info("allocating hash table of %lu entries with kvcalloc\n", + pr_info("allocating hash table of %lu entries via kvcalloc\n", entries); stack_table = kvcalloc(entries, sizeof(struct stack_record *), GFP_KERNEL); if (!stack_table) { From 6b9449681861ac2fb698034cd0229531eb95c8e1 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 30 Jan 2023 21:49:31 +0100 Subject: [PATCH 129/143] lib/stackdepot: lower the indentation in stack_depot_init stack_depot_init does most things inside an if check. Move them out and use a goto statement instead. No functional changes. Link: https://lkml.kernel.org/r/eb6f0a014b8d0bfa73a8bbd358c627dc66cf51b7.1675111415.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Cc: Evgenii Stepanov Cc: Marco Elver Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/stackdepot.c | 70 +++++++++++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 33 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index b06f6a5caa833..cb098bc992863 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -165,46 +165,50 @@ int __init stack_depot_early_init(void) int stack_depot_init(void) { static DEFINE_MUTEX(stack_depot_init_mutex); + unsigned long entries; int ret = 0; mutex_lock(&stack_depot_init_mutex); - if (!stack_depot_disabled && !stack_table) { - unsigned long entries; - /* - * Similarly to stack_depot_early_init, use stack_hash_order - * if assigned, and rely on automatic scaling otherwise. - */ - if (stack_hash_order) { - entries = 1UL << stack_hash_order; - } else { - int scale = STACK_HASH_SCALE; - - entries = nr_free_buffer_pages(); - entries = roundup_pow_of_two(entries); - - if (scale > PAGE_SHIFT) - entries >>= (scale - PAGE_SHIFT); - else - entries <<= (PAGE_SHIFT - scale); - } + if (stack_depot_disabled || stack_table) + goto out_unlock; - if (entries < 1UL << STACK_HASH_ORDER_MIN) - entries = 1UL << STACK_HASH_ORDER_MIN; - if (entries > 1UL << STACK_HASH_ORDER_MAX) - entries = 1UL << STACK_HASH_ORDER_MAX; - - pr_info("allocating hash table of %lu entries via kvcalloc\n", - entries); - stack_table = kvcalloc(entries, sizeof(struct stack_record *), GFP_KERNEL); - if (!stack_table) { - pr_err("hash table allocation failed, disabling\n"); - stack_depot_disabled = true; - ret = -ENOMEM; - } - stack_hash_mask = entries - 1; + /* + * Similarly to stack_depot_early_init, use stack_hash_order + * if assigned, and rely on automatic scaling otherwise. + */ + if (stack_hash_order) { + entries = 1UL << stack_hash_order; + } else { + int scale = STACK_HASH_SCALE; + + entries = nr_free_buffer_pages(); + entries = roundup_pow_of_two(entries); + + if (scale > PAGE_SHIFT) + entries >>= (scale - PAGE_SHIFT); + else + entries <<= (PAGE_SHIFT - scale); } + + if (entries < 1UL << STACK_HASH_ORDER_MIN) + entries = 1UL << STACK_HASH_ORDER_MIN; + if (entries > 1UL << STACK_HASH_ORDER_MAX) + entries = 1UL << STACK_HASH_ORDER_MAX; + + pr_info("allocating hash table of %lu entries via kvcalloc\n", entries); + stack_table = kvcalloc(entries, sizeof(struct stack_record *), GFP_KERNEL); + if (!stack_table) { + pr_err("hash table allocation failed, disabling\n"); + stack_depot_disabled = true; + ret = -ENOMEM; + goto out_unlock; + } + stack_hash_mask = entries - 1; + +out_unlock: mutex_unlock(&stack_depot_init_mutex); + return ret; } EXPORT_SYMBOL_GPL(stack_depot_init); From 7bc1983d87690058ffad0ed5a93863058d4125db Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 30 Jan 2023 21:49:32 +0100 Subject: [PATCH 130/143] lib/stackdepot: reorder and annotate global variables Group stack depot global variables by their purpose: 1. Hash table-related variables, 2. Slab-related variables, and add comments. Also clean up comments for hash table-related constants. Link: https://lkml.kernel.org/r/4ed1d0828e837e15566a7cfa7688a47006e3f4b3.1675111415.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Cc: Evgenii Stepanov Cc: Marco Elver Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/stackdepot.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index cb098bc992863..89aee133303a5 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -75,24 +75,31 @@ static bool stack_depot_disabled; static bool __stack_depot_early_init_requested __initdata = IS_ENABLED(CONFIG_STACKDEPOT_ALWAYS_INIT); static bool __stack_depot_early_init_passed __initdata; -static void *stack_slabs[STACK_ALLOC_MAX_SLABS]; - -static int depot_index; -static int next_slab_inited; -static size_t depot_offset; -static DEFINE_RAW_SPINLOCK(depot_lock); - -/* one hash table bucket entry per 16kB of memory */ +/* Use one hash table bucket per 16 KB of memory. */ #define STACK_HASH_SCALE 14 -/* limited between 4k and 1M buckets */ +/* Limit the number of buckets between 4K and 1M. */ #define STACK_HASH_ORDER_MIN 12 #define STACK_HASH_ORDER_MAX 20 +/* Initial seed for jhash2. */ #define STACK_HASH_SEED 0x9747b28c +/* Hash table of pointers to stored stack traces. */ +static struct stack_record **stack_table; +/* Fixed order of the number of table buckets. Used when KASAN is enabled. */ static unsigned int stack_hash_order; +/* Hash mask for indexing the table. */ static unsigned int stack_hash_mask; -static struct stack_record **stack_table; +/* Array of memory regions that store stack traces. */ +static void *stack_slabs[STACK_ALLOC_MAX_SLABS]; +/* Currently used slab in stack_slabs. */ +static int depot_index; +/* Offset to the unused space in the currently used slab. */ +static size_t depot_offset; +/* Lock that protects the variables above. */ +static DEFINE_RAW_SPINLOCK(depot_lock); +/* Whether the next slab is initialized. */ +static int next_slab_inited; static int __init disable_stack_depot(char *str) { From ccb9a9980b6eb2b8e71df051771df2ae3fc85678 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 30 Jan 2023 21:49:33 +0100 Subject: [PATCH 131/143] lib/stackdepot: rename hash table constants and variables Give more meaningful names to hash table-related constants and variables: 1. Rename STACK_HASH_SCALE to STACK_TABLE_SCALE to point out that it is related to scaling the hash table. 2. Rename STACK_HASH_ORDER_MIN/MAX to STACK_BUCKET_NUMBER_ORDER_MIN/MAX to point out that it is related to the number of hash table buckets. 3. Rename stack_hash_order to stack_bucket_number_order for the same reason as #2. No functional changes. Link: https://lkml.kernel.org/r/5456286e2c9f3cd5abf25ad2e7e60dc997c71f66.1675111415.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Cc: Evgenii Stepanov Cc: Marco Elver Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/stackdepot.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 89aee133303a5..cddcf029e3073 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -76,17 +76,17 @@ static bool __stack_depot_early_init_requested __initdata = IS_ENABLED(CONFIG_ST static bool __stack_depot_early_init_passed __initdata; /* Use one hash table bucket per 16 KB of memory. */ -#define STACK_HASH_SCALE 14 +#define STACK_TABLE_SCALE 14 /* Limit the number of buckets between 4K and 1M. */ -#define STACK_HASH_ORDER_MIN 12 -#define STACK_HASH_ORDER_MAX 20 +#define STACK_BUCKET_NUMBER_ORDER_MIN 12 +#define STACK_BUCKET_NUMBER_ORDER_MAX 20 /* Initial seed for jhash2. */ #define STACK_HASH_SEED 0x9747b28c /* Hash table of pointers to stored stack traces. */ static struct stack_record **stack_table; /* Fixed order of the number of table buckets. Used when KASAN is enabled. */ -static unsigned int stack_hash_order; +static unsigned int stack_bucket_number_order; /* Hash mask for indexing the table. */ static unsigned int stack_hash_mask; @@ -137,28 +137,28 @@ int __init stack_depot_early_init(void) * in fuzzing scenarios, which leads to a large number of different * stack traces being stored in stack depot. */ - if (kasan_enabled() && !stack_hash_order) - stack_hash_order = STACK_HASH_ORDER_MAX; + if (kasan_enabled() && !stack_bucket_number_order) + stack_bucket_number_order = STACK_BUCKET_NUMBER_ORDER_MAX; if (!__stack_depot_early_init_requested || stack_depot_disabled) return 0; /* - * If stack_hash_order is not set, leave entries as 0 to rely on the - * automatic calculations performed by alloc_large_system_hash. + * If stack_bucket_number_order is not set, leave entries as 0 to rely + * on the automatic calculations performed by alloc_large_system_hash. */ - if (stack_hash_order) - entries = 1UL << stack_hash_order; + if (stack_bucket_number_order) + entries = 1UL << stack_bucket_number_order; pr_info("allocating hash table via alloc_large_system_hash\n"); stack_table = alloc_large_system_hash("stackdepot", sizeof(struct stack_record *), entries, - STACK_HASH_SCALE, + STACK_TABLE_SCALE, HASH_EARLY | HASH_ZERO, NULL, &stack_hash_mask, - 1UL << STACK_HASH_ORDER_MIN, - 1UL << STACK_HASH_ORDER_MAX); + 1UL << STACK_BUCKET_NUMBER_ORDER_MIN, + 1UL << STACK_BUCKET_NUMBER_ORDER_MAX); if (!stack_table) { pr_err("hash table allocation failed, disabling\n"); stack_depot_disabled = true; @@ -181,13 +181,13 @@ int stack_depot_init(void) goto out_unlock; /* - * Similarly to stack_depot_early_init, use stack_hash_order + * Similarly to stack_depot_early_init, use stack_bucket_number_order * if assigned, and rely on automatic scaling otherwise. */ - if (stack_hash_order) { - entries = 1UL << stack_hash_order; + if (stack_bucket_number_order) { + entries = 1UL << stack_bucket_number_order; } else { - int scale = STACK_HASH_SCALE; + int scale = STACK_TABLE_SCALE; entries = nr_free_buffer_pages(); entries = roundup_pow_of_two(entries); @@ -198,10 +198,10 @@ int stack_depot_init(void) entries <<= (PAGE_SHIFT - scale); } - if (entries < 1UL << STACK_HASH_ORDER_MIN) - entries = 1UL << STACK_HASH_ORDER_MIN; - if (entries > 1UL << STACK_HASH_ORDER_MAX) - entries = 1UL << STACK_HASH_ORDER_MAX; + if (entries < 1UL << STACK_BUCKET_NUMBER_ORDER_MIN) + entries = 1UL << STACK_BUCKET_NUMBER_ORDER_MIN; + if (entries > 1UL << STACK_BUCKET_NUMBER_ORDER_MAX) + entries = 1UL << STACK_BUCKET_NUMBER_ORDER_MAX; pr_info("allocating hash table of %lu entries via kvcalloc\n", entries); stack_table = kvcalloc(entries, sizeof(struct stack_record *), GFP_KERNEL); From f191e21a9d923fabaafc42a24dc2dbbe91a419db Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 30 Jan 2023 21:49:34 +0100 Subject: [PATCH 132/143] lib/stackdepot: rename init_stack_slab Rename init_stack_slab to depot_init_slab to align the name with depot_alloc_stack. No functional changes. Link: https://lkml.kernel.org/r/b756e381a3526c6e59cb68c53ac0f172ddd22776.1675111415.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Cc: Evgenii Stepanov Cc: Marco Elver Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/stackdepot.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index cddcf029e3073..69b9316b0d4b5 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -220,7 +220,7 @@ int stack_depot_init(void) } EXPORT_SYMBOL_GPL(stack_depot_init); -static bool init_stack_slab(void **prealloc) +static bool depot_init_slab(void **prealloc) { if (!*prealloc) return false; @@ -268,12 +268,12 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) /* * smp_store_release() here pairs with smp_load_acquire() from * |next_slab_inited| in stack_depot_save() and - * init_stack_slab(). + * depot_init_slab(). */ if (depot_index + 1 < STACK_ALLOC_MAX_SLABS) smp_store_release(&next_slab_inited, 0); } - init_stack_slab(prealloc); + depot_init_slab(prealloc); if (stack_slabs[depot_index] == NULL) return NULL; @@ -402,7 +402,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, * lock. * * The smp_load_acquire() here pairs with smp_store_release() to - * |next_slab_inited| in depot_alloc_stack() and init_stack_slab(). + * |next_slab_inited| in depot_alloc_stack() and depot_init_slab(). */ if (unlikely(can_alloc && !smp_load_acquire(&next_slab_inited))) { /* @@ -438,7 +438,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, * We didn't need to store this stack trace, but let's keep * the preallocated memory for the future. */ - WARN_ON(!init_stack_slab(&prealloc)); + WARN_ON(!depot_init_slab(&prealloc)); } raw_spin_unlock_irqrestore(&depot_lock, flags); From a1512968580ae4f65c4d73be8ad299be62d49c6d Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 30 Jan 2023 21:49:35 +0100 Subject: [PATCH 133/143] lib/stackdepot: rename slab variables Give better names to slab-related global variables: change "depot_" prefix to "slab_" to point out that these variables are related to stack depot slabs. Also rename the slabindex field in handle_parts to align its name with the slab_index global variable. No functional changes. Link: https://lkml.kernel.org/r/fc73ab8b1469d476363a918cbdfe28e1388c043a.1675111415.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Evgenii Stepanov Cc: Marco Elver Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/stackdepot.c | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 69b9316b0d4b5..023f299bedf6c 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -56,7 +56,7 @@ union handle_parts { depot_stack_handle_t handle; struct { - u32 slabindex : STACK_ALLOC_INDEX_BITS; + u32 slab_index : STACK_ALLOC_INDEX_BITS; u32 offset : STACK_ALLOC_OFFSET_BITS; u32 valid : STACK_ALLOC_NULL_PROTECTION_BITS; u32 extra : STACK_DEPOT_EXTRA_BITS; @@ -93,11 +93,11 @@ static unsigned int stack_hash_mask; /* Array of memory regions that store stack traces. */ static void *stack_slabs[STACK_ALLOC_MAX_SLABS]; /* Currently used slab in stack_slabs. */ -static int depot_index; +static int slab_index; /* Offset to the unused space in the currently used slab. */ -static size_t depot_offset; +static size_t slab_offset; /* Lock that protects the variables above. */ -static DEFINE_RAW_SPINLOCK(depot_lock); +static DEFINE_RAW_SPINLOCK(slab_lock); /* Whether the next slab is initialized. */ static int next_slab_inited; @@ -230,13 +230,13 @@ static bool depot_init_slab(void **prealloc) */ if (smp_load_acquire(&next_slab_inited)) return true; - if (stack_slabs[depot_index] == NULL) { - stack_slabs[depot_index] = *prealloc; + if (stack_slabs[slab_index] == NULL) { + stack_slabs[slab_index] = *prealloc; *prealloc = NULL; } else { /* If this is the last depot slab, do not touch the next one. */ - if (depot_index + 1 < STACK_ALLOC_MAX_SLABS) { - stack_slabs[depot_index + 1] = *prealloc; + if (slab_index + 1 < STACK_ALLOC_MAX_SLABS) { + stack_slabs[slab_index + 1] = *prealloc; *prealloc = NULL; /* * This smp_store_release pairs with smp_load_acquire() @@ -258,35 +258,35 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) required_size = ALIGN(required_size, 1 << STACK_ALLOC_ALIGN); - if (unlikely(depot_offset + required_size > STACK_ALLOC_SIZE)) { - if (unlikely(depot_index + 1 >= STACK_ALLOC_MAX_SLABS)) { + if (unlikely(slab_offset + required_size > STACK_ALLOC_SIZE)) { + if (unlikely(slab_index + 1 >= STACK_ALLOC_MAX_SLABS)) { WARN_ONCE(1, "Stack depot reached limit capacity"); return NULL; } - depot_index++; - depot_offset = 0; + slab_index++; + slab_offset = 0; /* * smp_store_release() here pairs with smp_load_acquire() from * |next_slab_inited| in stack_depot_save() and * depot_init_slab(). */ - if (depot_index + 1 < STACK_ALLOC_MAX_SLABS) + if (slab_index + 1 < STACK_ALLOC_MAX_SLABS) smp_store_release(&next_slab_inited, 0); } depot_init_slab(prealloc); - if (stack_slabs[depot_index] == NULL) + if (stack_slabs[slab_index] == NULL) return NULL; - stack = stack_slabs[depot_index] + depot_offset; + stack = stack_slabs[slab_index] + slab_offset; stack->hash = hash; stack->size = size; - stack->handle.slabindex = depot_index; - stack->handle.offset = depot_offset >> STACK_ALLOC_ALIGN; + stack->handle.slab_index = slab_index; + stack->handle.offset = slab_offset >> STACK_ALLOC_ALIGN; stack->handle.valid = 1; stack->handle.extra = 0; memcpy(stack->entries, entries, flex_array_size(stack, entries, size)); - depot_offset += required_size; + slab_offset += required_size; return stack; } @@ -418,7 +418,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, prealloc = page_address(page); } - raw_spin_lock_irqsave(&depot_lock, flags); + raw_spin_lock_irqsave(&slab_lock, flags); found = find_stack(*bucket, entries, nr_entries, hash); if (!found) { @@ -441,7 +441,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, WARN_ON(!depot_init_slab(&prealloc)); } - raw_spin_unlock_irqrestore(&depot_lock, flags); + raw_spin_unlock_irqrestore(&slab_lock, flags); exit: if (prealloc) { /* Nobody used this memory, ok to free it. */ @@ -497,12 +497,12 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle, if (!handle) return 0; - if (parts.slabindex > depot_index) { + if (parts.slab_index > slab_index) { WARN(1, "slab index %d out of bounds (%d) for stack id %08x\n", - parts.slabindex, depot_index, handle); + parts.slab_index, slab_index, handle); return 0; } - slab = stack_slabs[parts.slabindex]; + slab = stack_slabs[parts.slab_index]; if (!slab) return 0; stack = slab + offset; From c044a1e2bccec2e7d1fe13123022985a30f2bc81 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 30 Jan 2023 21:49:36 +0100 Subject: [PATCH 134/143] lib/stackdepot: rename handle and slab constants Change the "STACK_ALLOC_" prefix to "DEPOT_" for the constants that define the number of bits in stack depot handles and the maximum number of slabs. The old prefix is unclear and makes wonder about how these constants are related to stack allocations. The new prefix is also shorter. Also simplify the comment for DEPOT_SLAB_ORDER. No functional changes. Link: https://lkml.kernel.org/r/d9c6d1fa0ae6e1e65577ee81444656c99eb598d8.1675111415.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Cc: Evgenii Stepanov Cc: Marco Elver Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/stackdepot.c | 56 +++++++++++++++++++++++------------------------- 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 023f299bedf6c..b946ba74fea08 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -36,30 +36,28 @@ #include #include -#define DEPOT_STACK_BITS (sizeof(depot_stack_handle_t) * 8) - -#define STACK_ALLOC_NULL_PROTECTION_BITS 1 -#define STACK_ALLOC_ORDER 2 /* 'Slab' size order for stack depot, 4 pages */ -#define STACK_ALLOC_SIZE (1LL << (PAGE_SHIFT + STACK_ALLOC_ORDER)) -#define STACK_ALLOC_ALIGN 4 -#define STACK_ALLOC_OFFSET_BITS (STACK_ALLOC_ORDER + PAGE_SHIFT - \ - STACK_ALLOC_ALIGN) -#define STACK_ALLOC_INDEX_BITS (DEPOT_STACK_BITS - \ - STACK_ALLOC_NULL_PROTECTION_BITS - \ - STACK_ALLOC_OFFSET_BITS - STACK_DEPOT_EXTRA_BITS) -#define STACK_ALLOC_SLABS_CAP 8192 -#define STACK_ALLOC_MAX_SLABS \ - (((1LL << (STACK_ALLOC_INDEX_BITS)) < STACK_ALLOC_SLABS_CAP) ? \ - (1LL << (STACK_ALLOC_INDEX_BITS)) : STACK_ALLOC_SLABS_CAP) +#define DEPOT_HANDLE_BITS (sizeof(depot_stack_handle_t) * 8) + +#define DEPOT_VALID_BITS 1 +#define DEPOT_SLAB_ORDER 2 /* Slab size order, 4 pages */ +#define DEPOT_SLAB_SIZE (1LL << (PAGE_SHIFT + DEPOT_SLAB_ORDER)) +#define DEPOT_STACK_ALIGN 4 +#define DEPOT_OFFSET_BITS (DEPOT_SLAB_ORDER + PAGE_SHIFT - DEPOT_STACK_ALIGN) +#define DEPOT_SLAB_INDEX_BITS (DEPOT_HANDLE_BITS - DEPOT_VALID_BITS - \ + DEPOT_OFFSET_BITS - STACK_DEPOT_EXTRA_BITS) +#define DEPOT_SLABS_CAP 8192 +#define DEPOT_MAX_SLABS \ + (((1LL << (DEPOT_SLAB_INDEX_BITS)) < DEPOT_SLABS_CAP) ? \ + (1LL << (DEPOT_SLAB_INDEX_BITS)) : DEPOT_SLABS_CAP) /* The compact structure to store the reference to stacks. */ union handle_parts { depot_stack_handle_t handle; struct { - u32 slab_index : STACK_ALLOC_INDEX_BITS; - u32 offset : STACK_ALLOC_OFFSET_BITS; - u32 valid : STACK_ALLOC_NULL_PROTECTION_BITS; - u32 extra : STACK_DEPOT_EXTRA_BITS; + u32 slab_index : DEPOT_SLAB_INDEX_BITS; + u32 offset : DEPOT_OFFSET_BITS; + u32 valid : DEPOT_VALID_BITS; + u32 extra : STACK_DEPOT_EXTRA_BITS; }; }; @@ -91,7 +89,7 @@ static unsigned int stack_bucket_number_order; static unsigned int stack_hash_mask; /* Array of memory regions that store stack traces. */ -static void *stack_slabs[STACK_ALLOC_MAX_SLABS]; +static void *stack_slabs[DEPOT_MAX_SLABS]; /* Currently used slab in stack_slabs. */ static int slab_index; /* Offset to the unused space in the currently used slab. */ @@ -235,7 +233,7 @@ static bool depot_init_slab(void **prealloc) *prealloc = NULL; } else { /* If this is the last depot slab, do not touch the next one. */ - if (slab_index + 1 < STACK_ALLOC_MAX_SLABS) { + if (slab_index + 1 < DEPOT_MAX_SLABS) { stack_slabs[slab_index + 1] = *prealloc; *prealloc = NULL; /* @@ -256,10 +254,10 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) struct stack_record *stack; size_t required_size = struct_size(stack, entries, size); - required_size = ALIGN(required_size, 1 << STACK_ALLOC_ALIGN); + required_size = ALIGN(required_size, 1 << DEPOT_STACK_ALIGN); - if (unlikely(slab_offset + required_size > STACK_ALLOC_SIZE)) { - if (unlikely(slab_index + 1 >= STACK_ALLOC_MAX_SLABS)) { + if (unlikely(slab_offset + required_size > DEPOT_SLAB_SIZE)) { + if (unlikely(slab_index + 1 >= DEPOT_MAX_SLABS)) { WARN_ONCE(1, "Stack depot reached limit capacity"); return NULL; } @@ -270,7 +268,7 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) * |next_slab_inited| in stack_depot_save() and * depot_init_slab(). */ - if (slab_index + 1 < STACK_ALLOC_MAX_SLABS) + if (slab_index + 1 < DEPOT_MAX_SLABS) smp_store_release(&next_slab_inited, 0); } depot_init_slab(prealloc); @@ -282,7 +280,7 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) stack->hash = hash; stack->size = size; stack->handle.slab_index = slab_index; - stack->handle.offset = slab_offset >> STACK_ALLOC_ALIGN; + stack->handle.offset = slab_offset >> DEPOT_STACK_ALIGN; stack->handle.valid = 1; stack->handle.extra = 0; memcpy(stack->entries, entries, flex_array_size(stack, entries, size)); @@ -413,7 +411,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, alloc_flags &= ~GFP_ZONEMASK; alloc_flags &= (GFP_ATOMIC | GFP_KERNEL); alloc_flags |= __GFP_NOWARN; - page = alloc_pages(alloc_flags, STACK_ALLOC_ORDER); + page = alloc_pages(alloc_flags, DEPOT_SLAB_ORDER); if (page) prealloc = page_address(page); } @@ -445,7 +443,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, exit: if (prealloc) { /* Nobody used this memory, ok to free it. */ - free_pages((unsigned long)prealloc, STACK_ALLOC_ORDER); + free_pages((unsigned long)prealloc, DEPOT_SLAB_ORDER); } if (found) retval.handle = found->handle.handle; @@ -490,7 +488,7 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle, { union handle_parts parts = { .handle = handle }; void *slab; - size_t offset = parts.offset << STACK_ALLOC_ALIGN; + size_t offset = parts.offset << DEPOT_STACK_ALIGN; struct stack_record *stack; *entries = NULL; From 448e465b8d94a97e01fabab27fa09f653abb0f8b Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 30 Jan 2023 21:49:37 +0100 Subject: [PATCH 135/143] lib/stacktrace: drop impossible WARN_ON for depot_init_slab depot_init_slab has two call sites: 1. In depot_alloc_stack with a potentially NULL prealloc. 2. In __stack_depot_save with a non-NULL prealloc. At the same time depot_init_slab can only return false when prealloc is NULL. As the second call site makes sure that prealloc is not NULL, the WARN_ON there can never trigger. Thus, drop the WARN_ON and also move the prealloc check from depot_init_slab to its first call site. Also change the return type of depot_init_slab to void as it now always returns true. Link: https://lkml.kernel.org/r/7e7434a0d4e8a71138aec2c8a3c69a4eebf49935.1675111415.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Evgenii Stepanov Cc: Marco Elver Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/stackdepot.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index b946ba74fea08..d6be82a5c2234 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -218,16 +218,14 @@ int stack_depot_init(void) } EXPORT_SYMBOL_GPL(stack_depot_init); -static bool depot_init_slab(void **prealloc) +static void depot_init_slab(void **prealloc) { - if (!*prealloc) - return false; /* * This smp_load_acquire() pairs with smp_store_release() to * |next_slab_inited| below and in depot_alloc_stack(). */ if (smp_load_acquire(&next_slab_inited)) - return true; + return; if (stack_slabs[slab_index] == NULL) { stack_slabs[slab_index] = *prealloc; *prealloc = NULL; @@ -244,7 +242,6 @@ static bool depot_init_slab(void **prealloc) smp_store_release(&next_slab_inited, 1); } } - return true; } /* Allocation of a new stack in raw storage */ @@ -271,7 +268,8 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) if (slab_index + 1 < DEPOT_MAX_SLABS) smp_store_release(&next_slab_inited, 0); } - depot_init_slab(prealloc); + if (*prealloc) + depot_init_slab(prealloc); if (stack_slabs[slab_index] == NULL) return NULL; @@ -436,7 +434,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, * We didn't need to store this stack trace, but let's keep * the preallocated memory for the future. */ - WARN_ON(!depot_init_slab(&prealloc)); + depot_init_slab(&prealloc); } raw_spin_unlock_irqrestore(&slab_lock, flags); From 9ac8093101324a5cfda1988a363a2943114d9465 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 30 Jan 2023 21:49:38 +0100 Subject: [PATCH 136/143] lib/stackdepot: annotate depot_init_slab and depot_alloc_stack Clean up the exisiting comments and add new ones to depot_init_slab and depot_alloc_stack. As a part of the clean-up, remove mentions of which variable is accessed by smp_store_release and smp_load_acquire: it is clear as is from the code. Link: https://lkml.kernel.org/r/3cf68a23da43cef0f8737f7a2c07f35ce6d841a7.1675111415.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Evgenii Stepanov Cc: Marco Elver Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/stackdepot.c | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index d6be82a5c2234..7282565722f29 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -218,33 +218,41 @@ int stack_depot_init(void) } EXPORT_SYMBOL_GPL(stack_depot_init); +/* Uses preallocated memory to initialize a new stack depot slab. */ static void depot_init_slab(void **prealloc) { /* - * This smp_load_acquire() pairs with smp_store_release() to - * |next_slab_inited| below and in depot_alloc_stack(). + * If the next slab is already initialized, do not use the + * preallocated memory. + * smp_load_acquire() here pairs with smp_store_release() below and + * in depot_alloc_stack(). */ if (smp_load_acquire(&next_slab_inited)) return; + + /* Check if the current slab is not yet allocated. */ if (stack_slabs[slab_index] == NULL) { + /* Use the preallocated memory for the current slab. */ stack_slabs[slab_index] = *prealloc; *prealloc = NULL; } else { - /* If this is the last depot slab, do not touch the next one. */ + /* + * Otherwise, use the preallocated memory for the next slab + * as long as we do not exceed the maximum number of slabs. + */ if (slab_index + 1 < DEPOT_MAX_SLABS) { stack_slabs[slab_index + 1] = *prealloc; *prealloc = NULL; /* * This smp_store_release pairs with smp_load_acquire() - * from |next_slab_inited| above and in - * stack_depot_save(). + * above and in stack_depot_save(). */ smp_store_release(&next_slab_inited, 1); } } } -/* Allocation of a new stack in raw storage */ +/* Allocates a new stack in a stack depot slab. */ static struct stack_record * depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) { @@ -253,28 +261,35 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) required_size = ALIGN(required_size, 1 << DEPOT_STACK_ALIGN); + /* Check if there is not enough space in the current slab. */ if (unlikely(slab_offset + required_size > DEPOT_SLAB_SIZE)) { + /* Bail out if we reached the slab limit. */ if (unlikely(slab_index + 1 >= DEPOT_MAX_SLABS)) { WARN_ONCE(1, "Stack depot reached limit capacity"); return NULL; } + + /* Move on to the next slab. */ slab_index++; slab_offset = 0; /* - * smp_store_release() here pairs with smp_load_acquire() from - * |next_slab_inited| in stack_depot_save() and - * depot_init_slab(). + * smp_store_release() here pairs with smp_load_acquire() in + * stack_depot_save() and depot_init_slab(). */ if (slab_index + 1 < DEPOT_MAX_SLABS) smp_store_release(&next_slab_inited, 0); } + + /* Assign the preallocated memory to a slab if required. */ if (*prealloc) depot_init_slab(prealloc); + + /* Check if we have a slab to save the stack trace. */ if (stack_slabs[slab_index] == NULL) return NULL; + /* Save the stack trace. */ stack = stack_slabs[slab_index] + slab_offset; - stack->hash = hash; stack->size = size; stack->handle.slab_index = slab_index; From 99b16eb24defdee273993a86f65359f7b0482b2b Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 30 Jan 2023 21:49:39 +0100 Subject: [PATCH 137/143] lib/stacktrace, kasan, kmsan: rework extra_bits interface The current implementation of the extra_bits interface is confusing: passing extra_bits to __stack_depot_save makes it seem that the extra bits are somehow stored in stack depot. In reality, they are only embedded into a stack depot handle and are not used within stack depot. Drop the extra_bits argument from __stack_depot_save and instead provide a new stack_depot_set_extra_bits function (similar to the exsiting stack_depot_get_extra_bits) that saves extra bits into a stack depot handle. Update the callers of __stack_depot_save to use the new interace. This change also fixes a minor issue in the old code: __stack_depot_save does not return NULL if saving stack trace fails and extra_bits is used. Link: https://lkml.kernel.org/r/fbe58d38b7d93a9ef8500a72c0c4f103222418e6.1675111415.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Evgenii Stepanov Cc: Marco Elver Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/stackdepot.h | 4 +++- lib/stackdepot.c | 38 +++++++++++++++++++++++++++++--------- mm/kasan/common.c | 2 +- mm/kmsan/core.c | 10 +++++++--- 4 files changed, 40 insertions(+), 14 deletions(-) diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index c4e3abc16b16a..f999811c66d78 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -57,7 +57,6 @@ static inline int stack_depot_early_init(void) { return 0; } depot_stack_handle_t __stack_depot_save(unsigned long *entries, unsigned int nr_entries, - unsigned int extra_bits, gfp_t gfp_flags, bool can_alloc); depot_stack_handle_t stack_depot_save(unsigned long *entries, @@ -71,6 +70,9 @@ void stack_depot_print(depot_stack_handle_t stack); int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size, int spaces); +depot_stack_handle_t stack_depot_set_extra_bits(depot_stack_handle_t handle, + unsigned int extra_bits); + unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle); #endif diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 7282565722f29..f291ad6a4e72d 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -346,7 +346,6 @@ static inline struct stack_record *find_stack(struct stack_record *bucket, * * @entries: Pointer to storage array * @nr_entries: Size of the storage array - * @extra_bits: Flags to store in unused bits of depot_stack_handle_t * @alloc_flags: Allocation gfp flags * @can_alloc: Allocate stack slabs (increased chance of failure if false) * @@ -358,10 +357,6 @@ static inline struct stack_record *find_stack(struct stack_record *bucket, * If the stack trace in @entries is from an interrupt, only the portion up to * interrupt entry is saved. * - * Additional opaque flags can be passed in @extra_bits, stored in the unused - * bits of the stack handle, and retrieved using stack_depot_get_extra_bits() - * without calling stack_depot_fetch(). - * * Context: Any context, but setting @can_alloc to %false is required if * alloc_pages() cannot be used from the current context. Currently * this is the case from contexts where neither %GFP_ATOMIC nor @@ -371,7 +366,6 @@ static inline struct stack_record *find_stack(struct stack_record *bucket, */ depot_stack_handle_t __stack_depot_save(unsigned long *entries, unsigned int nr_entries, - unsigned int extra_bits, gfp_t alloc_flags, bool can_alloc) { struct stack_record *found = NULL, **bucket; @@ -461,8 +455,6 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, if (found) retval.handle = found->handle.handle; fast_exit: - retval.extra = extra_bits; - return retval.handle; } EXPORT_SYMBOL_GPL(__stack_depot_save); @@ -483,7 +475,7 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int nr_entries, gfp_t alloc_flags) { - return __stack_depot_save(entries, nr_entries, 0, alloc_flags, true); + return __stack_depot_save(entries, nr_entries, alloc_flags, true); } EXPORT_SYMBOL_GPL(stack_depot_save); @@ -566,6 +558,34 @@ int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size, } EXPORT_SYMBOL_GPL(stack_depot_snprint); +/** + * stack_depot_set_extra_bits - Set extra bits in a stack depot handle + * + * @handle: Stack depot handle + * @extra_bits: Value to set the extra bits + * + * Return: Stack depot handle with extra bits set + * + * Stack depot handles have a few unused bits, which can be used for storing + * user-specific information. These bits are transparent to the stack depot. + */ +depot_stack_handle_t stack_depot_set_extra_bits(depot_stack_handle_t handle, + unsigned int extra_bits) +{ + union handle_parts parts = { .handle = handle }; + + parts.extra = extra_bits; + return parts.handle; +} +EXPORT_SYMBOL(stack_depot_set_extra_bits); + +/** + * stack_depot_get_extra_bits - Retrieve extra bits from a stack depot handle + * + * @handle: Stack depot handle with extra bits saved + * + * Return: Extra bits retrieved from the stack depot handle + */ unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle) { union handle_parts parts = { .handle = handle }; diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 833bf2cfd2a39..50f4338b477f2 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -43,7 +43,7 @@ depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc) unsigned int nr_entries; nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0); - return __stack_depot_save(entries, nr_entries, 0, flags, can_alloc); + return __stack_depot_save(entries, nr_entries, flags, can_alloc); } void kasan_set_track(struct kasan_track *track, gfp_t flags) diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c index 112dce135c7f6..f710257d68670 100644 --- a/mm/kmsan/core.c +++ b/mm/kmsan/core.c @@ -69,13 +69,15 @@ depot_stack_handle_t kmsan_save_stack_with_flags(gfp_t flags, { unsigned long entries[KMSAN_STACK_DEPTH]; unsigned int nr_entries; + depot_stack_handle_t handle; nr_entries = stack_trace_save(entries, KMSAN_STACK_DEPTH, 0); /* Don't sleep (see might_sleep_if() in __alloc_pages_nodemask()). */ flags &= ~__GFP_DIRECT_RECLAIM; - return __stack_depot_save(entries, nr_entries, extra, flags, true); + handle = __stack_depot_save(entries, nr_entries, flags, true); + return stack_depot_set_extra_bits(handle, extra); } /* Copy the metadata following the memmove() behavior. */ @@ -215,6 +217,7 @@ depot_stack_handle_t kmsan_internal_chain_origin(depot_stack_handle_t id) u32 extra_bits; int depth; bool uaf; + depot_stack_handle_t handle; if (!id) return id; @@ -250,8 +253,9 @@ depot_stack_handle_t kmsan_internal_chain_origin(depot_stack_handle_t id) * positives when __stack_depot_save() passes it to instrumented code. */ kmsan_internal_unpoison_memory(entries, sizeof(entries), false); - return __stack_depot_save(entries, ARRAY_SIZE(entries), extra_bits, - GFP_ATOMIC, true); + handle = __stack_depot_save(entries, ARRAY_SIZE(entries), GFP_ATOMIC, + true); + return stack_depot_set_extra_bits(handle, extra_bits); } void kmsan_internal_set_shadow_origin(void *addr, size_t size, int b, From 77aff6dcfb62d29fc31cc26b3c6b814058180af7 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 30 Jan 2023 21:49:40 +0100 Subject: [PATCH 138/143] lib/stackdepot: annotate racy slab_index accesses Accesses to slab_index are protected by slab_lock everywhere except in a sanity check in stack_depot_fetch. The read access there can race with the write access in depot_alloc_stack. Use WRITE/READ_ONCE() to annotate the racy accesses. As the sanity check is only used to print a warning in case of a violation of the stack depot interface usage, it does not make a lot of sense to use proper synchronization. Link: https://lkml.kernel.org/r/19512bb03eed27ced5abeb5bd03f9a8381742cb1.1675111415.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Evgenii Stepanov Cc: Marco Elver Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/stackdepot.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index f291ad6a4e72d..cc2fe8563af41 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -269,8 +269,11 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) return NULL; } - /* Move on to the next slab. */ - slab_index++; + /* + * Move on to the next slab. + * WRITE_ONCE annotates a race with stack_depot_fetch. + */ + WRITE_ONCE(slab_index, slab_index + 1); slab_offset = 0; /* * smp_store_release() here pairs with smp_load_acquire() in @@ -492,6 +495,8 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle, unsigned long **entries) { union handle_parts parts = { .handle = handle }; + /* READ_ONCE annotates a race with depot_alloc_stack. */ + int slab_index_cached = READ_ONCE(slab_index); void *slab; size_t offset = parts.offset << DEPOT_STACK_ALIGN; struct stack_record *stack; @@ -500,9 +505,9 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle, if (!handle) return 0; - if (parts.slab_index > slab_index) { + if (parts.slab_index > slab_index_cached) { WARN(1, "slab index %d out of bounds (%d) for stack id %08x\n", - parts.slab_index, slab_index, handle); + parts.slab_index, slab_index_cached, handle); return 0; } slab = stack_slabs[parts.slab_index]; From 67c1f0af3489b9e4f17633ac130d5ba9368b3535 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 31 Jan 2023 13:10:50 -0800 Subject: [PATCH 139/143] lib-stackdepot-annotate-racy-slab_index-accesses-fix enhance comment, per Marco Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Evgenii Stepanov Cc: Marco Elver Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/stackdepot.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index cc2fe8563af41..fe37b8f896380 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -271,7 +271,8 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) /* * Move on to the next slab. - * WRITE_ONCE annotates a race with stack_depot_fetch. + * WRITE_ONCE pairs with potential concurrent read in + * stack_depot_fetch(). */ WRITE_ONCE(slab_index, slab_index + 1); slab_offset = 0; From d96e3a9632e7539cfc5dc69e7dd207a3d33da9a1 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 30 Jan 2023 21:49:41 +0100 Subject: [PATCH 140/143] lib/stackdepot: various comments clean-ups Clean up comments in include/linux/stackdepot.h and lib/stackdepot.c: 1. Rework the initialization comment in stackdepot.h. 2. Rework the header comment in stackdepot.c. 3. Various clean-ups for other comments. Also adjust whitespaces for find_stack and depot_alloc_stack call sites. No functional changes. Link: https://lkml.kernel.org/r/e6d509e769bacc1842de868084e157787e6a3817.1675111415.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Evgenii Stepanov Cc: Marco Elver Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/stackdepot.h | 36 +++++------ lib/stackdepot.c | 120 ++++++++++++++++++------------------- 2 files changed, 78 insertions(+), 78 deletions(-) diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index f999811c66d78..173740987d8b2 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -1,11 +1,11 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ /* - * A generic stack depot implementation + * Stack depot - a stack trace storage that avoids duplication. * * Author: Alexander Potapenko * Copyright (C) 2016 Google, Inc. * - * Based on code by Dmitry Chernenkov. + * Based on the code by Dmitry Chernenkov. */ #ifndef _LINUX_STACKDEPOT_H @@ -17,35 +17,37 @@ typedef u32 depot_stack_handle_t; /* * Number of bits in the handle that stack depot doesn't use. Users may store - * information in them. + * information in them via stack_depot_set/get_extra_bits. */ #define STACK_DEPOT_EXTRA_BITS 5 /* - * Every user of stack depot has to call stack_depot_init() during its own init - * when it's decided that it will be calling stack_depot_save() later. This is - * recommended for e.g. modules initialized later in the boot process, when - * slab_is_available() is true. + * Using stack depot requires its initialization, which can be done in 3 ways: * - * The alternative is to select STACKDEPOT_ALWAYS_INIT to have stack depot - * enabled as part of mm_init(), for subsystems where it's known at compile time - * that stack depot will be used. + * 1. Selecting CONFIG_STACKDEPOT_ALWAYS_INIT. This option is suitable in + * scenarios where it's known at compile time that stack depot will be used. + * Enabling this config makes the kernel initialize stack depot in mm_init(). * - * Another alternative is to call stack_depot_request_early_init(), when the - * decision to use stack depot is taken e.g. when evaluating kernel boot - * parameters, which precedes the enablement point in mm_init(). + * 2. Calling stack_depot_request_early_init() during early boot, before + * stack_depot_early_init() in mm_init() completes. For example, this can + * be done when evaluating kernel boot parameters. + * + * 3. Calling stack_depot_init(). Possible after boot is complete. This option + * is recommended for modules initialized later in the boot process, after + * mm_init() completes. * * stack_depot_init() and stack_depot_request_early_init() can be called - * regardless of CONFIG_STACKDEPOT and are no-op when disabled. The actual - * save/fetch/print functions should only be called from code that makes sure - * CONFIG_STACKDEPOT is enabled. + * regardless of whether CONFIG_STACKDEPOT is enabled and are no-op when this + * config is disabled. The save/fetch/print stack depot functions can only be + * called from the code that makes sure CONFIG_STACKDEPOT is enabled _and_ + * initializes stack depot via one of the ways listed above. */ #ifdef CONFIG_STACKDEPOT int stack_depot_init(void); void __init stack_depot_request_early_init(void); -/* This is supposed to be called only from mm_init() */ +/* Must be only called from mm_init(). */ int __init stack_depot_early_init(void); #else static inline int stack_depot_init(void) { return 0; } diff --git a/lib/stackdepot.c b/lib/stackdepot.c index fe37b8f896380..cb0dc6144c97d 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -1,22 +1,26 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Generic stack depot for storing stack traces. + * Stack depot - a stack trace storage that avoids duplication. * - * Some debugging tools need to save stack traces of certain events which can - * be later presented to the user. For example, KASAN needs to safe alloc and - * free stacks for each object, but storing two stack traces per object - * requires too much memory (e.g. SLUB_DEBUG needs 256 bytes per object for - * that). + * Stack depot is intended to be used by subsystems that need to store and + * later retrieve many potentially duplicated stack traces without wasting + * memory. * - * Instead, stack depot maintains a hashtable of unique stacktraces. Since alloc - * and free stacks repeat a lot, we save about 100x space. - * Stacks are never removed from depot, so we store them contiguously one after - * another in a contiguous memory allocation. + * For example, KASAN needs to save allocation and free stack traces for each + * object. Storing two stack traces per object requires a lot of memory (e.g. + * SLUB_DEBUG needs 256 bytes per object for that). Since allocation and free + * stack traces often repeat, using stack depot allows to save about 100x space. + * + * Internally, stack depot maintains a hash table of unique stacktraces. The + * stack traces themselves are stored contiguously one after another in a set + * of separate page allocations. + * + * Stack traces are never removed from stack depot. * * Author: Alexander Potapenko * Copyright (C) 2016 Google, Inc. * - * Based on code by Dmitry Chernenkov. + * Based on the code by Dmitry Chernenkov. */ #define pr_fmt(fmt) "stackdepot: " fmt @@ -50,7 +54,7 @@ (((1LL << (DEPOT_SLAB_INDEX_BITS)) < DEPOT_SLABS_CAP) ? \ (1LL << (DEPOT_SLAB_INDEX_BITS)) : DEPOT_SLABS_CAP) -/* The compact structure to store the reference to stacks. */ +/* Compact structure that stores a reference to a stack. */ union handle_parts { depot_stack_handle_t handle; struct { @@ -62,11 +66,11 @@ union handle_parts { }; struct stack_record { - struct stack_record *next; /* Link in the hashtable */ - u32 hash; /* Hash in the hastable */ - u32 size; /* Number of frames in the stack */ + struct stack_record *next; /* Link in the hash table */ + u32 hash; /* Hash in the hash table */ + u32 size; /* Number of stored frames */ union handle_parts handle; - unsigned long entries[]; /* Variable-sized array of entries. */ + unsigned long entries[]; /* Variable-sized array of frames */ }; static bool stack_depot_disabled; @@ -306,7 +310,7 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) return stack; } -/* Calculate hash for a stack */ +/* Calculates the hash for a stack. */ static inline u32 hash_stack(unsigned long *entries, unsigned int size) { return jhash2((u32 *)entries, @@ -314,9 +318,9 @@ static inline u32 hash_stack(unsigned long *entries, unsigned int size) STACK_HASH_SEED); } -/* Use our own, non-instrumented version of memcmp(). - * - * We actually don't care about the order, just the equality. +/* + * Non-instrumented version of memcmp(). + * Does not check the lexicographical order, only the equality. */ static inline int stackdepot_memcmp(const unsigned long *u1, const unsigned long *u2, @@ -329,7 +333,7 @@ int stackdepot_memcmp(const unsigned long *u1, const unsigned long *u2, return 0; } -/* Find a stack that is equal to the one stored in entries in the hash */ +/* Finds a stack in a bucket of the hash table. */ static inline struct stack_record *find_stack(struct stack_record *bucket, unsigned long *entries, int size, u32 hash) @@ -346,27 +350,27 @@ static inline struct stack_record *find_stack(struct stack_record *bucket, } /** - * __stack_depot_save - Save a stack trace from an array + * __stack_depot_save - Save a stack trace to stack depot * - * @entries: Pointer to storage array - * @nr_entries: Size of the storage array - * @alloc_flags: Allocation gfp flags + * @entries: Pointer to the stack trace + * @nr_entries: Number of frames in the stack + * @alloc_flags: Allocation GFP flags * @can_alloc: Allocate stack slabs (increased chance of failure if false) * * Saves a stack trace from @entries array of size @nr_entries. If @can_alloc is - * %true, is allowed to replenish the stack slab pool in case no space is left + * %true, stack depot can replenish the stack slab pool in case no space is left * (allocates using GFP flags of @alloc_flags). If @can_alloc is %false, avoids - * any allocations and will fail if no space is left to store the stack trace. + * any allocations and fails if no space is left to store the stack trace. * - * If the stack trace in @entries is from an interrupt, only the portion up to - * interrupt entry is saved. + * If the provided stack trace comes from the interrupt context, only the part + * up to the interrupt entry is saved. * * Context: Any context, but setting @can_alloc to %false is required if * alloc_pages() cannot be used from the current context. Currently - * this is the case from contexts where neither %GFP_ATOMIC nor + * this is the case for contexts where neither %GFP_ATOMIC nor * %GFP_NOWAIT can be used (NMI, raw_spin_lock). * - * Return: The handle of the stack struct stored in depot, 0 on failure. + * Return: Handle of the stack struct stored in depot, 0 on failure */ depot_stack_handle_t __stack_depot_save(unsigned long *entries, unsigned int nr_entries, @@ -381,11 +385,11 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, /* * If this stack trace is from an interrupt, including anything before - * interrupt entry usually leads to unbounded stackdepot growth. + * interrupt entry usually leads to unbounded stack depot growth. * - * Because use of filter_irq_stacks() is a requirement to ensure - * stackdepot can efficiently deduplicate interrupt stacks, always - * filter_irq_stacks() to simplify all callers' use of stackdepot. + * Since use of filter_irq_stacks() is a requirement to ensure stack + * depot can efficiently deduplicate interrupt stacks, always + * filter_irq_stacks() to simplify all callers' use of stack depot. */ nr_entries = filter_irq_stacks(entries, nr_entries); @@ -400,8 +404,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, * The smp_load_acquire() here pairs with smp_store_release() to * |bucket| below. */ - found = find_stack(smp_load_acquire(bucket), entries, - nr_entries, hash); + found = find_stack(smp_load_acquire(bucket), entries, nr_entries, hash); if (found) goto exit; @@ -431,7 +434,8 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, found = find_stack(*bucket, entries, nr_entries, hash); if (!found) { - struct stack_record *new = depot_alloc_stack(entries, nr_entries, hash, &prealloc); + struct stack_record *new = + depot_alloc_stack(entries, nr_entries, hash, &prealloc); if (new) { new->next = *bucket; @@ -444,8 +448,8 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, } } else if (prealloc) { /* - * We didn't need to store this stack trace, but let's keep - * the preallocated memory for the future. + * Stack depot already contains this stack trace, but let's + * keep the preallocated memory for the future. */ depot_init_slab(&prealloc); } @@ -453,7 +457,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, raw_spin_unlock_irqrestore(&slab_lock, flags); exit: if (prealloc) { - /* Nobody used this memory, ok to free it. */ + /* Stack depot didn't use this memory, free it. */ free_pages((unsigned long)prealloc, DEPOT_SLAB_ORDER); } if (found) @@ -464,16 +468,16 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, EXPORT_SYMBOL_GPL(__stack_depot_save); /** - * stack_depot_save - Save a stack trace from an array + * stack_depot_save - Save a stack trace to stack depot * - * @entries: Pointer to storage array - * @nr_entries: Size of the storage array - * @alloc_flags: Allocation gfp flags + * @entries: Pointer to the stack trace + * @nr_entries: Number of frames in the stack + * @alloc_flags: Allocation GFP flags * * Context: Contexts where allocations via alloc_pages() are allowed. * See __stack_depot_save() for more details. * - * Return: The handle of the stack struct stored in depot, 0 on failure. + * Return: Handle of the stack trace stored in depot, 0 on failure */ depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int nr_entries, @@ -484,13 +488,12 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries, EXPORT_SYMBOL_GPL(stack_depot_save); /** - * stack_depot_fetch - Fetch stack entries from a depot + * stack_depot_fetch - Fetch a stack trace from stack depot * - * @handle: Stack depot handle which was returned from - * stack_depot_save(). - * @entries: Pointer to store the entries address + * @handle: Stack depot handle returned from stack_depot_save() + * @entries: Pointer to store the address of the stack trace * - * Return: The number of trace entries for this depot. + * Return: Number of frames for the fetched stack */ unsigned int stack_depot_fetch(depot_stack_handle_t handle, unsigned long **entries) @@ -522,11 +525,9 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle, EXPORT_SYMBOL_GPL(stack_depot_fetch); /** - * stack_depot_print - print stack entries from a depot - * - * @stack: Stack depot handle which was returned from - * stack_depot_save(). + * stack_depot_print - Print a stack trace from stack depot * + * @stack: Stack depot handle returned from stack_depot_save() */ void stack_depot_print(depot_stack_handle_t stack) { @@ -540,17 +541,14 @@ void stack_depot_print(depot_stack_handle_t stack) EXPORT_SYMBOL_GPL(stack_depot_print); /** - * stack_depot_snprint - print stack entries from a depot into a buffer + * stack_depot_snprint - Print a stack trace from stack depot into a buffer * - * @handle: Stack depot handle which was returned from - * stack_depot_save(). + * @handle: Stack depot handle returned from stack_depot_save() * @buf: Pointer to the print buffer - * * @size: Size of the print buffer - * * @spaces: Number of leading spaces to print * - * Return: Number of bytes printed. + * Return: Number of bytes printed */ int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size, int spaces) From ec839ec28aefb66d704efbabfbefb703d36d9077 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 30 Jan 2023 21:49:42 +0100 Subject: [PATCH 141/143] lib/stackdepot: move documentation comments to stackdepot.h Move all interface- and usage-related documentation comments to include/linux/stackdepot.h. It makes sense to have them in the header where they are available to the interface users. Link: https://lkml.kernel.org/r/341353394ec1134c5a92a2b298348ddc4c48c8a0.1675111415.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Evgenii Stepanov Cc: Marco Elver Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/stackdepot.h | 87 ++++++++++++++++++++++++++++++++++++++ lib/stackdepot.c | 87 -------------------------------------- 2 files changed, 87 insertions(+), 87 deletions(-) diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index 173740987d8b2..a828fbece1ba3 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -2,6 +2,17 @@ /* * Stack depot - a stack trace storage that avoids duplication. * + * Stack depot is intended to be used by subsystems that need to store and + * later retrieve many potentially duplicated stack traces without wasting + * memory. + * + * For example, KASAN needs to save allocation and free stack traces for each + * object. Storing two stack traces per object requires a lot of memory (e.g. + * SLUB_DEBUG needs 256 bytes per object for that). Since allocation and free + * stack traces often repeat, using stack depot allows to save about 100x space. + * + * Stack traces are never removed from stack depot. + * * Author: Alexander Potapenko * Copyright (C) 2016 Google, Inc. * @@ -57,24 +68,100 @@ static inline void stack_depot_request_early_init(void) { } static inline int stack_depot_early_init(void) { return 0; } #endif +/** + * __stack_depot_save - Save a stack trace to stack depot + * + * @entries: Pointer to the stack trace + * @nr_entries: Number of frames in the stack + * @alloc_flags: Allocation GFP flags + * @can_alloc: Allocate stack slabs (increased chance of failure if false) + * + * Saves a stack trace from @entries array of size @nr_entries. If @can_alloc is + * %true, stack depot can replenish the stack slab pool in case no space is left + * (allocates using GFP flags of @alloc_flags). If @can_alloc is %false, avoids + * any allocations and fails if no space is left to store the stack trace. + * + * If the provided stack trace comes from the interrupt context, only the part + * up to the interrupt entry is saved. + * + * Context: Any context, but setting @can_alloc to %false is required if + * alloc_pages() cannot be used from the current context. Currently + * this is the case for contexts where neither %GFP_ATOMIC nor + * %GFP_NOWAIT can be used (NMI, raw_spin_lock). + * + * Return: Handle of the stack struct stored in depot, 0 on failure + */ depot_stack_handle_t __stack_depot_save(unsigned long *entries, unsigned int nr_entries, gfp_t gfp_flags, bool can_alloc); +/** + * stack_depot_save - Save a stack trace to stack depot + * + * @entries: Pointer to the stack trace + * @nr_entries: Number of frames in the stack + * @alloc_flags: Allocation GFP flags + * + * Context: Contexts where allocations via alloc_pages() are allowed. + * See __stack_depot_save() for more details. + * + * Return: Handle of the stack trace stored in depot, 0 on failure + */ depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int nr_entries, gfp_t gfp_flags); +/** + * stack_depot_fetch - Fetch a stack trace from stack depot + * + * @handle: Stack depot handle returned from stack_depot_save() + * @entries: Pointer to store the address of the stack trace + * + * Return: Number of frames for the fetched stack + */ unsigned int stack_depot_fetch(depot_stack_handle_t handle, unsigned long **entries); +/** + * stack_depot_print - Print a stack trace from stack depot + * + * @stack: Stack depot handle returned from stack_depot_save() + */ void stack_depot_print(depot_stack_handle_t stack); +/** + * stack_depot_snprint - Print a stack trace from stack depot into a buffer + * + * @handle: Stack depot handle returned from stack_depot_save() + * @buf: Pointer to the print buffer + * @size: Size of the print buffer + * @spaces: Number of leading spaces to print + * + * Return: Number of bytes printed + */ int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size, int spaces); +/** + * stack_depot_set_extra_bits - Set extra bits in a stack depot handle + * + * @handle: Stack depot handle + * @extra_bits: Value to set the extra bits + * + * Return: Stack depot handle with extra bits set + * + * Stack depot handles have a few unused bits, which can be used for storing + * user-specific information. These bits are transparent to the stack depot. + */ depot_stack_handle_t stack_depot_set_extra_bits(depot_stack_handle_t handle, unsigned int extra_bits); +/** + * stack_depot_get_extra_bits - Retrieve extra bits from a stack depot handle + * + * @handle: Stack depot handle with extra bits saved + * + * Return: Extra bits retrieved from the stack depot handle + */ unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle); #endif diff --git a/lib/stackdepot.c b/lib/stackdepot.c index cb0dc6144c97d..d2ab4d29fa1a3 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -2,21 +2,10 @@ /* * Stack depot - a stack trace storage that avoids duplication. * - * Stack depot is intended to be used by subsystems that need to store and - * later retrieve many potentially duplicated stack traces without wasting - * memory. - * - * For example, KASAN needs to save allocation and free stack traces for each - * object. Storing two stack traces per object requires a lot of memory (e.g. - * SLUB_DEBUG needs 256 bytes per object for that). Since allocation and free - * stack traces often repeat, using stack depot allows to save about 100x space. - * * Internally, stack depot maintains a hash table of unique stacktraces. The * stack traces themselves are stored contiguously one after another in a set * of separate page allocations. * - * Stack traces are never removed from stack depot. - * * Author: Alexander Potapenko * Copyright (C) 2016 Google, Inc. * @@ -349,29 +338,6 @@ static inline struct stack_record *find_stack(struct stack_record *bucket, return NULL; } -/** - * __stack_depot_save - Save a stack trace to stack depot - * - * @entries: Pointer to the stack trace - * @nr_entries: Number of frames in the stack - * @alloc_flags: Allocation GFP flags - * @can_alloc: Allocate stack slabs (increased chance of failure if false) - * - * Saves a stack trace from @entries array of size @nr_entries. If @can_alloc is - * %true, stack depot can replenish the stack slab pool in case no space is left - * (allocates using GFP flags of @alloc_flags). If @can_alloc is %false, avoids - * any allocations and fails if no space is left to store the stack trace. - * - * If the provided stack trace comes from the interrupt context, only the part - * up to the interrupt entry is saved. - * - * Context: Any context, but setting @can_alloc to %false is required if - * alloc_pages() cannot be used from the current context. Currently - * this is the case for contexts where neither %GFP_ATOMIC nor - * %GFP_NOWAIT can be used (NMI, raw_spin_lock). - * - * Return: Handle of the stack struct stored in depot, 0 on failure - */ depot_stack_handle_t __stack_depot_save(unsigned long *entries, unsigned int nr_entries, gfp_t alloc_flags, bool can_alloc) @@ -467,18 +433,6 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, } EXPORT_SYMBOL_GPL(__stack_depot_save); -/** - * stack_depot_save - Save a stack trace to stack depot - * - * @entries: Pointer to the stack trace - * @nr_entries: Number of frames in the stack - * @alloc_flags: Allocation GFP flags - * - * Context: Contexts where allocations via alloc_pages() are allowed. - * See __stack_depot_save() for more details. - * - * Return: Handle of the stack trace stored in depot, 0 on failure - */ depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int nr_entries, gfp_t alloc_flags) @@ -487,14 +441,6 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries, } EXPORT_SYMBOL_GPL(stack_depot_save); -/** - * stack_depot_fetch - Fetch a stack trace from stack depot - * - * @handle: Stack depot handle returned from stack_depot_save() - * @entries: Pointer to store the address of the stack trace - * - * Return: Number of frames for the fetched stack - */ unsigned int stack_depot_fetch(depot_stack_handle_t handle, unsigned long **entries) { @@ -524,11 +470,6 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle, } EXPORT_SYMBOL_GPL(stack_depot_fetch); -/** - * stack_depot_print - Print a stack trace from stack depot - * - * @stack: Stack depot handle returned from stack_depot_save() - */ void stack_depot_print(depot_stack_handle_t stack) { unsigned long *entries; @@ -540,16 +481,6 @@ void stack_depot_print(depot_stack_handle_t stack) } EXPORT_SYMBOL_GPL(stack_depot_print); -/** - * stack_depot_snprint - Print a stack trace from stack depot into a buffer - * - * @handle: Stack depot handle returned from stack_depot_save() - * @buf: Pointer to the print buffer - * @size: Size of the print buffer - * @spaces: Number of leading spaces to print - * - * Return: Number of bytes printed - */ int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size, int spaces) { @@ -562,17 +493,6 @@ int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size, } EXPORT_SYMBOL_GPL(stack_depot_snprint); -/** - * stack_depot_set_extra_bits - Set extra bits in a stack depot handle - * - * @handle: Stack depot handle - * @extra_bits: Value to set the extra bits - * - * Return: Stack depot handle with extra bits set - * - * Stack depot handles have a few unused bits, which can be used for storing - * user-specific information. These bits are transparent to the stack depot. - */ depot_stack_handle_t stack_depot_set_extra_bits(depot_stack_handle_t handle, unsigned int extra_bits) { @@ -583,13 +503,6 @@ depot_stack_handle_t stack_depot_set_extra_bits(depot_stack_handle_t handle, } EXPORT_SYMBOL(stack_depot_set_extra_bits); -/** - * stack_depot_get_extra_bits - Retrieve extra bits from a stack depot handle - * - * @handle: Stack depot handle with extra bits saved - * - * Return: Extra bits retrieved from the stack depot handle - */ unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle) { union handle_parts parts = { .handle = handle }; From 666b9cffb9da1c8382073d3830a838a31912f347 Mon Sep 17 00:00:00 2001 From: Juerg Haefliger Date: Wed, 1 Feb 2023 17:24:35 +0100 Subject: [PATCH 142/143] arch/Kconfig: fix indentation The convention for indentation seems to be a single tab. Help text is further indented by an additional two whitespaces. Fix the lines that violate these rules. Link: https://lkml.kernel.org/r/20230201162435.218368-1-juerg.haefliger@canonical.com Signed-off-by: Juerg Haefliger Reviewed-by: Kees Cook Cc: Dan Li Cc: "Eric W. Biederman" Cc: Frederic Weisbecker Cc: Josh Poimboeuf Cc: Juerg Haefliger Cc: Marco Elver Cc: Mark Rutland Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Sami Tolvanen Cc: Tom Rix Signed-off-by: Andrew Morton --- arch/Kconfig | 128 +++++++++++++++++++++++++-------------------------- 1 file changed, 64 insertions(+), 64 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 12e3ddabac9d3..e3511afbb7f2f 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -35,7 +35,7 @@ config HOTPLUG_SMT bool config GENERIC_ENTRY - bool + bool config KPROBES bool "Kprobes" @@ -55,26 +55,26 @@ config JUMP_LABEL depends on HAVE_ARCH_JUMP_LABEL select OBJTOOL if HAVE_JUMP_LABEL_HACK help - This option enables a transparent branch optimization that - makes certain almost-always-true or almost-always-false branch - conditions even cheaper to execute within the kernel. + This option enables a transparent branch optimization that + makes certain almost-always-true or almost-always-false branch + conditions even cheaper to execute within the kernel. - Certain performance-sensitive kernel code, such as trace points, - scheduler functionality, networking code and KVM have such - branches and include support for this optimization technique. + Certain performance-sensitive kernel code, such as trace points, + scheduler functionality, networking code and KVM have such + branches and include support for this optimization technique. - If it is detected that the compiler has support for "asm goto", - the kernel will compile such branches with just a nop - instruction. When the condition flag is toggled to true, the - nop will be converted to a jump instruction to execute the - conditional block of instructions. + If it is detected that the compiler has support for "asm goto", + the kernel will compile such branches with just a nop + instruction. When the condition flag is toggled to true, the + nop will be converted to a jump instruction to execute the + conditional block of instructions. - This technique lowers overhead and stress on the branch prediction - of the processor and generally makes the kernel faster. The update - of the condition is slower, but those are always very rare. + This technique lowers overhead and stress on the branch prediction + of the processor and generally makes the kernel faster. The update + of the condition is slower, but those are always very rare. - ( On 32-bit x86, the necessary options added to the compiler - flags may increase the size of the kernel slightly. ) + ( On 32-bit x86, the necessary options added to the compiler + flags may increase the size of the kernel slightly. ) config STATIC_KEYS_SELFTEST bool "Static key selftest" @@ -98,9 +98,9 @@ config KPROBES_ON_FTRACE depends on KPROBES && HAVE_KPROBES_ON_FTRACE depends on DYNAMIC_FTRACE_WITH_REGS help - If function tracer is enabled and the arch supports full - passing of pt_regs to function tracing, then kprobes can - optimize on top of function tracing. + If function tracer is enabled and the arch supports full + passing of pt_regs to function tracing, then kprobes can + optimize on top of function tracing. config UPROBES def_bool n @@ -154,21 +154,21 @@ config HAVE_EFFICIENT_UNALIGNED_ACCESS config ARCH_USE_BUILTIN_BSWAP bool help - Modern versions of GCC (since 4.4) have builtin functions - for handling byte-swapping. Using these, instead of the old - inline assembler that the architecture code provides in the - __arch_bswapXX() macros, allows the compiler to see what's - happening and offers more opportunity for optimisation. In - particular, the compiler will be able to combine the byteswap - with a nearby load or store and use load-and-swap or - store-and-swap instructions if the architecture has them. It - should almost *never* result in code which is worse than the - hand-coded assembler in . But just in case it - does, the use of the builtins is optional. + Modern versions of GCC (since 4.4) have builtin functions + for handling byte-swapping. Using these, instead of the old + inline assembler that the architecture code provides in the + __arch_bswapXX() macros, allows the compiler to see what's + happening and offers more opportunity for optimisation. In + particular, the compiler will be able to combine the byteswap + with a nearby load or store and use load-and-swap or + store-and-swap instructions if the architecture has them. It + should almost *never* result in code which is worse than the + hand-coded assembler in . But just in case it + does, the use of the builtins is optional. - Any architecture with load-and-swap or store-and-swap - instructions should set this. And it shouldn't hurt to set it - on architectures that don't have such instructions. + Any architecture with load-and-swap or store-and-swap + instructions should set this. And it shouldn't hurt to set it + on architectures that don't have such instructions. config KRETPROBES def_bool y @@ -720,13 +720,13 @@ config LTO_CLANG_FULL depends on !COMPILE_TEST select LTO_CLANG help - This option enables Clang's full Link Time Optimization (LTO), which - allows the compiler to optimize the kernel globally. If you enable - this option, the compiler generates LLVM bitcode instead of ELF - object files, and the actual compilation from bitcode happens at - the LTO link step, which may take several minutes depending on the - kernel configuration. More information can be found from LLVM's - documentation: + This option enables Clang's full Link Time Optimization (LTO), which + allows the compiler to optimize the kernel globally. If you enable + this option, the compiler generates LLVM bitcode instead of ELF + object files, and the actual compilation from bitcode happens at + the LTO link step, which may take several minutes depending on the + kernel configuration. More information can be found from LLVM's + documentation: https://llvm.org/docs/LinkTimeOptimization.html @@ -1330,9 +1330,9 @@ config ARCH_HAS_CC_PLATFORM bool config HAVE_SPARSE_SYSCALL_NR - bool - help - An architecture should select this if its syscall numbering is sparse + bool + help + An architecture should select this if its syscall numbering is sparse to save space. For example, MIPS architecture has a syscall array with entries at 4000, 5000 and 6000 locations. This option turns on syscall related optimizations for a given architecture. @@ -1356,35 +1356,35 @@ config HAVE_PREEMPT_DYNAMIC_CALL depends on HAVE_STATIC_CALL select HAVE_PREEMPT_DYNAMIC help - An architecture should select this if it can handle the preemption - model being selected at boot time using static calls. + An architecture should select this if it can handle the preemption + model being selected at boot time using static calls. - Where an architecture selects HAVE_STATIC_CALL_INLINE, any call to a - preemption function will be patched directly. + Where an architecture selects HAVE_STATIC_CALL_INLINE, any call to a + preemption function will be patched directly. - Where an architecture does not select HAVE_STATIC_CALL_INLINE, any - call to a preemption function will go through a trampoline, and the - trampoline will be patched. + Where an architecture does not select HAVE_STATIC_CALL_INLINE, any + call to a preemption function will go through a trampoline, and the + trampoline will be patched. - It is strongly advised to support inline static call to avoid any - overhead. + It is strongly advised to support inline static call to avoid any + overhead. config HAVE_PREEMPT_DYNAMIC_KEY bool depends on HAVE_ARCH_JUMP_LABEL select HAVE_PREEMPT_DYNAMIC help - An architecture should select this if it can handle the preemption - model being selected at boot time using static keys. + An architecture should select this if it can handle the preemption + model being selected at boot time using static keys. - Each preemption function will be given an early return based on a - static key. This should have slightly lower overhead than non-inline - static calls, as this effectively inlines each trampoline into the - start of its callee. This may avoid redundant work, and may - integrate better with CFI schemes. + Each preemption function will be given an early return based on a + static key. This should have slightly lower overhead than non-inline + static calls, as this effectively inlines each trampoline into the + start of its callee. This may avoid redundant work, and may + integrate better with CFI schemes. - This will have greater overhead than using inline static calls as - the call to the preemption function cannot be entirely elided. + This will have greater overhead than using inline static calls as + the call to the preemption function cannot be entirely elided. config ARCH_WANT_LD_ORPHAN_WARN bool @@ -1407,8 +1407,8 @@ config ARCH_SUPPORTS_PAGE_TABLE_CHECK config ARCH_SPLIT_ARG64 bool help - If a 32-bit architecture requires 64-bit arguments to be split into - pairs of 32-bit arguments, select this option. + If a 32-bit architecture requires 64-bit arguments to be split into + pairs of 32-bit arguments, select this option. config ARCH_HAS_ELFCORE_COMPAT bool From 5ccccb390a57889324a59c78383c37b75a117b43 Mon Sep 17 00:00:00 2001 From: fuyuanli Date: Wed, 1 Feb 2023 21:54:33 +0800 Subject: [PATCH 143/143] hung_task: print message when hung_task_warnings gets down to zero. It's useful to report it when hung_task_warnings gets down to zero, so that we can know if kernel log was lost or there is no hung task was detected. Link: https://lkml.kernel.org/r/20230201135416.GA6560@didi-ThinkCentre-M920t-N000 Signed-off-by: fuyuanli Reviewed-by: Petr Mladek Cc: Rasmus Villemoes Signed-off-by: Andrew Morton --- kernel/hung_task.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/hung_task.c b/kernel/hung_task.c index c71889f3f3fc2..322813366c6c6 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -142,6 +142,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) if (sysctl_hung_task_all_cpu_backtrace) hung_task_show_all_bt = true; + if (!sysctl_hung_task_warnings) + pr_info("Future hung task reports are suppressed, see sysctl kernel.hung_task_warnings\n"); } touch_nmi_watchdog();