From acfac37851e01b40c30a7afd0d93ad8db8914f25 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 7 Oct 2022 12:59:20 -0700 Subject: [PATCH 01/29] mm/hugetlb.c: make __hugetlb_vma_unlock_write_put() static Reported-by: kernel test robot Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0ad53ad98e742..41d3aa0778373 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6804,7 +6804,7 @@ void hugetlb_vma_lock_release(struct kref *kref) kfree(vma_lock); } -void __hugetlb_vma_unlock_write_put(struct hugetlb_vma_lock *vma_lock) +static void __hugetlb_vma_unlock_write_put(struct hugetlb_vma_lock *vma_lock) { struct vm_area_struct *vma = vma_lock->vma; From 7efc3b7261030da79001c00d92bc3392fd6c664c Mon Sep 17 00:00:00 2001 From: Chuyi Zhou Date: Wed, 13 Jul 2022 14:20:09 +0800 Subject: [PATCH 02/29] mm/compaction: fix set skip in fast_find_migrateblock When we successfully find a pageblock in fast_find_migrateblock(), the block will be set skip-flag through set_pageblock_skip(). However, when entering isolate_migratepages_block(), the whole pageblock will be skipped due to the branch 'if (!valid_page && IS_ALIGNED(low_pfn, pageblock_nr_pages))'. Eventually we will goto isolate_abort and isolate nothing. That makes fast_find_migrateblock useless. In this patch, when we find a suitable pageblock in fast_find_migrateblock, we do noting but let isolate_migratepages_block to set skip flag to the pageblock after scan it. Normally, we would isolate some pages from the fast-find block. I use mmtest/thpscale-madvhugepage test it. Here is the result: baseline patch Amean fault-both-1 1331.66 ( 0.00%) 1261.04 * 5.30%* Amean fault-both-3 1383.95 ( 0.00%) 1191.69 * 13.89%* Amean fault-both-5 1568.13 ( 0.00%) 1445.20 * 7.84%* Amean fault-both-7 1819.62 ( 0.00%) 1555.13 * 14.54%* Amean fault-both-12 1106.96 ( 0.00%) 1149.43 * -3.84%* Amean fault-both-18 2196.93 ( 0.00%) 1875.77 * 14.62%* Amean fault-both-24 2642.69 ( 0.00%) 2671.21 * -1.08%* Amean fault-both-30 2901.89 ( 0.00%) 2857.32 * 1.54%* Amean fault-both-32 3747.00 ( 0.00%) 3479.23 * 7.15%* Link: https://lkml.kernel.org/r/20220713062009.597255-1-zhouchuyi@bytedance.com Fixes: 70b44595eafe9 ("mm, compaction: use free lists to quickly locate a migration source") Signed-off-by: zhouchuyi Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/compaction.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/compaction.c b/mm/compaction.c index e2a9615f5fded..c4e4453187a2c 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1851,7 +1851,6 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) pfn = cc->zone->zone_start_pfn; cc->fast_search_fail = 0; found_block = true; - set_pageblock_skip(freepage); break; } } From 92b7399695a5cc961c44fc6e4624d3bc3c699ee7 Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Tue, 11 Oct 2022 20:36:51 +0000 Subject: [PATCH 03/29] mmap: fix copy_vma() failure path The anon vma was not unlinked and the file was not closed in the failure path when the machine runs out of memory during the maple tree modification. This caused a memory leak of the anon vma chain and vma since neither would be freed. Link: https://lkml.kernel.org/r/20221011203621.1446507-1-Liam.Howlett@oracle.com Fixes: 524e00b36e8c ("mm: remove rb tree") Signed-off-by: Liam R. Howlett Reported-by: Lukas Bulwahn Tested-by: Lukas Bulwahn Signed-off-by: Andrew Morton --- mm/mmap.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/mmap.c b/mm/mmap.c index 6e447544f07dd..fc8581cefef71 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3240,6 +3240,11 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, out_vma_link: if (new_vma->vm_ops && new_vma->vm_ops->close) new_vma->vm_ops->close(new_vma); + + if (new_vma->vm_file) + fput(new_vma->vm_file); + + unlink_anon_vmas(new_vma); out_free_mempol: mpol_put(vma_policy(new_vma)); out_free_vma: From 7be1c1a3c7b13fb259bb5159662a7b83622013b8 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 11 Oct 2022 20:55:31 +0300 Subject: [PATCH 04/29] mm: more vma cache removal Link: https://lkml.kernel.org/r/Y0WuE3Riv4iy5Jx8@localhost.localdomain Fixes: 7964cf8caa4d ("mm: remove vmacache") Signed-off-by: Alexey Dobriyan Acked-by: Liam Howlett Signed-off-by: Andrew Morton --- include/linux/sched.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 88a043f7235eb..e0bb85cf8bdd1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -861,8 +861,6 @@ struct task_struct { struct mm_struct *mm; struct mm_struct *active_mm; - /* Per-thread vma caching: */ - #ifdef SPLIT_RSS_COUNTING struct task_rss_stat rss_stat; #endif From 28c5609fb236807910ca347ad3e26c4567998526 Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Tue, 11 Oct 2022 16:08:37 +0000 Subject: [PATCH 05/29] mm/mmap: preallocate maple nodes for brk vma expansion If the brk VMA is the last vma in a maple node and meets the rare criteria that it can be expanded, then preallocation is necessary to avoid a potential fs_reclaim circular lock issue on low resources. At the same time use the actual vma start address (unaligned) when calling vma_adjust_trans_huge(). Link: https://lkml.kernel.org/r/20221011160624.1253454-1-Liam.Howlett@oracle.com Fixes: 2e7ce7d354f2 (mm/mmap: change do_brk_flags() to expand existing VMA and add do_brk_munmap()) Signed-off-by: Liam R. Howlett Reported-by: Yu Zhao Signed-off-by: Andrew Morton --- mm/mmap.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index fc8581cefef71..5855f26639f98 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2942,17 +2942,18 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, if (vma && (!vma->anon_vma || list_is_singular(&vma->anon_vma_chain)) && ((vma->vm_flags & ~VM_SOFTDIRTY) == flags)) { - mas->index = vma->vm_start; - mas->last = addr + len - 1; - vma_adjust_trans_huge(vma, addr, addr + len, 0); + mas_set_range(mas, vma->vm_start, addr + len - 1); + if (mas_preallocate(mas, vma, GFP_KERNEL)) + return -ENOMEM; + + vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0); if (vma->anon_vma) { anon_vma_lock_write(vma->anon_vma); anon_vma_interval_tree_pre_update_vma(vma); } vma->vm_end = addr + len; vma->vm_flags |= VM_SOFTDIRTY; - if (mas_store_gfp(mas, vma, GFP_KERNEL)) - goto mas_expand_failed; + mas_store_prealloc(mas, vma); if (vma->anon_vma) { anon_vma_interval_tree_post_update_vma(vma); @@ -2993,13 +2994,6 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, vma_alloc_fail: vm_unacct_memory(len >> PAGE_SHIFT); return -ENOMEM; - -mas_expand_failed: - if (vma->anon_vma) { - anon_vma_interval_tree_post_update_vma(vma); - anon_vma_unlock_write(vma->anon_vma); - } - return -ENOMEM; } int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) From 515778e2d790652a38a24554fdb7f21420d91efc Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 30 Sep 2022 20:25:55 -0400 Subject: [PATCH 06/29] mm/uffd: fix warning without PTE_MARKER_UFFD_WP compiled in When PTE_MARKER_UFFD_WP not configured, it's still possible to reach pte marker code and trigger an warning. Add a few CONFIG_PTE_MARKER_UFFD_WP ifdefs to make sure the code won't be reached when not compiled in. Link: https://lkml.kernel.org/r/YzeR+R6b4bwBlBHh@x1n Fixes: b1f9e876862d ("mm/uffd: enable write protection for shmem & hugetlbfs") Signed-off-by: Peter Xu Reported-by: Cc: Axel Rasmussen Cc: Brian Geffon Cc: Edward Liaw Cc: Liu Shixin Cc: Mike Kravetz Cc: Signed-off-by: Andrew Morton --- mm/hugetlb.c | 4 ++++ mm/memory.c | 2 ++ mm/mprotect.c | 2 ++ 3 files changed, 8 insertions(+) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 41d3aa0778373..9a910612336da 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5096,6 +5096,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct * unmapped and its refcount is dropped, so just clear pte here. */ if (unlikely(!pte_present(pte))) { +#ifdef CONFIG_PTE_MARKER_UFFD_WP /* * If the pte was wr-protected by uffd-wp in any of the * swap forms, meanwhile the caller does not want to @@ -5107,6 +5108,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct set_huge_pte_at(mm, address, ptep, make_pte_marker(PTE_MARKER_UFFD_WP)); else +#endif huge_pte_clear(mm, address, ptep, sz); spin_unlock(ptl); continue; @@ -5135,11 +5137,13 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct tlb_remove_huge_tlb_entry(h, tlb, ptep, address); if (huge_pte_dirty(pte)) set_page_dirty(page); +#ifdef CONFIG_PTE_MARKER_UFFD_WP /* Leave a uffd-wp pte marker if needed */ if (huge_pte_uffd_wp(pte) && !(zap_flags & ZAP_FLAG_DROP_MARKER)) set_huge_pte_at(mm, address, ptep, make_pte_marker(PTE_MARKER_UFFD_WP)); +#endif hugetlb_count_sub(pages_per_huge_page(h), mm); page_remove_rmap(page, vma, true); diff --git a/mm/memory.c b/mm/memory.c index df678fa30cdb9..2c7723ea43714 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1393,10 +1393,12 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, struct zap_details *details, pte_t pteval) { +#ifdef CONFIG_PTE_MARKER_UFFD_WP if (zap_drop_file_uffd_wp(details)) return; pte_install_uffd_wp_if_needed(vma, addr, pte, pteval); +#endif } static unsigned long zap_pte_range(struct mmu_gather *tlb, diff --git a/mm/mprotect.c b/mm/mprotect.c index 461dcbd4f21a6..668bfaa6ed2ae 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -267,6 +267,7 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, } else { /* It must be an none page, or what else?.. */ WARN_ON_ONCE(!pte_none(oldpte)); +#ifdef CONFIG_PTE_MARKER_UFFD_WP if (unlikely(uffd_wp && !vma_is_anonymous(vma))) { /* * For file-backed mem, we need to be able to @@ -278,6 +279,7 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, make_pte_marker(PTE_MARKER_UFFD_WP)); pages++; } +#endif } } while (pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); From deb0f6562884b5b4beb883d73e66a7d3a1b96d99 Mon Sep 17 00:00:00 2001 From: Carlos Llamas Date: Fri, 30 Sep 2022 00:38:43 +0000 Subject: [PATCH 07/29] mm/mmap: undo ->mmap() when arch_validate_flags() fails Commit c462ac288f2c ("mm: Introduce arch_validate_flags()") added a late check in mmap_region() to let architectures validate vm_flags. The check needs to happen after calling ->mmap() as the flags can potentially be modified during this callback. If arch_validate_flags() check fails we unmap and free the vma. However, the error path fails to undo the ->mmap() call that previously succeeded and depending on the specific ->mmap() implementation this translates to reference increments, memory allocations and other operations what will not be cleaned up. There are several places (mainly device drivers) where this is an issue. However, one specific example is bpf_map_mmap() which keeps count of the mappings in map->writecnt. The count is incremented on ->mmap() and then decremented on vm_ops->close(). When arch_validate_flags() fails this count is off since bpf_map_mmap_close() is never called. One can reproduce this issue in arm64 devices with MTE support. Here the vm_flags are checked to only allow VM_MTE if VM_MTE_ALLOWED has been set previously. From userspace then is enough to pass the PROT_MTE flag to mmap() syscall to trigger the arch_validate_flags() failure. The following program reproduces this issue: #include #include #include #include #include int main(void) { union bpf_attr attr = { .map_type = BPF_MAP_TYPE_ARRAY, .key_size = sizeof(int), .value_size = sizeof(long long), .max_entries = 256, .map_flags = BPF_F_MMAPABLE, }; int fd; fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr)); mmap(NULL, 4096, PROT_WRITE | PROT_MTE, MAP_SHARED, fd, 0); return 0; } By manually adding some log statements to the vm_ops callbacks we can confirm that when passing PROT_MTE to mmap() the map->writecnt is off upon ->release(): With PROT_MTE flag: root@debian:~# ./bpf-test [ 111.263874] bpf_map_write_active_inc: map=9 writecnt=1 [ 111.288763] bpf_map_release: map=9 writecnt=1 Without PROT_MTE flag: root@debian:~# ./bpf-test [ 157.816912] bpf_map_write_active_inc: map=10 writecnt=1 [ 157.830442] bpf_map_write_active_dec: map=10 writecnt=0 [ 157.832396] bpf_map_release: map=10 writecnt=0 This patch fixes the above issue by calling vm_ops->close() when the arch_validate_flags() check fails, after this we can proceed to unmap and free the vma on the error path. Link: https://lkml.kernel.org/r/20220930003844.1210987-1-cmllamas@google.com Fixes: c462ac288f2c ("mm: Introduce arch_validate_flags()") Signed-off-by: Carlos Llamas Reviewed-by: Catalin Marinas Acked-by: Andrii Nakryiko Reviewed-by: Liam Howlett Cc: Christian Brauner (Microsoft) Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: [5.10+] Signed-off-by: Andrew Morton --- mm/mmap.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/mmap.c b/mm/mmap.c index 5855f26639f98..bf2122af94e7a 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2673,7 +2673,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, if (!arch_validate_flags(vma->vm_flags)) { error = -EINVAL; if (file) - goto unmap_and_free_vma; + goto close_and_free_vma; else goto free_vma; } @@ -2742,6 +2742,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr, validate_mm(mm); return addr; +close_and_free_vma: + if (vma->vm_ops && vma->vm_ops->close) + vma->vm_ops->close(vma); unmap_and_free_vma: fput(vma->vm_file); vma->vm_file = NULL; From 4fa0e3ff217f775cb58d2d6d51820ec519243fb9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 12 Oct 2022 20:34:19 +0100 Subject: [PATCH 08/29] ext4,f2fs: fix readahead of verity data The recent change of page_cache_ra_unbounded() arguments was buggy in the two callers, causing us to readahead the wrong pages. Move the definition of ractl down to after the index is set correctly. This affected performance on configurations that use fs-verity. Link: https://lkml.kernel.org/r/20221012193419.1453558-1-willy@infradead.org Fixes: 73bb49da50cd ("mm/readahead: make page_cache_ra_unbounded take a readahead_control") Signed-off-by: Matthew Wilcox (Oracle) Reported-by: Jintao Yin Signed-off-by: Andrew Morton --- fs/ext4/verity.c | 3 ++- fs/f2fs/verity.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index b051d19b5c8a0..94442c690ca7d 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -365,13 +365,14 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode, pgoff_t index, unsigned long num_ra_pages) { - DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index); struct page *page; index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT; page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED); if (!page || !PageUptodate(page)) { + DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index); + if (page) put_page(page); else if (num_ra_pages > 1) diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index 7b8f2b41c29b1..c0733f8670746 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -262,13 +262,14 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode, pgoff_t index, unsigned long num_ra_pages) { - DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index); struct page *page; index += f2fs_verity_metadata_pos(inode) >> PAGE_SHIFT; page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED); if (!page || !PageUptodate(page)) { + DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index); + if (page) put_page(page); else if (num_ra_pages > 1) From ac801e7e252c5588325e3c983c7d4167fc68c024 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Mon, 5 Sep 2022 14:24:27 +0200 Subject: [PATCH 09/29] kmsan: unpoison @tlb in arch_tlb_gather_mmu() This is an optimization to reduce stackdepot pressure. struct mmu_gather contains 7 1-bit fields packed into a 32-bit unsigned int value. The remaining 25 bits remain uninitialized and are never used, but KMSAN updates the origin for them in zap_pXX_range() in mm/memory.c, thus creating very long origin chains. This is technically correct, but consumes too much memory. Unpoisoning the whole structure will prevent creating such chains. Link: https://lkml.kernel.org/r/20220905122452.2258262-20-glider@google.com Signed-off-by: Alexander Potapenko Acked-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Liu Shixin Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/mmu_gather.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index a71924bd38c0d..add4244e5790d 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -265,6 +266,15 @@ void tlb_flush_mmu(struct mmu_gather *tlb) static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm) { + /* + * struct mmu_gather contains 7 1-bit fields packed into a 32-bit + * unsigned int value. The remaining 25 bits remain uninitialized + * and are never used, but KMSAN updates the origin for them in + * zap_pXX_range() in mm/memory.c, thus creating very long origin + * chains. This is technically correct, but consumes too much memory. + * Unpoisoning the whole structure will prevent creating such chains. + */ + kmsan_unpoison_memory(tlb, sizeof(*tlb)); tlb->mm = mm; tlb->fullmm = fullmm; From ea091fa53680030881b56520d731e36d3ff6cdd5 Mon Sep 17 00:00:00 2001 From: Xiaoke Wang Date: Fri, 4 Mar 2022 17:12:15 +0800 Subject: [PATCH 10/29] lib/test_meminit: add checks for the allocation functions alloc_pages(), kmalloc() and vmalloc() are all memory allocation functions which can return NULL when some internal memory failures happen. So it is better to check the return of them to catch the failure in time for better test them. Link: https://lkml.kernel.org/r/tencent_D44A49FFB420EDCCBFB9221C8D14DFE12908@qq.com Signed-off-by: Xiaoke Wang Reviewed-by: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Signed-off-by: Andrew Morton --- lib/test_meminit.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/lib/test_meminit.c b/lib/test_meminit.c index c95db11a69064..60e1984c060fa 100644 --- a/lib/test_meminit.c +++ b/lib/test_meminit.c @@ -67,17 +67,24 @@ static int __init do_alloc_pages_order(int order, int *total_failures) size_t size = PAGE_SIZE << order; page = alloc_pages(GFP_KERNEL, order); + if (!page) + goto err; buf = page_address(page); fill_with_garbage(buf, size); __free_pages(page, order); page = alloc_pages(GFP_KERNEL, order); + if (!page) + goto err; buf = page_address(page); if (count_nonzero_bytes(buf, size)) (*total_failures)++; fill_with_garbage(buf, size); __free_pages(page, order); return 1; +err: + (*total_failures)++; + return 1; } /* Test the page allocator by calling alloc_pages with different orders. */ @@ -100,15 +107,22 @@ static int __init do_kmalloc_size(size_t size, int *total_failures) void *buf; buf = kmalloc(size, GFP_KERNEL); + if (!buf) + goto err; fill_with_garbage(buf, size); kfree(buf); buf = kmalloc(size, GFP_KERNEL); + if (!buf) + goto err; if (count_nonzero_bytes(buf, size)) (*total_failures)++; fill_with_garbage(buf, size); kfree(buf); return 1; +err: + (*total_failures)++; + return 1; } /* Test vmalloc() with given parameters. */ @@ -117,15 +131,22 @@ static int __init do_vmalloc_size(size_t size, int *total_failures) void *buf; buf = vmalloc(size); + if (!buf) + goto err; fill_with_garbage(buf, size); vfree(buf); buf = vmalloc(size); + if (!buf) + goto err; if (count_nonzero_bytes(buf, size)) (*total_failures)++; fill_with_garbage(buf, size); vfree(buf); return 1; +err: + (*total_failures)++; + return 1; } /* Test kmalloc()/vmalloc() by allocating objects of different sizes. */ From 652e04464d3944226052c827bdaaf5113b072870 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Tue, 27 Sep 2022 08:19:45 +0800 Subject: [PATCH 11/29] mm/damon: move sz_damon_region to damon_sz_region Rename sz_damon_region() to damon_sz_region(), and move it to "include/linux/damon.h", because in many places, we can to use this func. Link: https://lkml.kernel.org/r/20220927001946.85375-1-xhao@linux.alibaba.com Signed-off-by: Xin Hao Suggested-by: SeongJae Park Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 6 ++++++ mm/damon/core.c | 9 ++------- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index ed5470f50babd..620ada094c3b2 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -484,6 +484,12 @@ static inline struct damon_region *damon_first_region(struct damon_target *t) return list_first_entry(&t->regions_list, struct damon_region, list); } +static inline unsigned long damon_sz_region(struct damon_region *r) +{ + return r->ar.end - r->ar.start; +} + + #define damon_for_each_region(r, t) \ list_for_each_entry(r, &t->regions_list, list) diff --git a/mm/damon/core.c b/mm/damon/core.c index 4de8c7c529794..5b9e0d585aef2 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -864,18 +864,13 @@ static void kdamond_apply_schemes(struct damon_ctx *c) } } -static inline unsigned long sz_damon_region(struct damon_region *r) -{ - return r->ar.end - r->ar.start; -} - /* * Merge two adjacent regions into one region */ static void damon_merge_two_regions(struct damon_target *t, struct damon_region *l, struct damon_region *r) { - unsigned long sz_l = sz_damon_region(l), sz_r = sz_damon_region(r); + unsigned long sz_l = damon_sz_region(l), sz_r = damon_sz_region(r); l->nr_accesses = (l->nr_accesses * sz_l + r->nr_accesses * sz_r) / (sz_l + sz_r); @@ -904,7 +899,7 @@ static void damon_merge_regions_of(struct damon_target *t, unsigned int thres, if (prev && prev->ar.end == r->ar.start && abs(prev->nr_accesses - r->nr_accesses) <= thres && - sz_damon_region(prev) + sz_damon_region(r) <= sz_limit) + damon_sz_region(prev) + damon_sz_region(r) <= sz_limit) damon_merge_two_regions(t, prev, r); else prev = r; From ab63f63f3885d492e62da55304b0483a2a9e6a7d Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Tue, 27 Sep 2022 08:19:46 +0800 Subject: [PATCH 12/29] mm/damon: use damon_sz_region() in appropriate place In many places we can use damon_sz_region() to instead of "r->ar.end - r->ar.start". Link: https://lkml.kernel.org/r/20220927001946.85375-2-xhao@linux.alibaba.com Signed-off-by: Xin Hao Suggested-by: SeongJae Park Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 17 ++++++++--------- mm/damon/vaddr.c | 4 ++-- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 5b9e0d585aef2..515ac4e52a113 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -490,7 +490,7 @@ static unsigned long damon_region_sz_limit(struct damon_ctx *ctx) damon_for_each_target(t, ctx) { damon_for_each_region(r, t) - sz += r->ar.end - r->ar.start; + sz += damon_sz_region(r); } if (ctx->attrs.min_nr_regions) @@ -673,7 +673,7 @@ static bool __damos_valid_target(struct damon_region *r, struct damos *s) { unsigned long sz; - sz = r->ar.end - r->ar.start; + sz = damon_sz_region(r); return s->pattern.min_sz_region <= sz && sz <= s->pattern.max_sz_region && s->pattern.min_nr_accesses <= r->nr_accesses && @@ -701,7 +701,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, damon_for_each_scheme(s, c) { struct damos_quota *quota = &s->quota; - unsigned long sz = r->ar.end - r->ar.start; + unsigned long sz = damon_sz_region(r); struct timespec64 begin, end; unsigned long sz_applied = 0; @@ -730,14 +730,14 @@ static void damon_do_apply_schemes(struct damon_ctx *c, sz = ALIGN_DOWN(quota->charge_addr_from - r->ar.start, DAMON_MIN_REGION); if (!sz) { - if (r->ar.end - r->ar.start <= - DAMON_MIN_REGION) + if (damon_sz_region(r) <= + DAMON_MIN_REGION) continue; sz = DAMON_MIN_REGION; } damon_split_region_at(t, r, sz); r = damon_next_region(r); - sz = r->ar.end - r->ar.start; + sz = damon_sz_region(r); } quota->charge_target_from = NULL; quota->charge_addr_from = 0; @@ -842,8 +842,7 @@ static void kdamond_apply_schemes(struct damon_ctx *c) continue; score = c->ops.get_scheme_score( c, t, r, s); - quota->histogram[score] += - r->ar.end - r->ar.start; + quota->histogram[score] += damon_sz_region(r); if (score > max_score) max_score = score; } @@ -957,7 +956,7 @@ static void damon_split_regions_of(struct damon_target *t, int nr_subs) int i; damon_for_each_region_safe(r, next, t) { - sz_region = r->ar.end - r->ar.start; + sz_region = damon_sz_region(r); for (i = 0; i < nr_subs - 1 && sz_region > 2 * DAMON_MIN_REGION; i++) { diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index ea94e0b2c3113..15f03df66db60 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -72,7 +72,7 @@ static int damon_va_evenly_split_region(struct damon_target *t, return -EINVAL; orig_end = r->ar.end; - sz_orig = r->ar.end - r->ar.start; + sz_orig = damon_sz_region(r); sz_piece = ALIGN_DOWN(sz_orig / nr_pieces, DAMON_MIN_REGION); if (!sz_piece) @@ -618,7 +618,7 @@ static unsigned long damos_madvise(struct damon_target *target, { struct mm_struct *mm; unsigned long start = PAGE_ALIGN(r->ar.start); - unsigned long len = PAGE_ALIGN(r->ar.end - r->ar.start); + unsigned long len = PAGE_ALIGN(damon_sz_region(r)); unsigned long applied; mm = damon_get_mm(target); From 16ce101db85db694a91380aa4c89b25530871d33 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Wed, 28 Sep 2022 22:01:15 +1000 Subject: [PATCH 13/29] mm/memory.c: fix race when faulting a device private page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Fix several device private page reference counting issues", v2 This series aims to fix a number of page reference counting issues in drivers dealing with device private ZONE_DEVICE pages. These result in use-after-free type bugs, either from accessing a struct page which no longer exists because it has been removed or accessing fields within the struct page which are no longer valid because the page has been freed. During normal usage it is unlikely these will cause any problems. However without these fixes it is possible to crash the kernel from userspace. These crashes can be triggered either by unloading the kernel module or unbinding the device from the driver prior to a userspace task exiting. In modules such as Nouveau it is also possible to trigger some of these issues by explicitly closing the device file-descriptor prior to the task exiting and then accessing device private memory. This involves some minor changes to both PowerPC and AMD GPU code. Unfortunately I lack hardware to test either of those so any help there would be appreciated. The changes mimic what is done in for both Nouveau and hmm-tests though so I doubt they will cause problems. This patch (of 8): When the CPU tries to access a device private page the migrate_to_ram() callback associated with the pgmap for the page is called. However no reference is taken on the faulting page. Therefore a concurrent migration of the device private page can free the page and possibly the underlying pgmap. This results in a race which can crash the kernel due to the migrate_to_ram() function pointer becoming invalid. It also means drivers can't reliably read the zone_device_data field because the page may have been freed with memunmap_pages(). Close the race by getting a reference on the page while holding the ptl to ensure it has not been freed. Unfortunately the elevated reference count will cause the migration required to handle the fault to fail. To avoid this failure pass the faulting page into the migrate_vma functions so that if an elevated reference count is found it can be checked to see if it's expected or not. [mpe@ellerman.id.au: fix build] Link: https://lkml.kernel.org/r/87fsgbf3gh.fsf@mpe.ellerman.id.au Link: https://lkml.kernel.org/r/cover.60659b549d8509ddecafad4f498ee7f03bb23c69.1664366292.git-series.apopple@nvidia.com Link: https://lkml.kernel.org/r/d3e813178a59e565e8d78d9b9a4e2562f6494f90.1664366292.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Acked-by: Felix Kuehling Cc: Jason Gunthorpe Cc: John Hubbard Cc: Ralph Campbell Cc: Michael Ellerman Cc: Lyude Paul Cc: Alex Deucher Cc: Alex Sierra Cc: Ben Skeggs Cc: Christian König Cc: Dan Williams Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Matthew Wilcox Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/powerpc/kvm/book3s_hv_uvmem.c | 19 +++++++------ drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 17 +++++++----- drivers/gpu/drm/amd/amdkfd/kfd_migrate.h | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 11 +++++--- include/linux/migrate.h | 8 ++++++ lib/test_hmm.c | 7 ++--- mm/memory.c | 16 ++++++++++- mm/migrate.c | 34 ++++++++++++++---------- mm/migrate_device.c | 18 +++++++++---- 9 files changed, 89 insertions(+), 43 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 5980063016207..965c9e9e500bc 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -508,10 +508,10 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm) static int __kvmppc_svm_page_out(struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long page_shift, - struct kvm *kvm, unsigned long gpa) + struct kvm *kvm, unsigned long gpa, struct page *fault_page) { unsigned long src_pfn, dst_pfn = 0; - struct migrate_vma mig; + struct migrate_vma mig = { 0 }; struct page *dpage, *spage; struct kvmppc_uvmem_page_pvt *pvt; unsigned long pfn; @@ -525,6 +525,7 @@ static int __kvmppc_svm_page_out(struct vm_area_struct *vma, mig.dst = &dst_pfn; mig.pgmap_owner = &kvmppc_uvmem_pgmap; mig.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE; + mig.fault_page = fault_page; /* The requested page is already paged-out, nothing to do */ if (!kvmppc_gfn_is_uvmem_pfn(gpa >> page_shift, kvm, NULL)) @@ -580,12 +581,14 @@ static int __kvmppc_svm_page_out(struct vm_area_struct *vma, static inline int kvmppc_svm_page_out(struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long page_shift, - struct kvm *kvm, unsigned long gpa) + struct kvm *kvm, unsigned long gpa, + struct page *fault_page) { int ret; mutex_lock(&kvm->arch.uvmem_lock); - ret = __kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa); + ret = __kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa, + fault_page); mutex_unlock(&kvm->arch.uvmem_lock); return ret; @@ -634,7 +637,7 @@ void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *slot, pvt->remove_gfn = true; if (__kvmppc_svm_page_out(vma, addr, addr + PAGE_SIZE, - PAGE_SHIFT, kvm, pvt->gpa)) + PAGE_SHIFT, kvm, pvt->gpa, NULL)) pr_err("Can't page out gpa:0x%lx addr:0x%lx\n", pvt->gpa, addr); } else { @@ -736,7 +739,7 @@ static int kvmppc_svm_page_in(struct vm_area_struct *vma, bool pagein) { unsigned long src_pfn, dst_pfn = 0; - struct migrate_vma mig; + struct migrate_vma mig = { 0 }; struct page *spage; unsigned long pfn; struct page *dpage; @@ -994,7 +997,7 @@ static vm_fault_t kvmppc_uvmem_migrate_to_ram(struct vm_fault *vmf) if (kvmppc_svm_page_out(vmf->vma, vmf->address, vmf->address + PAGE_SIZE, PAGE_SHIFT, - pvt->kvm, pvt->gpa)) + pvt->kvm, pvt->gpa, vmf->page)) return VM_FAULT_SIGBUS; else return 0; @@ -1065,7 +1068,7 @@ kvmppc_h_svm_page_out(struct kvm *kvm, unsigned long gpa, if (!vma || vma->vm_start > start || vma->vm_end < end) goto out; - if (!kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa)) + if (!kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa, NULL)) ret = H_SUCCESS; out: mmap_read_unlock(kvm->mm); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index b059a77b6081d..776448bd9fe4a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -409,7 +409,7 @@ svm_migrate_vma_to_vram(struct amdgpu_device *adev, struct svm_range *prange, uint64_t npages = (end - start) >> PAGE_SHIFT; struct kfd_process_device *pdd; struct dma_fence *mfence = NULL; - struct migrate_vma migrate; + struct migrate_vma migrate = { 0 }; unsigned long cpages = 0; dma_addr_t *scratch; void *buf; @@ -668,7 +668,7 @@ svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct svm_range *prange, static long svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange, struct vm_area_struct *vma, uint64_t start, uint64_t end, - uint32_t trigger) + uint32_t trigger, struct page *fault_page) { struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms); uint64_t npages = (end - start) >> PAGE_SHIFT; @@ -676,7 +676,7 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange, unsigned long cpages = 0; struct kfd_process_device *pdd; struct dma_fence *mfence = NULL; - struct migrate_vma migrate; + struct migrate_vma migrate = { 0 }; dma_addr_t *scratch; void *buf; int r = -ENOMEM; @@ -699,6 +699,7 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange, migrate.src = buf; migrate.dst = migrate.src + npages; + migrate.fault_page = fault_page; scratch = (dma_addr_t *)(migrate.dst + npages); kfd_smi_event_migration_start(adev->kfd.dev, p->lead_thread->pid, @@ -766,7 +767,7 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange, * 0 - OK, otherwise error code */ int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm, - uint32_t trigger) + uint32_t trigger, struct page *fault_page) { struct amdgpu_device *adev; struct vm_area_struct *vma; @@ -807,7 +808,8 @@ int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm, } next = min(vma->vm_end, end); - r = svm_migrate_vma_to_ram(adev, prange, vma, addr, next, trigger); + r = svm_migrate_vma_to_ram(adev, prange, vma, addr, next, trigger, + fault_page); if (r < 0) { pr_debug("failed %ld to migrate prange %p\n", r, prange); break; @@ -851,7 +853,7 @@ svm_migrate_vram_to_vram(struct svm_range *prange, uint32_t best_loc, pr_debug("from gpu 0x%x to gpu 0x%x\n", prange->actual_loc, best_loc); do { - r = svm_migrate_vram_to_ram(prange, mm, trigger); + r = svm_migrate_vram_to_ram(prange, mm, trigger, NULL); if (r) return r; } while (prange->actual_loc && --retries); @@ -938,7 +940,8 @@ static vm_fault_t svm_migrate_to_ram(struct vm_fault *vmf) goto out_unlock_prange; } - r = svm_migrate_vram_to_ram(prange, mm, KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU); + r = svm_migrate_vram_to_ram(prange, mm, KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU, + vmf->page); if (r) pr_debug("failed %d migrate 0x%p [0x%lx 0x%lx] to ram\n", r, prange, prange->start, prange->last); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h index b3f0754b32faa..a5d7e6d222646 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h @@ -43,7 +43,7 @@ enum MIGRATION_COPY_DIR { int svm_migrate_to_vram(struct svm_range *prange, uint32_t best_loc, struct mm_struct *mm, uint32_t trigger); int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm, - uint32_t trigger); + uint32_t trigger, struct page *fault_page); unsigned long svm_migrate_addr_to_pfn(struct amdgpu_device *adev, unsigned long addr); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 11074cc8c333b..9139e5a0b2a07 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -2913,13 +2913,15 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, */ if (prange->actual_loc) r = svm_migrate_vram_to_ram(prange, mm, - KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU); + KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU, + NULL); else r = 0; } } else { r = svm_migrate_vram_to_ram(prange, mm, - KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU); + KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU, + NULL); } if (r) { pr_debug("failed %d to migrate svms %p [0x%lx 0x%lx]\n", @@ -3242,7 +3244,8 @@ svm_range_trigger_migration(struct mm_struct *mm, struct svm_range *prange, return 0; if (!best_loc) { - r = svm_migrate_vram_to_ram(prange, mm, KFD_MIGRATE_TRIGGER_PREFETCH); + r = svm_migrate_vram_to_ram(prange, mm, + KFD_MIGRATE_TRIGGER_PREFETCH, NULL); *migrated = !r; return r; } @@ -3303,7 +3306,7 @@ static void svm_range_evict_svm_bo_worker(struct work_struct *work) mutex_lock(&prange->migrate_mutex); do { r = svm_migrate_vram_to_ram(prange, mm, - KFD_MIGRATE_TRIGGER_TTM_EVICTION); + KFD_MIGRATE_TRIGGER_TTM_EVICTION, NULL); } while (!r && prange->actual_loc && --retries); if (!r && prange->actual_loc) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 704a04f5a0746..52090d1f92307 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -62,6 +62,8 @@ extern const char *migrate_reason_names[MR_TYPES]; #ifdef CONFIG_MIGRATION extern void putback_movable_pages(struct list_head *l); +int migrate_folio_extra(struct address_space *mapping, struct folio *dst, + struct folio *src, enum migrate_mode mode, int extra_count); int migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode); extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, @@ -197,6 +199,12 @@ struct migrate_vma { */ void *pgmap_owner; unsigned long flags; + + /* + * Set to vmf->page if this is being called to migrate a page as part of + * a migrate_to_ram() callback. + */ + struct page *fault_page; }; int migrate_vma_setup(struct migrate_vma *args); diff --git a/lib/test_hmm.c b/lib/test_hmm.c index 6a33f6b1b4651..e566166b55712 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -907,7 +907,7 @@ static int dmirror_migrate_to_system(struct dmirror *dmirror, struct vm_area_struct *vma; unsigned long src_pfns[64] = { 0 }; unsigned long dst_pfns[64] = { 0 }; - struct migrate_vma args; + struct migrate_vma args = { 0 }; unsigned long next; int ret; @@ -968,7 +968,7 @@ static int dmirror_migrate_to_device(struct dmirror *dmirror, unsigned long src_pfns[64] = { 0 }; unsigned long dst_pfns[64] = { 0 }; struct dmirror_bounce bounce; - struct migrate_vma args; + struct migrate_vma args = { 0 }; unsigned long next; int ret; @@ -1334,7 +1334,7 @@ static void dmirror_devmem_free(struct page *page) static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf) { - struct migrate_vma args; + struct migrate_vma args = { 0 }; unsigned long src_pfns = 0; unsigned long dst_pfns = 0; struct page *rpage; @@ -1357,6 +1357,7 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf) args.dst = &dst_pfns; args.pgmap_owner = dmirror->mdevice; args.flags = dmirror_select_device(dmirror); + args.fault_page = vmf->page; if (migrate_vma_setup(&args)) return VM_FAULT_SIGBUS; diff --git a/mm/memory.c b/mm/memory.c index 2c7723ea43714..4ad6077164cd2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3750,7 +3750,21 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) ret = remove_device_exclusive_entry(vmf); } else if (is_device_private_entry(entry)) { vmf->page = pfn_swap_entry_to_page(entry); - ret = vmf->page->pgmap->ops->migrate_to_ram(vmf); + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) { + spin_unlock(vmf->ptl); + goto out; + } + + /* + * Get a page reference while we know the page can't be + * freed. + */ + get_page(vmf->page); + pte_unmap_unlock(vmf->pte, vmf->ptl); + vmf->page->pgmap->ops->migrate_to_ram(vmf); + put_page(vmf->page); } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; } else if (is_swapin_error_entry(entry)) { diff --git a/mm/migrate.c b/mm/migrate.c index c228afba0963d..1379e1912772e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -625,6 +625,25 @@ EXPORT_SYMBOL(folio_migrate_copy); * Migration functions ***********************************************************/ +int migrate_folio_extra(struct address_space *mapping, struct folio *dst, + struct folio *src, enum migrate_mode mode, int extra_count) +{ + int rc; + + BUG_ON(folio_test_writeback(src)); /* Writeback must be complete */ + + rc = folio_migrate_mapping(mapping, dst, src, extra_count); + + if (rc != MIGRATEPAGE_SUCCESS) + return rc; + + if (mode != MIGRATE_SYNC_NO_COPY) + folio_migrate_copy(dst, src); + else + folio_migrate_flags(dst, src); + return MIGRATEPAGE_SUCCESS; +} + /** * migrate_folio() - Simple folio migration. * @mapping: The address_space containing the folio. @@ -640,20 +659,7 @@ EXPORT_SYMBOL(folio_migrate_copy); int migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) { - int rc; - - BUG_ON(folio_test_writeback(src)); /* Writeback must be complete */ - - rc = folio_migrate_mapping(mapping, dst, src, 0); - - if (rc != MIGRATEPAGE_SUCCESS) - return rc; - - if (mode != MIGRATE_SYNC_NO_COPY) - folio_migrate_copy(dst, src); - else - folio_migrate_flags(dst, src); - return MIGRATEPAGE_SUCCESS; + return migrate_folio_extra(mapping, dst, src, mode, 0); } EXPORT_SYMBOL(migrate_folio); diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 5ab6ab9d2ed82..8dee38ffcda25 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -325,14 +325,14 @@ static void migrate_vma_collect(struct migrate_vma *migrate) * folio_migrate_mapping(), except that here we allow migration of a * ZONE_DEVICE page. */ -static bool migrate_vma_check_page(struct page *page) +static bool migrate_vma_check_page(struct page *page, struct page *fault_page) { /* * One extra ref because caller holds an extra reference, either from * isolate_lru_page() for a regular page, or migrate_vma_collect() for * a device page. */ - int extra = 1; + int extra = 1 + (page == fault_page); /* * FIXME support THP (transparent huge page), it is bit more complex to @@ -405,7 +405,8 @@ static void migrate_vma_unmap(struct migrate_vma *migrate) if (folio_mapped(folio)) try_to_migrate(folio, 0); - if (page_mapped(page) || !migrate_vma_check_page(page)) { + if (page_mapped(page) || + !migrate_vma_check_page(page, migrate->fault_page)) { if (!is_zone_device_page(page)) { get_page(page); putback_lru_page(page); @@ -517,6 +518,8 @@ int migrate_vma_setup(struct migrate_vma *args) return -EINVAL; if (!args->src || !args->dst) return -EINVAL; + if (args->fault_page && !is_device_private_page(args->fault_page)) + return -EINVAL; memset(args->src, 0, sizeof(*args->src) * nr_pages); args->cpages = 0; @@ -747,8 +750,13 @@ void migrate_vma_pages(struct migrate_vma *migrate) continue; } - r = migrate_folio(mapping, page_folio(newpage), - page_folio(page), MIGRATE_SYNC_NO_COPY); + if (migrate->fault_page == page) + r = migrate_folio_extra(mapping, page_folio(newpage), + page_folio(page), + MIGRATE_SYNC_NO_COPY, 1); + else + r = migrate_folio(mapping, page_folio(newpage), + page_folio(page), MIGRATE_SYNC_NO_COPY); if (r != MIGRATEPAGE_SUCCESS) migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; } From ef233450898f8893dafa193a9f3211fa077a3d05 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Wed, 28 Sep 2022 22:01:16 +1000 Subject: [PATCH 14/29] mm: free device private pages have zero refcount MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since 27674ef6c73f ("mm: remove the extra ZONE_DEVICE struct page refcount") device private pages have no longer had an extra reference count when the page is in use. However before handing them back to the owning device driver we add an extra reference count such that free pages have a reference count of one. This makes it difficult to tell if a page is free or not because both free and in use pages will have a non-zero refcount. Instead we should return pages to the drivers page allocator with a zero reference count. Kernel code can then safely use kernel functions such as get_page_unless_zero(). Link: https://lkml.kernel.org/r/cf70cf6f8c0bdb8aaebdbfb0d790aea4c683c3c6.1664366292.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Acked-by: Felix Kuehling Cc: Jason Gunthorpe Cc: Michael Ellerman Cc: Alex Deucher Cc: Christian König Cc: Ben Skeggs Cc: Lyude Paul Cc: Ralph Campbell Cc: Alex Sierra Cc: John Hubbard Cc: Dan Williams Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Matthew Wilcox Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/powerpc/kvm/book3s_hv_uvmem.c | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 2 +- drivers/gpu/drm/nouveau/nouveau_dmem.c | 2 +- include/linux/memremap.h | 1 + lib/test_hmm.c | 2 +- mm/memremap.c | 9 +++++++++ mm/page_alloc.c | 8 ++++++++ 7 files changed, 22 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 965c9e9e500bc..e2f11f9c3f2aa 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -718,7 +718,7 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm) dpage = pfn_to_page(uvmem_pfn); dpage->zone_device_data = pvt; - lock_page(dpage); + zone_device_page_init(dpage); return dpage; out_clear: spin_lock(&kvmppc_uvmem_bitmap_lock); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index 776448bd9fe4a..97a684568ae01 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -223,7 +223,7 @@ svm_migrate_get_vram_page(struct svm_range *prange, unsigned long pfn) page = pfn_to_page(pfn); svm_range_bo_ref(prange->svm_bo); page->zone_device_data = prange->svm_bo; - lock_page(page); + zone_device_page_init(page); } static void diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index 16356611b5b95..b092988266a6a 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -326,7 +326,7 @@ nouveau_dmem_page_alloc_locked(struct nouveau_drm *drm) return NULL; } - lock_page(page); + zone_device_page_init(page); return page; } diff --git a/include/linux/memremap.h b/include/linux/memremap.h index c3b4cc84877b5..7fcaf3180a5b6 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -187,6 +187,7 @@ static inline bool folio_is_device_coherent(const struct folio *folio) } #ifdef CONFIG_ZONE_DEVICE +void zone_device_page_init(struct page *page); void *memremap_pages(struct dev_pagemap *pgmap, int nid); void memunmap_pages(struct dev_pagemap *pgmap); void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap); diff --git a/lib/test_hmm.c b/lib/test_hmm.c index e566166b55712..bc2b949911653 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -627,8 +627,8 @@ static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice) goto error; } + zone_device_page_init(dpage); dpage->zone_device_data = rpage; - lock_page(dpage); return dpage; error: diff --git a/mm/memremap.c b/mm/memremap.c index 25029a474d30b..1c2c038f34109 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -505,8 +505,17 @@ void free_zone_device_page(struct page *page) /* * Reset the page count to 1 to prepare for handing out the page again. */ + if (page->pgmap->type != MEMORY_DEVICE_PRIVATE && + page->pgmap->type != MEMORY_DEVICE_COHERENT) + set_page_count(page, 1); +} + +void zone_device_page_init(struct page *page) +{ set_page_count(page, 1); + lock_page(page); } +EXPORT_SYMBOL_GPL(zone_device_page_init); #ifdef CONFIG_FS_DAX bool __put_devmap_managed_page_refs(struct page *page, int refs) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 12b6184cbbed6..059f6946832fa 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6819,6 +6819,14 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn, set_pageblock_migratetype(page, MIGRATE_MOVABLE); cond_resched(); } + + /* + * ZONE_DEVICE pages are released directly to the driver page allocator + * which will set the page count to 1 when allocating the page. + */ + if (pgmap->type == MEMORY_DEVICE_PRIVATE || + pgmap->type == MEMORY_DEVICE_COHERENT) + set_page_count(page, 0); } /* From 0dc45ca1ce18900572282c4f054bbe78351cb6a7 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Wed, 28 Sep 2022 22:01:17 +1000 Subject: [PATCH 15/29] mm/memremap.c: take a pgmap reference on page allocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ZONE_DEVICE pages have a struct dev_pagemap which is allocated by a driver. When the struct page is first allocated by the kernel in memremap_pages() a reference is taken on the associated pagemap to ensure it is not freed prior to the pages being freed. Prior to 27674ef6c73f ("mm: remove the extra ZONE_DEVICE struct page refcount") pages were considered free and returned to the driver when the reference count dropped to one. However the pagemap reference was not dropped until the page reference count hit zero. This would occur as part of the final put_page() in memunmap_pages() which would wait for all pages to be freed prior to returning. When the extra refcount was removed the pagemap reference was no longer being dropped in put_page(). Instead memunmap_pages() was changed to explicitly drop the pagemap references. This means that memunmap_pages() can complete even though pages are still mapped by the kernel which can lead to kernel crashes, particularly if a driver frees the pagemap. To fix this drivers should take a pagemap reference when allocating the page. This reference can then be returned when the page is freed. Link: https://lkml.kernel.org/r/12d155ec727935ebfbb4d639a03ab374917ea51b.1664366292.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Fixes: 27674ef6c73f ("mm: remove the extra ZONE_DEVICE struct page refcount") Cc: Jason Gunthorpe Cc: Felix Kuehling Cc: Alex Deucher Cc: Christian König Cc: Ben Skeggs Cc: Lyude Paul Cc: Ralph Campbell Cc: Alex Sierra Cc: John Hubbard Cc: Dan Williams Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/memremap.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/mm/memremap.c b/mm/memremap.c index 1c2c038f34109..421bec3a29ee7 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -138,8 +138,11 @@ void memunmap_pages(struct dev_pagemap *pgmap) int i; percpu_ref_kill(&pgmap->ref); - for (i = 0; i < pgmap->nr_range; i++) - percpu_ref_put_many(&pgmap->ref, pfn_len(pgmap, i)); + if (pgmap->type != MEMORY_DEVICE_PRIVATE && + pgmap->type != MEMORY_DEVICE_COHERENT) + for (i = 0; i < pgmap->nr_range; i++) + percpu_ref_put_many(&pgmap->ref, pfn_len(pgmap, i)); + wait_for_completion(&pgmap->done); for (i = 0; i < pgmap->nr_range; i++) @@ -264,7 +267,9 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params, memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], PHYS_PFN(range->start), PHYS_PFN(range_len(range)), pgmap); - percpu_ref_get_many(&pgmap->ref, pfn_len(pgmap, range_id)); + if (pgmap->type != MEMORY_DEVICE_PRIVATE && + pgmap->type != MEMORY_DEVICE_COHERENT) + percpu_ref_get_many(&pgmap->ref, pfn_len(pgmap, range_id)); return 0; err_add_memory: @@ -502,16 +507,24 @@ void free_zone_device_page(struct page *page) page->mapping = NULL; page->pgmap->ops->page_free(page); - /* - * Reset the page count to 1 to prepare for handing out the page again. - */ if (page->pgmap->type != MEMORY_DEVICE_PRIVATE && page->pgmap->type != MEMORY_DEVICE_COHERENT) + /* + * Reset the page count to 1 to prepare for handing out the page + * again. + */ set_page_count(page, 1); + else + put_dev_pagemap(page->pgmap); } void zone_device_page_init(struct page *page) { + /* + * Drivers shouldn't be allocating pages after calling + * memunmap_pages(). + */ + WARN_ON_ONCE(!percpu_ref_tryget_live(&page->pgmap->ref)); set_page_count(page, 1); lock_page(page); } From 241f68859656836ae3e85179cc224cc4c5e4e6a7 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Wed, 28 Sep 2022 22:01:18 +1000 Subject: [PATCH 16/29] mm/migrate_device.c: refactor migrate_vma and migrate_deivce_coherent_page() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit migrate_device_coherent_page() reuses the existing migrate_vma family of functions to migrate a specific page without providing a valid mapping or vma. This looks a bit odd because it means we are calling migrate_vma_*() without setting a valid vma, however it was considered acceptable at the time because the details were internal to migrate_device.c and there was only a single user. One of the reasons the details could be kept internal was that this was strictly for migrating device coherent memory. Such memory can be copied directly by the CPU without intervention from a driver. However this isn't true for device private memory, and a future change requires similar functionality for device private memory. So refactor the code into something more sensible for migrating device memory without a vma. Link: https://lkml.kernel.org/r/c7b2ff84e9b33d022cf4a40f87d051f281a16d8f.1664366292.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Cc: "Huang, Ying" Cc: Zi Yan Cc: Matthew Wilcox Cc: Yang Shi Cc: David Hildenbrand Cc: Ralph Campbell Cc: John Hubbard Cc: Alex Deucher Cc: Alex Sierra Cc: Ben Skeggs Cc: Christian König Cc: Dan Williams Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Lyude Paul Cc: Michael Ellerman Signed-off-by: Andrew Morton --- mm/migrate_device.c | 150 +++++++++++++++++++++++++------------------- 1 file changed, 85 insertions(+), 65 deletions(-) diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 8dee38ffcda25..7707c1d898f55 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -357,26 +357,20 @@ static bool migrate_vma_check_page(struct page *page, struct page *fault_page) } /* - * migrate_vma_unmap() - replace page mapping with special migration pte entry - * @migrate: migrate struct containing all migration information - * - * Isolate pages from the LRU and replace mappings (CPU page table pte) with a - * special migration pte entry and check if it has been pinned. Pinned pages are - * restored because we cannot migrate them. - * - * This is the last step before we call the device driver callback to allocate - * destination memory and copy contents of original page over to new page. + * Unmaps pages for migration. Returns number of unmapped pages. */ -static void migrate_vma_unmap(struct migrate_vma *migrate) +static unsigned long migrate_device_unmap(unsigned long *src_pfns, + unsigned long npages, + struct page *fault_page) { - const unsigned long npages = migrate->npages; unsigned long i, restore = 0; bool allow_drain = true; + unsigned long unmapped = 0; lru_add_drain(); for (i = 0; i < npages; i++) { - struct page *page = migrate_pfn_to_page(migrate->src[i]); + struct page *page = migrate_pfn_to_page(src_pfns[i]); struct folio *folio; if (!page) @@ -391,8 +385,7 @@ static void migrate_vma_unmap(struct migrate_vma *migrate) } if (isolate_lru_page(page)) { - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; - migrate->cpages--; + src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; restore++; continue; } @@ -406,34 +399,54 @@ static void migrate_vma_unmap(struct migrate_vma *migrate) try_to_migrate(folio, 0); if (page_mapped(page) || - !migrate_vma_check_page(page, migrate->fault_page)) { + !migrate_vma_check_page(page, fault_page)) { if (!is_zone_device_page(page)) { get_page(page); putback_lru_page(page); } - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; - migrate->cpages--; + src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; restore++; continue; } + + unmapped++; } for (i = 0; i < npages && restore; i++) { - struct page *page = migrate_pfn_to_page(migrate->src[i]); + struct page *page = migrate_pfn_to_page(src_pfns[i]); struct folio *folio; - if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE)) + if (!page || (src_pfns[i] & MIGRATE_PFN_MIGRATE)) continue; folio = page_folio(page); remove_migration_ptes(folio, folio, false); - migrate->src[i] = 0; + src_pfns[i] = 0; folio_unlock(folio); folio_put(folio); restore--; } + + return unmapped; +} + +/* + * migrate_vma_unmap() - replace page mapping with special migration pte entry + * @migrate: migrate struct containing all migration information + * + * Isolate pages from the LRU and replace mappings (CPU page table pte) with a + * special migration pte entry and check if it has been pinned. Pinned pages are + * restored because we cannot migrate them. + * + * This is the last step before we call the device driver callback to allocate + * destination memory and copy contents of original page over to new page. + */ +static void migrate_vma_unmap(struct migrate_vma *migrate) +{ + migrate->cpages = migrate_device_unmap(migrate->src, migrate->npages, + migrate->fault_page); } /** @@ -680,41 +693,36 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, *src &= ~MIGRATE_PFN_MIGRATE; } -/** - * migrate_vma_pages() - migrate meta-data from src page to dst page - * @migrate: migrate struct containing all migration information - * - * This migrates struct page meta-data from source struct page to destination - * struct page. This effectively finishes the migration from source page to the - * destination page. - */ -void migrate_vma_pages(struct migrate_vma *migrate) +static void migrate_device_pages(unsigned long *src_pfns, + unsigned long *dst_pfns, unsigned long npages, + struct migrate_vma *migrate) { - const unsigned long npages = migrate->npages; - const unsigned long start = migrate->start; struct mmu_notifier_range range; - unsigned long addr, i; + unsigned long i; bool notified = false; - for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) { - struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); - struct page *page = migrate_pfn_to_page(migrate->src[i]); + for (i = 0; i < npages; i++) { + struct page *newpage = migrate_pfn_to_page(dst_pfns[i]); + struct page *page = migrate_pfn_to_page(src_pfns[i]); struct address_space *mapping; int r; if (!newpage) { - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; continue; } if (!page) { + unsigned long addr; + /* * The only time there is no vma is when called from * migrate_device_coherent_page(). However this isn't * called if the page could not be unmapped. */ - VM_BUG_ON(!migrate->vma); - if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) + VM_BUG_ON(!migrate); + addr = migrate->start + i*PAGE_SIZE; + if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE)) continue; if (!notified) { notified = true; @@ -726,7 +734,7 @@ void migrate_vma_pages(struct migrate_vma *migrate) mmu_notifier_invalidate_range_start(&range); } migrate_vma_insert_page(migrate, addr, newpage, - &migrate->src[i]); + &src_pfns[i]); continue; } @@ -739,18 +747,18 @@ void migrate_vma_pages(struct migrate_vma *migrate) * device private or coherent memory. */ if (mapping) { - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; continue; } } else if (is_zone_device_page(newpage)) { /* * Other types of ZONE_DEVICE page are not supported. */ - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; continue; } - if (migrate->fault_page == page) + if (migrate && migrate->fault_page == page) r = migrate_folio_extra(mapping, page_folio(newpage), page_folio(page), MIGRATE_SYNC_NO_COPY, 1); @@ -758,7 +766,7 @@ void migrate_vma_pages(struct migrate_vma *migrate) r = migrate_folio(mapping, page_folio(newpage), page_folio(page), MIGRATE_SYNC_NO_COPY); if (r != MIGRATEPAGE_SUCCESS) - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; } /* @@ -769,28 +777,30 @@ void migrate_vma_pages(struct migrate_vma *migrate) if (notified) mmu_notifier_invalidate_range_only_end(&range); } -EXPORT_SYMBOL(migrate_vma_pages); /** - * migrate_vma_finalize() - restore CPU page table entry + * migrate_vma_pages() - migrate meta-data from src page to dst page * @migrate: migrate struct containing all migration information * - * This replaces the special migration pte entry with either a mapping to the - * new page if migration was successful for that page, or to the original page - * otherwise. - * - * This also unlocks the pages and puts them back on the lru, or drops the extra - * refcount, for device pages. + * This migrates struct page meta-data from source struct page to destination + * struct page. This effectively finishes the migration from source page to the + * destination page. */ -void migrate_vma_finalize(struct migrate_vma *migrate) +void migrate_vma_pages(struct migrate_vma *migrate) +{ + migrate_device_pages(migrate->src, migrate->dst, migrate->npages, migrate); +} +EXPORT_SYMBOL(migrate_vma_pages); + +static void migrate_device_finalize(unsigned long *src_pfns, + unsigned long *dst_pfns, unsigned long npages) { - const unsigned long npages = migrate->npages; unsigned long i; for (i = 0; i < npages; i++) { struct folio *dst, *src; - struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); - struct page *page = migrate_pfn_to_page(migrate->src[i]); + struct page *newpage = migrate_pfn_to_page(dst_pfns[i]); + struct page *page = migrate_pfn_to_page(src_pfns[i]); if (!page) { if (newpage) { @@ -800,7 +810,7 @@ void migrate_vma_finalize(struct migrate_vma *migrate) continue; } - if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) { + if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE) || !newpage) { if (newpage) { unlock_page(newpage); put_page(newpage); @@ -827,6 +837,22 @@ void migrate_vma_finalize(struct migrate_vma *migrate) } } } + +/** + * migrate_vma_finalize() - restore CPU page table entry + * @migrate: migrate struct containing all migration information + * + * This replaces the special migration pte entry with either a mapping to the + * new page if migration was successful for that page, or to the original page + * otherwise. + * + * This also unlocks the pages and puts them back on the lru, or drops the extra + * refcount, for device pages. + */ +void migrate_vma_finalize(struct migrate_vma *migrate) +{ + migrate_device_finalize(migrate->src, migrate->dst, migrate->npages); +} EXPORT_SYMBOL(migrate_vma_finalize); /* @@ -837,25 +863,19 @@ EXPORT_SYMBOL(migrate_vma_finalize); int migrate_device_coherent_page(struct page *page) { unsigned long src_pfn, dst_pfn = 0; - struct migrate_vma args; struct page *dpage; WARN_ON_ONCE(PageCompound(page)); lock_page(page); src_pfn = migrate_pfn(page_to_pfn(page)) | MIGRATE_PFN_MIGRATE; - args.src = &src_pfn; - args.dst = &dst_pfn; - args.cpages = 1; - args.npages = 1; - args.vma = NULL; /* * We don't have a VMA and don't need to walk the page tables to find * the source page. So call migrate_vma_unmap() directly to unmap the * page as migrate_vma_setup() will fail if args.vma == NULL. */ - migrate_vma_unmap(&args); + migrate_device_unmap(&src_pfn, 1, NULL); if (!(src_pfn & MIGRATE_PFN_MIGRATE)) return -EBUSY; @@ -865,10 +885,10 @@ int migrate_device_coherent_page(struct page *page) dst_pfn = migrate_pfn(page_to_pfn(dpage)); } - migrate_vma_pages(&args); + migrate_device_pages(&src_pfn, &dst_pfn, 1, NULL); if (src_pfn & MIGRATE_PFN_MIGRATE) copy_highpage(dpage, page); - migrate_vma_finalize(&args); + migrate_device_finalize(&src_pfn, &dst_pfn, 1); if (src_pfn & MIGRATE_PFN_MIGRATE) return 0; From e778406b40dbb1342a1888cd751ca9d2982a12e2 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Wed, 28 Sep 2022 22:01:19 +1000 Subject: [PATCH 17/29] mm/migrate_device.c: add migrate_device_range() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Device drivers can use the migrate_vma family of functions to migrate existing private anonymous mappings to device private pages. These pages are backed by memory on the device with drivers being responsible for copying data to and from device memory. Device private pages are freed via the pgmap->page_free() callback when they are unmapped and their refcount drops to zero. Alternatively they may be freed indirectly via migration back to CPU memory in response to a pgmap->migrate_to_ram() callback called whenever the CPU accesses an address mapped to a device private page. In other words drivers cannot control the lifetime of data allocated on the devices and must wait until these pages are freed from userspace. This causes issues when memory needs to reclaimed on the device, either because the device is going away due to a ->release() callback or because another user needs to use the memory. Drivers could use the existing migrate_vma functions to migrate data off the device. However this would require them to track the mappings of each page which is both complicated and not always possible. Instead drivers need to be able to migrate device pages directly so they can free up device memory. To allow that this patch introduces the migrate_device family of functions which are functionally similar to migrate_vma but which skips the initial lookup based on mapping. Link: https://lkml.kernel.org/r/868116aab70b0c8ee467d62498bb2cf0ef907295.1664366292.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Cc: "Huang, Ying" Cc: Zi Yan Cc: Matthew Wilcox Cc: Yang Shi Cc: David Hildenbrand Cc: Ralph Campbell Cc: John Hubbard Cc: Alex Deucher Cc: Alex Sierra Cc: Ben Skeggs Cc: Christian König Cc: Dan Williams Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Lyude Paul Cc: Michael Ellerman Signed-off-by: Andrew Morton --- include/linux/migrate.h | 7 ++++ mm/migrate_device.c | 89 +++++++++++++++++++++++++++++++++++++---- 2 files changed, 89 insertions(+), 7 deletions(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 52090d1f92307..3ef77f52a4f04 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -210,6 +210,13 @@ struct migrate_vma { int migrate_vma_setup(struct migrate_vma *args); void migrate_vma_pages(struct migrate_vma *migrate); void migrate_vma_finalize(struct migrate_vma *migrate); +int migrate_device_range(unsigned long *src_pfns, unsigned long start, + unsigned long npages); +void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns, + unsigned long npages); +void migrate_device_finalize(unsigned long *src_pfns, + unsigned long *dst_pfns, unsigned long npages); + #endif /* CONFIG_MIGRATION */ #endif /* _LINUX_MIGRATE_H */ diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 7707c1d898f55..6fa682eef7a00 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -693,7 +693,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, *src &= ~MIGRATE_PFN_MIGRATE; } -static void migrate_device_pages(unsigned long *src_pfns, +static void __migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns, unsigned long npages, struct migrate_vma *migrate) { @@ -715,6 +715,9 @@ static void migrate_device_pages(unsigned long *src_pfns, if (!page) { unsigned long addr; + if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE)) + continue; + /* * The only time there is no vma is when called from * migrate_device_coherent_page(). However this isn't @@ -722,8 +725,6 @@ static void migrate_device_pages(unsigned long *src_pfns, */ VM_BUG_ON(!migrate); addr = migrate->start + i*PAGE_SIZE; - if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE)) - continue; if (!notified) { notified = true; @@ -778,6 +779,22 @@ static void migrate_device_pages(unsigned long *src_pfns, mmu_notifier_invalidate_range_only_end(&range); } +/** + * migrate_device_pages() - migrate meta-data from src page to dst page + * @src_pfns: src_pfns returned from migrate_device_range() + * @dst_pfns: array of pfns allocated by the driver to migrate memory to + * @npages: number of pages in the range + * + * Equivalent to migrate_vma_pages(). This is called to migrate struct page + * meta-data from source struct page to destination. + */ +void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns, + unsigned long npages) +{ + __migrate_device_pages(src_pfns, dst_pfns, npages, NULL); +} +EXPORT_SYMBOL(migrate_device_pages); + /** * migrate_vma_pages() - migrate meta-data from src page to dst page * @migrate: migrate struct containing all migration information @@ -788,12 +805,22 @@ static void migrate_device_pages(unsigned long *src_pfns, */ void migrate_vma_pages(struct migrate_vma *migrate) { - migrate_device_pages(migrate->src, migrate->dst, migrate->npages, migrate); + __migrate_device_pages(migrate->src, migrate->dst, migrate->npages, migrate); } EXPORT_SYMBOL(migrate_vma_pages); -static void migrate_device_finalize(unsigned long *src_pfns, - unsigned long *dst_pfns, unsigned long npages) +/* + * migrate_device_finalize() - complete page migration + * @src_pfns: src_pfns returned from migrate_device_range() + * @dst_pfns: array of pfns allocated by the driver to migrate memory to + * @npages: number of pages in the range + * + * Completes migration of the page by removing special migration entries. + * Drivers must ensure copying of page data is complete and visible to the CPU + * before calling this. + */ +void migrate_device_finalize(unsigned long *src_pfns, + unsigned long *dst_pfns, unsigned long npages) { unsigned long i; @@ -837,6 +864,7 @@ static void migrate_device_finalize(unsigned long *src_pfns, } } } +EXPORT_SYMBOL(migrate_device_finalize); /** * migrate_vma_finalize() - restore CPU page table entry @@ -855,6 +883,53 @@ void migrate_vma_finalize(struct migrate_vma *migrate) } EXPORT_SYMBOL(migrate_vma_finalize); +/** + * migrate_device_range() - migrate device private pfns to normal memory. + * @src_pfns: array large enough to hold migrating source device private pfns. + * @start: starting pfn in the range to migrate. + * @npages: number of pages to migrate. + * + * migrate_vma_setup() is similar in concept to migrate_vma_setup() except that + * instead of looking up pages based on virtual address mappings a range of + * device pfns that should be migrated to system memory is used instead. + * + * This is useful when a driver needs to free device memory but doesn't know the + * virtual mappings of every page that may be in device memory. For example this + * is often the case when a driver is being unloaded or unbound from a device. + * + * Like migrate_vma_setup() this function will take a reference and lock any + * migrating pages that aren't free before unmapping them. Drivers may then + * allocate destination pages and start copying data from the device to CPU + * memory before calling migrate_device_pages(). + */ +int migrate_device_range(unsigned long *src_pfns, unsigned long start, + unsigned long npages) +{ + unsigned long i, pfn; + + for (pfn = start, i = 0; i < npages; pfn++, i++) { + struct page *page = pfn_to_page(pfn); + + if (!get_page_unless_zero(page)) { + src_pfns[i] = 0; + continue; + } + + if (!trylock_page(page)) { + src_pfns[i] = 0; + put_page(page); + continue; + } + + src_pfns[i] = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; + } + + migrate_device_unmap(src_pfns, npages, NULL); + + return 0; +} +EXPORT_SYMBOL(migrate_device_range); + /* * Migrate a device coherent page back to normal memory. The caller should have * a reference on page which will be copied to the new page if migration is @@ -885,7 +960,7 @@ int migrate_device_coherent_page(struct page *page) dst_pfn = migrate_pfn(page_to_pfn(dpage)); } - migrate_device_pages(&src_pfn, &dst_pfn, 1, NULL); + migrate_device_pages(&src_pfn, &dst_pfn, 1); if (src_pfn & MIGRATE_PFN_MIGRATE) copy_highpage(dpage, page); migrate_device_finalize(&src_pfn, &dst_pfn, 1); From d9b719394a1147614351961ac454589111c76e76 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Wed, 28 Sep 2022 22:01:20 +1000 Subject: [PATCH 18/29] nouveau/dmem: refactor nouveau_dmem_fault_copy_one() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit nouveau_dmem_fault_copy_one() is used during handling of CPU faults via the migrate_to_ram() callback and is used to copy data from GPU to CPU memory. It is currently specific to fault handling, however a future patch implementing eviction of data during teardown needs similar functionality. Refactor out the core functionality so that it is not specific to fault handling. Link: https://lkml.kernel.org/r/20573d7b4e641a78fde9935f948e64e71c9e709e.1664366292.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Reviewed-by: Lyude Paul Cc: Ben Skeggs Cc: Ralph Campbell Cc: John Hubbard Cc: Alex Deucher Cc: Alex Sierra Cc: Christian König Cc: Dan Williams Cc: David Hildenbrand Cc: Felix Kuehling Cc: "Huang, Ying" Cc: Jason Gunthorpe Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- drivers/gpu/drm/nouveau/nouveau_dmem.c | 58 +++++++++++++------------- 1 file changed, 28 insertions(+), 30 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index b092988266a6a..65f51fb6a70ce 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -139,44 +139,24 @@ static void nouveau_dmem_fence_done(struct nouveau_fence **fence) } } -static vm_fault_t nouveau_dmem_fault_copy_one(struct nouveau_drm *drm, - struct vm_fault *vmf, struct migrate_vma *args, - dma_addr_t *dma_addr) +static int nouveau_dmem_copy_one(struct nouveau_drm *drm, struct page *spage, + struct page *dpage, dma_addr_t *dma_addr) { struct device *dev = drm->dev->dev; - struct page *dpage, *spage; - struct nouveau_svmm *svmm; - - spage = migrate_pfn_to_page(args->src[0]); - if (!spage || !(args->src[0] & MIGRATE_PFN_MIGRATE)) - return 0; - dpage = alloc_page_vma(GFP_HIGHUSER, vmf->vma, vmf->address); - if (!dpage) - return VM_FAULT_SIGBUS; lock_page(dpage); *dma_addr = dma_map_page(dev, dpage, 0, PAGE_SIZE, DMA_BIDIRECTIONAL); if (dma_mapping_error(dev, *dma_addr)) - goto error_free_page; + return -EIO; - svmm = spage->zone_device_data; - mutex_lock(&svmm->mutex); - nouveau_svmm_invalidate(svmm, args->start, args->end); if (drm->dmem->migrate.copy_func(drm, 1, NOUVEAU_APER_HOST, *dma_addr, - NOUVEAU_APER_VRAM, nouveau_dmem_page_addr(spage))) - goto error_dma_unmap; - mutex_unlock(&svmm->mutex); + NOUVEAU_APER_VRAM, nouveau_dmem_page_addr(spage))) { + dma_unmap_page(dev, *dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL); + return -EIO; + } - args->dst[0] = migrate_pfn(page_to_pfn(dpage)); return 0; - -error_dma_unmap: - mutex_unlock(&svmm->mutex); - dma_unmap_page(dev, *dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL); -error_free_page: - __free_page(dpage); - return VM_FAULT_SIGBUS; } static vm_fault_t nouveau_dmem_migrate_to_ram(struct vm_fault *vmf) @@ -184,9 +164,11 @@ static vm_fault_t nouveau_dmem_migrate_to_ram(struct vm_fault *vmf) struct nouveau_drm *drm = page_to_drm(vmf->page); struct nouveau_dmem *dmem = drm->dmem; struct nouveau_fence *fence; + struct nouveau_svmm *svmm; + struct page *spage, *dpage; unsigned long src = 0, dst = 0; dma_addr_t dma_addr = 0; - vm_fault_t ret; + vm_fault_t ret = 0; struct migrate_vma args = { .vma = vmf->vma, .start = vmf->address, @@ -207,9 +189,25 @@ static vm_fault_t nouveau_dmem_migrate_to_ram(struct vm_fault *vmf) if (!args.cpages) return 0; - ret = nouveau_dmem_fault_copy_one(drm, vmf, &args, &dma_addr); - if (ret || dst == 0) + spage = migrate_pfn_to_page(src); + if (!spage || !(src & MIGRATE_PFN_MIGRATE)) + goto done; + + dpage = alloc_page_vma(GFP_HIGHUSER, vmf->vma, vmf->address); + if (!dpage) + goto done; + + dst = migrate_pfn(page_to_pfn(dpage)); + + svmm = spage->zone_device_data; + mutex_lock(&svmm->mutex); + nouveau_svmm_invalidate(svmm, args.start, args.end); + ret = nouveau_dmem_copy_one(drm, spage, dpage, &dma_addr); + mutex_unlock(&svmm->mutex); + if (ret) { + ret = VM_FAULT_SIGBUS; goto done; + } nouveau_fence_new(dmem->migrate.chan, false, &fence); migrate_vma_pages(&args); From 249881232e1471d28b68f9a3829acc14d150cf5d Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Wed, 28 Sep 2022 22:01:21 +1000 Subject: [PATCH 19/29] nouveau/dmem: evict device private memory during release MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the module is unloaded or a GPU is unbound from the module it is possible for device private pages to still be mapped in currently running processes. This can lead to a hangs and RCU stall warnings when unbinding the device as memunmap_pages() will wait in an uninterruptible state until all device pages have been freed which may never happen. Fix this by migrating device mappings back to normal CPU memory prior to freeing the GPU memory chunks and associated device private pages. Link: https://lkml.kernel.org/r/66277601fb8fda9af408b33da9887192bf895bda.1664366292.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Cc: Lyude Paul Cc: Ben Skeggs Cc: Ralph Campbell Cc: John Hubbard Cc: Alex Deucher Cc: Alex Sierra Cc: Christian König Cc: Dan Williams Cc: David Hildenbrand Cc: Felix Kuehling Cc: "Huang, Ying" Cc: Jason Gunthorpe Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- drivers/gpu/drm/nouveau/nouveau_dmem.c | 48 ++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index 65f51fb6a70ce..5fe209107246f 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -367,6 +367,52 @@ nouveau_dmem_suspend(struct nouveau_drm *drm) mutex_unlock(&drm->dmem->mutex); } +/* + * Evict all pages mapping a chunk. + */ +static void +nouveau_dmem_evict_chunk(struct nouveau_dmem_chunk *chunk) +{ + unsigned long i, npages = range_len(&chunk->pagemap.range) >> PAGE_SHIFT; + unsigned long *src_pfns, *dst_pfns; + dma_addr_t *dma_addrs; + struct nouveau_fence *fence; + + src_pfns = kcalloc(npages, sizeof(*src_pfns), GFP_KERNEL); + dst_pfns = kcalloc(npages, sizeof(*dst_pfns), GFP_KERNEL); + dma_addrs = kcalloc(npages, sizeof(*dma_addrs), GFP_KERNEL); + + migrate_device_range(src_pfns, chunk->pagemap.range.start >> PAGE_SHIFT, + npages); + + for (i = 0; i < npages; i++) { + if (src_pfns[i] & MIGRATE_PFN_MIGRATE) { + struct page *dpage; + + /* + * _GFP_NOFAIL because the GPU is going away and there + * is nothing sensible we can do if we can't copy the + * data back. + */ + dpage = alloc_page(GFP_HIGHUSER | __GFP_NOFAIL); + dst_pfns[i] = migrate_pfn(page_to_pfn(dpage)); + nouveau_dmem_copy_one(chunk->drm, + migrate_pfn_to_page(src_pfns[i]), dpage, + &dma_addrs[i]); + } + } + + nouveau_fence_new(chunk->drm->dmem->migrate.chan, false, &fence); + migrate_device_pages(src_pfns, dst_pfns, npages); + nouveau_dmem_fence_done(&fence); + migrate_device_finalize(src_pfns, dst_pfns, npages); + kfree(src_pfns); + kfree(dst_pfns); + for (i = 0; i < npages; i++) + dma_unmap_page(chunk->drm->dev->dev, dma_addrs[i], PAGE_SIZE, DMA_BIDIRECTIONAL); + kfree(dma_addrs); +} + void nouveau_dmem_fini(struct nouveau_drm *drm) { @@ -378,8 +424,10 @@ nouveau_dmem_fini(struct nouveau_drm *drm) mutex_lock(&drm->dmem->mutex); list_for_each_entry_safe(chunk, tmp, &drm->dmem->chunks, list) { + nouveau_dmem_evict_chunk(chunk); nouveau_bo_unpin(chunk->bo); nouveau_bo_ref(NULL, &chunk->bo); + WARN_ON(chunk->callocated); list_del(&chunk->list); memunmap_pages(&chunk->pagemap); release_mem_region(chunk->pagemap.range.start, From ad4c365221b0f92f9c24a203119f2bade30c970e Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Wed, 28 Sep 2022 22:01:22 +1000 Subject: [PATCH 20/29] hmm-tests: add test for migrate_device_range() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Link: https://lkml.kernel.org/r/a73cf109de0224cfd118d22be58ddebac3ae2897.1664366292.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Cc: Jason Gunthorpe Cc: Ralph Campbell Cc: John Hubbard Cc: Alex Sierra Cc: Felix Kuehling Cc: Alex Deucher Cc: Ben Skeggs Cc: Christian König Cc: Dan Williams Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Lyude Paul Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- lib/test_hmm.c | 120 ++++++++++++++++++++----- lib/test_hmm_uapi.h | 1 + tools/testing/selftests/vm/hmm-tests.c | 49 ++++++++++ 3 files changed, 149 insertions(+), 21 deletions(-) diff --git a/lib/test_hmm.c b/lib/test_hmm.c index bc2b949911653..67e6f83fe0f82 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -100,6 +100,7 @@ struct dmirror { struct dmirror_chunk { struct dev_pagemap pagemap; struct dmirror_device *mdevice; + bool remove; }; /* @@ -192,11 +193,15 @@ static int dmirror_fops_release(struct inode *inode, struct file *filp) return 0; } +static struct dmirror_chunk *dmirror_page_to_chunk(struct page *page) +{ + return container_of(page->pgmap, struct dmirror_chunk, pagemap); +} + static struct dmirror_device *dmirror_page_to_device(struct page *page) { - return container_of(page->pgmap, struct dmirror_chunk, - pagemap)->mdevice; + return dmirror_page_to_chunk(page)->mdevice; } static int dmirror_do_fault(struct dmirror *dmirror, struct hmm_range *range) @@ -1218,6 +1223,85 @@ static int dmirror_snapshot(struct dmirror *dmirror, return ret; } +static void dmirror_device_evict_chunk(struct dmirror_chunk *chunk) +{ + unsigned long start_pfn = chunk->pagemap.range.start >> PAGE_SHIFT; + unsigned long end_pfn = chunk->pagemap.range.end >> PAGE_SHIFT; + unsigned long npages = end_pfn - start_pfn + 1; + unsigned long i; + unsigned long *src_pfns; + unsigned long *dst_pfns; + + src_pfns = kcalloc(npages, sizeof(*src_pfns), GFP_KERNEL); + dst_pfns = kcalloc(npages, sizeof(*dst_pfns), GFP_KERNEL); + + migrate_device_range(src_pfns, start_pfn, npages); + for (i = 0; i < npages; i++) { + struct page *dpage, *spage; + + spage = migrate_pfn_to_page(src_pfns[i]); + if (!spage || !(src_pfns[i] & MIGRATE_PFN_MIGRATE)) + continue; + + if (WARN_ON(!is_device_private_page(spage) && + !is_device_coherent_page(spage))) + continue; + spage = BACKING_PAGE(spage); + dpage = alloc_page(GFP_HIGHUSER_MOVABLE | __GFP_NOFAIL); + lock_page(dpage); + copy_highpage(dpage, spage); + dst_pfns[i] = migrate_pfn(page_to_pfn(dpage)); + if (src_pfns[i] & MIGRATE_PFN_WRITE) + dst_pfns[i] |= MIGRATE_PFN_WRITE; + } + migrate_device_pages(src_pfns, dst_pfns, npages); + migrate_device_finalize(src_pfns, dst_pfns, npages); + kfree(src_pfns); + kfree(dst_pfns); +} + +/* Removes free pages from the free list so they can't be re-allocated */ +static void dmirror_remove_free_pages(struct dmirror_chunk *devmem) +{ + struct dmirror_device *mdevice = devmem->mdevice; + struct page *page; + + for (page = mdevice->free_pages; page; page = page->zone_device_data) + if (dmirror_page_to_chunk(page) == devmem) + mdevice->free_pages = page->zone_device_data; +} + +static void dmirror_device_remove_chunks(struct dmirror_device *mdevice) +{ + unsigned int i; + + mutex_lock(&mdevice->devmem_lock); + if (mdevice->devmem_chunks) { + for (i = 0; i < mdevice->devmem_count; i++) { + struct dmirror_chunk *devmem = + mdevice->devmem_chunks[i]; + + spin_lock(&mdevice->lock); + devmem->remove = true; + dmirror_remove_free_pages(devmem); + spin_unlock(&mdevice->lock); + + dmirror_device_evict_chunk(devmem); + memunmap_pages(&devmem->pagemap); + if (devmem->pagemap.type == MEMORY_DEVICE_PRIVATE) + release_mem_region(devmem->pagemap.range.start, + range_len(&devmem->pagemap.range)); + kfree(devmem); + } + mdevice->devmem_count = 0; + mdevice->devmem_capacity = 0; + mdevice->free_pages = NULL; + kfree(mdevice->devmem_chunks); + mdevice->devmem_chunks = NULL; + } + mutex_unlock(&mdevice->devmem_lock); +} + static long dmirror_fops_unlocked_ioctl(struct file *filp, unsigned int command, unsigned long arg) @@ -1272,6 +1356,11 @@ static long dmirror_fops_unlocked_ioctl(struct file *filp, ret = dmirror_snapshot(dmirror, &cmd); break; + case HMM_DMIRROR_RELEASE: + dmirror_device_remove_chunks(dmirror->mdevice); + ret = 0; + break; + default: return -EINVAL; } @@ -1326,9 +1415,13 @@ static void dmirror_devmem_free(struct page *page) mdevice = dmirror_page_to_device(page); spin_lock(&mdevice->lock); - mdevice->cfree++; - page->zone_device_data = mdevice->free_pages; - mdevice->free_pages = page; + + /* Return page to our allocator if not freeing the chunk */ + if (!dmirror_page_to_chunk(page)->remove) { + mdevice->cfree++; + page->zone_device_data = mdevice->free_pages; + mdevice->free_pages = page; + } spin_unlock(&mdevice->lock); } @@ -1408,22 +1501,7 @@ static int dmirror_device_init(struct dmirror_device *mdevice, int id) static void dmirror_device_remove(struct dmirror_device *mdevice) { - unsigned int i; - - if (mdevice->devmem_chunks) { - for (i = 0; i < mdevice->devmem_count; i++) { - struct dmirror_chunk *devmem = - mdevice->devmem_chunks[i]; - - memunmap_pages(&devmem->pagemap); - if (devmem->pagemap.type == MEMORY_DEVICE_PRIVATE) - release_mem_region(devmem->pagemap.range.start, - range_len(&devmem->pagemap.range)); - kfree(devmem); - } - kfree(mdevice->devmem_chunks); - } - + dmirror_device_remove_chunks(mdevice); cdev_device_del(&mdevice->cdevice, &mdevice->device); } diff --git a/lib/test_hmm_uapi.h b/lib/test_hmm_uapi.h index e31d58c9034a7..8c818a2cf4f69 100644 --- a/lib/test_hmm_uapi.h +++ b/lib/test_hmm_uapi.h @@ -36,6 +36,7 @@ struct hmm_dmirror_cmd { #define HMM_DMIRROR_SNAPSHOT _IOWR('H', 0x04, struct hmm_dmirror_cmd) #define HMM_DMIRROR_EXCLUSIVE _IOWR('H', 0x05, struct hmm_dmirror_cmd) #define HMM_DMIRROR_CHECK_EXCLUSIVE _IOWR('H', 0x06, struct hmm_dmirror_cmd) +#define HMM_DMIRROR_RELEASE _IOWR('H', 0x07, struct hmm_dmirror_cmd) /* * Values returned in hmm_dmirror_cmd.ptr for HMM_DMIRROR_SNAPSHOT. diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/vm/hmm-tests.c index f2c2c970eeb27..28232adec883b 100644 --- a/tools/testing/selftests/vm/hmm-tests.c +++ b/tools/testing/selftests/vm/hmm-tests.c @@ -1054,6 +1054,55 @@ TEST_F(hmm, migrate_fault) hmm_buffer_free(buffer); } +TEST_F(hmm, migrate_release) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Release device memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_RELEASE, buffer, npages); + ASSERT_EQ(ret, 0); + + /* Fault pages back to system memory and check them. */ + for (i = 0, ptr = buffer->ptr; i < size / (2 * sizeof(*ptr)); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + /* * Migrate anonymous shared memory to device private memory. */ From d6e5040bd8e53371fafd7e0c7c63b090b3a675db Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 26 Sep 2022 20:08:47 +0200 Subject: [PATCH 21/29] kasan: fix array-bounds warnings in tests GCC's -Warray-bounds option detects out-of-bounds accesses to statically-sized allocations in krealloc out-of-bounds tests. Use OPTIMIZER_HIDE_VAR to suppress the warning. Also change kmalloc_memmove_invalid_size to use OPTIMIZER_HIDE_VAR instead of a volatile variable. Link: https://lkml.kernel.org/r/e94399242d32e00bba6fd0d9ec4c897f188128e8.1664215688.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reported-by: kernel test robot Reviewed-by: Kees Cook Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Signed-off-by: Andrew Morton --- mm/kasan/kasan_test.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c index f25692def7813..57e4c72aa8bd2 100644 --- a/mm/kasan/kasan_test.c +++ b/mm/kasan/kasan_test.c @@ -295,6 +295,9 @@ static void krealloc_more_oob_helper(struct kunit *test, ptr2 = krealloc(ptr1, size2, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2); + /* Suppress -Warray-bounds warnings. */ + OPTIMIZER_HIDE_VAR(ptr2); + /* All offsets up to size2 must be accessible. */ ptr2[size1 - 1] = 'x'; ptr2[size1] = 'x'; @@ -327,6 +330,9 @@ static void krealloc_less_oob_helper(struct kunit *test, ptr2 = krealloc(ptr1, size2, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2); + /* Suppress -Warray-bounds warnings. */ + OPTIMIZER_HIDE_VAR(ptr2); + /* Must be accessible for all modes. */ ptr2[size2 - 1] = 'x'; @@ -540,13 +546,14 @@ static void kmalloc_memmove_invalid_size(struct kunit *test) { char *ptr; size_t size = 64; - volatile size_t invalid_size = size; + size_t invalid_size = size; ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); memset((char *)ptr, 0, 64); OPTIMIZER_HIDE_VAR(ptr); + OPTIMIZER_HIDE_VAR(invalid_size); KUNIT_EXPECT_KASAN_FAIL(test, memmove((char *)ptr, (char *)ptr + 4, invalid_size)); kfree(ptr); From bce8cb3c04dc01d21b6b17baf1cb6c277e7e6848 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 29 Sep 2022 19:23:17 +0800 Subject: [PATCH 22/29] mm: use update_mmu_tlb() on the second thread As message in commit 7df676974359 ("mm/memory.c: Update local TLB if PTE entry exists") said, we should update local TLB only on the second thread. So in the do_anonymous_page() here, we should use update_mmu_tlb() instead of update_mmu_cache() on the second thread. As David pointed out, this is a performance improvement, not a correctness fix. Link: https://lkml.kernel.org/r/20220929112318.32393-2-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Acked-by: David Hildenbrand Cc: Bibo Mao Cc: Chris Zankel Cc: Huacai Chen Cc: Max Filippov Signed-off-by: Andrew Morton --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index 4ad6077164cd2..f88c351aecd41 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4134,7 +4134,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (!pte_none(*vmf->pte)) { - update_mmu_cache(vma, vmf->address, vmf->pte); + update_mmu_tlb(vma, vmf->address, vmf->pte); goto release; } From 14c2ac36811b82479b1138383b2c9ff1ab6ba47d Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 29 Sep 2022 19:23:18 +0800 Subject: [PATCH 23/29] LoongArch: update local TLB if PTE entry exists Currently, the implementation of update_mmu_tlb() is empty if __HAVE_ARCH_UPDATE_MMU_TLB is not defined. Then if two threads concurrently fault at the same page, the second thread that did not win the race will give up and do nothing. In the LoongArch architecture, this second thread will trigger another fault, and only updates its local TLB. Instead of triggering another fault, it's better to implement update_mmu_tlb() to directly update the local TLB of the second thread. Just do it. Link: https://lkml.kernel.org/r/20220929112318.32393-3-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Suggested-by: Bibo Mao Acked-by: Huacai Chen Cc: Chris Zankel Cc: David Hildenbrand Cc: Max Filippov Cc: Muchun Song Signed-off-by: Andrew Morton --- arch/loongarch/include/asm/pgtable.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h index 8ea57e2f0e04c..946704bee599e 100644 --- a/arch/loongarch/include/asm/pgtable.h +++ b/arch/loongarch/include/asm/pgtable.h @@ -412,6 +412,9 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, __update_tlb(vma, address, ptep); } +#define __HAVE_ARCH_UPDATE_MMU_TLB +#define update_mmu_tlb update_mmu_cache + static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { From 94541bc3fbde45bbd40e7989995246b22732679a Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Mon, 3 Oct 2022 10:48:32 -0400 Subject: [PATCH 24/29] zram: always expose rw_page Currently zram will adjust its fops to a version which does not contain rw_page when a backing device has been assigned. This is done to prevent upper layers from assuming a synchronous operation when a page may have been written back. This forces every operation through bio which has overhead associated with bio_alloc/frees. The code can be simplified to always expose an rw_page method and only in the rare event that a page is written back we instead will return -EOPNOTSUPP forcing the upper layer to fallback to bio. Link: https://lkml.kernel.org/r/20221003144832.2906610-1-bgeffon@google.com Signed-off-by: Brian Geffon Reviewed-by: Sergey Senozhatsky Cc: Minchan Kim Cc: Nitin Gupta Cc: Rom Lemarchand Cc: Suleiman Souhlal Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 26 +++----------------------- 1 file changed, 3 insertions(+), 23 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 43eeef2b9fbe2..2ba5c98319e52 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -52,9 +52,6 @@ static unsigned int num_devices = 1; static size_t huge_class_size; static const struct block_device_operations zram_devops; -#ifdef CONFIG_ZRAM_WRITEBACK -static const struct block_device_operations zram_wb_devops; -#endif static void zram_free_page(struct zram *zram, size_t index); static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, @@ -546,17 +543,6 @@ static ssize_t backing_dev_store(struct device *dev, zram->backing_dev = backing_dev; zram->bitmap = bitmap; zram->nr_pages = nr_pages; - /* - * With writeback feature, zram does asynchronous IO so it's no longer - * synchronous device so let's remove synchronous io flag. Othewise, - * upper layer(e.g., swap) could wait IO completion rather than - * (submit and return), which will cause system sluggish. - * Furthermore, when the IO function returns(e.g., swap_readpage), - * upper layer expects IO was done so it could deallocate the page - * freely but in fact, IO is going on so finally could cause - * use-after-free when the IO is really done. - */ - zram->disk->fops = &zram_wb_devops; up_write(&zram->init_lock); pr_info("setup backing device %s\n", file_name); @@ -1270,6 +1256,9 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index, struct bio_vec bvec; zram_slot_unlock(zram, index); + /* A null bio means rw_page was used, we must fallback to bio */ + if (!bio) + return -EOPNOTSUPP; bvec.bv_page = page; bvec.bv_len = PAGE_SIZE; @@ -1856,15 +1845,6 @@ static const struct block_device_operations zram_devops = { .owner = THIS_MODULE }; -#ifdef CONFIG_ZRAM_WRITEBACK -static const struct block_device_operations zram_wb_devops = { - .open = zram_open, - .submit_bio = zram_submit_bio, - .swap_slot_free_notify = zram_slot_free_notify, - .owner = THIS_MODULE -}; -#endif - static DEVICE_ATTR_WO(compact); static DEVICE_ATTR_RW(disksize); static DEVICE_ATTR_RO(initstate); From 2ea7ff1e39cbe3753d3c649beb70f2cf861dca75 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 4 Oct 2022 15:33:58 -0400 Subject: [PATCH 25/29] mm/hugetlb: fix race condition of uffd missing/minor handling Patch series "mm/hugetlb: Fix selftest failures with write check", v3. Currently akpm mm-unstable fails with uffd hugetlb private mapping test randomly on a write check. The initial bisection of that points to the recent pmd unshare series, but it turns out there's no direction relationship with the series but only some timing change caused the race to start trigger. The race should be fixed in patch 1. Patch 2 is a trivial cleanup on the similar race with hugetlb migrations, patch 3 comment on the write check so when anyone read it again it'll be clear why it's there. This patch (of 3): After the recent rework patchset of hugetlb locking on pmd sharing, kselftest for userfaultfd sometimes fails on hugetlb private tests with unexpected write fault checks. It turns out there's nothing wrong within the locking series regarding this matter, but it could have changed the timing of threads so it can trigger an old bug. The real bug is when we call hugetlb_no_page() we're not with the pgtable lock. It means we're reading the pte values lockless. It's perfectly fine in most cases because before we do normal page allocations we'll take the lock and check pte_same() again. However before that, there are actually two paths on userfaultfd missing/minor handling that may directly move on with the fault process without checking the pte values. It means for these two paths we may be generating an uffd message based on an unstable pte, while an unstable pte can legally be anything as long as the modifier holds the pgtable lock. One example, which is also what happened in the failing kselftest and caused the test failure, is that for private mappings wr-protection changes can happen on one page. While hugetlb_change_protection() generally requires pte being cleared before being changed, then there can be a race condition like: thread 1 thread 2 -------- -------- UFFDIO_WRITEPROTECT hugetlb_fault hugetlb_change_protection pgtable_lock() huge_ptep_modify_prot_start pte==NULL hugetlb_no_page generate uffd missing event even if page existed!! huge_ptep_modify_prot_commit pgtable_unlock() Fix this by rechecking the pte after pgtable lock for both userfaultfd missing & minor fault paths. This bug should have been around starting from uffd hugetlb introduced, so attaching a Fixes to the commit. Also attach another Fixes to the minor support commit for easier tracking. Note that userfaultfd is actually fine with false positives (e.g. caused by pte changed), but not wrong logical events (e.g. caused by reading a pte during changing). The latter can confuse the userspace, so the strictness is very much preferred. E.g., MISSING event should never happen on the page after UFFDIO_COPY has correctly installed the page and returned. Link: https://lkml.kernel.org/r/20221004193400.110155-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20221004193400.110155-2-peterx@redhat.com Fixes: 1a1aad8a9b7b ("userfaultfd: hugetlbfs: add userfaultfd hugetlb hook") Fixes: 7677f7fd8be7 ("userfaultfd: add minor fault registration mode") Signed-off-by: Peter Xu Co-developed-by: Mike Kravetz Reviewed-by: Mike Kravetz Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Nadav Amit Cc: David Hildenbrand Cc: Mike Rapoport Signed-off-by: Andrew Morton --- mm/hugetlb.c | 59 +++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 52 insertions(+), 7 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9a910612336da..bf9d8d04bf4f8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5535,6 +5535,23 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, return handle_userfault(&vmf, reason); } +/* + * Recheck pte with pgtable lock. Returns true if pte didn't change, or + * false if pte changed or is changing. + */ +static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm, + pte_t *ptep, pte_t old_pte) +{ + spinlock_t *ptl; + bool same; + + ptl = huge_pte_lock(h, mm, ptep); + same = pte_same(huge_ptep_get(ptep), old_pte); + spin_unlock(ptl); + + return same; +} + static vm_fault_t hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, struct address_space *mapping, pgoff_t idx, @@ -5575,10 +5592,33 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, if (idx >= size) goto out; /* Check for page in userfault range */ - if (userfaultfd_missing(vma)) - return hugetlb_handle_userfault(vma, mapping, idx, - flags, haddr, address, - VM_UFFD_MISSING); + if (userfaultfd_missing(vma)) { + /* + * Since hugetlb_no_page() was examining pte + * without pgtable lock, we need to re-test under + * lock because the pte may not be stable and could + * have changed from under us. Try to detect + * either changed or during-changing ptes and retry + * properly when needed. + * + * Note that userfaultfd is actually fine with + * false positives (e.g. caused by pte changed), + * but not wrong logical events (e.g. caused by + * reading a pte during changing). The latter can + * confuse the userspace, so the strictness is very + * much preferred. E.g., MISSING event should + * never happen on the page after UFFDIO_COPY has + * correctly installed the page and returned. + */ + if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) { + ret = 0; + goto out; + } + + return hugetlb_handle_userfault(vma, mapping, idx, flags, + haddr, address, + VM_UFFD_MISSING); + } page = alloc_huge_page(vma, haddr, 0); if (IS_ERR(page)) { @@ -5644,9 +5684,14 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, if (userfaultfd_minor(vma)) { unlock_page(page); put_page(page); - return hugetlb_handle_userfault(vma, mapping, idx, - flags, haddr, address, - VM_UFFD_MINOR); + /* See comment in userfaultfd_missing() block above */ + if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) { + ret = 0; + goto out; + } + return hugetlb_handle_userfault(vma, mapping, idx, flags, + haddr, address, + VM_UFFD_MINOR); } } From f9bf6c03eca1077cae8de0e6d86427656fa42a9b Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 4 Oct 2022 15:33:59 -0400 Subject: [PATCH 26/29] mm/hugetlb: use hugetlb_pte_stable in migration race check After hugetlb_pte_stable() introduced, we can also rewrite the migration race condition against page allocation to use the new helper too. Link: https://lkml.kernel.org/r/20221004193400.110155-3-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Kravetz Reviewed-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Mike Rapoport Cc: Nadav Amit Signed-off-by: Andrew Morton --- mm/hugetlb.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bf9d8d04bf4f8..9b26055f31197 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5634,11 +5634,10 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * here. Before returning error, get ptl and make * sure there really is no pte entry. */ - ptl = huge_pte_lock(h, mm, ptep); - ret = 0; - if (huge_pte_none(huge_ptep_get(ptep))) + if (hugetlb_pte_stable(h, mm, ptep, old_pte)) ret = vmf_error(PTR_ERR(page)); - spin_unlock(ptl); + else + ret = 0; goto out; } clear_huge_page(page, address, pages_per_huge_page(h)); From 26c92d37d3dc484157bdb4eb7d29991c017b168b Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 4 Oct 2022 15:34:00 -0400 Subject: [PATCH 27/29] mm/selftest: uffd: explain the write missing fault check It's not obvious why we had a write check for each of the missing messages, especially when it should be a locking op. Add a rich comment for that, and also try to explain its good side and limitations, so that if someone hit it again for either a bug or a different glibc impl there'll be some clue to start with. Link: https://lkml.kernel.org/r/20221004193400.110155-4-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Kravetz Reviewed-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Mike Rapoport Cc: Nadav Amit Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/userfaultfd.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 74babdbc02e56..297f250c1d956 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -774,7 +774,27 @@ static void uffd_handle_page_fault(struct uffd_msg *msg, continue_range(uffd, msg->arg.pagefault.address, page_size); stats->minor_faults++; } else { - /* Missing page faults */ + /* + * Missing page faults. + * + * Here we force a write check for each of the missing mode + * faults. It's guaranteed because the only threads that + * will trigger uffd faults are the locking threads, and + * their first instruction to touch the missing page will + * always be pthread_mutex_lock(). + * + * Note that here we relied on an NPTL glibc impl detail to + * always read the lock type at the entry of the lock op + * (pthread_mutex_t.__data.__type, offset 0x10) before + * doing any locking operations to guarantee that. It's + * actually not good to rely on this impl detail because + * logically a pthread-compatible lib can implement the + * locks without types and we can fail when linking with + * them. However since we used to find bugs with this + * strict check we still keep it around. Hopefully this + * could be a good hint when it fails again. If one day + * it'll break on some other impl of glibc we'll revisit. + */ if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) err("unexpected write fault"); From 15cd90049d595e592d8860ee15a3f23491d54d17 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Thu, 6 Oct 2022 10:15:40 +0000 Subject: [PATCH 28/29] mm/page_alloc: fix incorrect PGFREE and PGALLOC for high-order page PGFREE and PGALLOC represent the number of freed and allocated pages. So the page order must be considered. Link: https://lkml.kernel.org/r/20221006101540.40686-1-laoar.shao@gmail.com Fixes: 44042b449872 ("mm/page_alloc: allow high-order pages to be stored on the per-cpu lists") Signed-off-by: Yafang Shao Acked-by: Mel Gorman Reviewed-by: Miaohe Lin Signed-off-by: Andrew Morton --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 059f6946832fa..8e9b7f08a32ce 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3446,7 +3446,7 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, int pindex; bool free_high; - __count_vm_event(PGFREE); + __count_vm_events(PGFREE, 1 << order); pindex = order_to_pindex(migratetype, order); list_add(&page->pcp_list, &pcp->lists[pindex]); pcp->count += 1 << order; @@ -3803,7 +3803,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, pcp_spin_unlock_irqrestore(pcp, flags); pcp_trylock_finish(UP_flags); if (page) { - __count_zid_vm_events(PGALLOC, page_zonenum(page), 1); + __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); zone_statistics(preferred_zone, zone, 1); } return page; From ef6e06b2ef87077104d1145a0fd452ff8dbbc4b7 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Wed, 5 Oct 2022 21:05:55 -0700 Subject: [PATCH 29/29] highmem: fix kmap_to_page() for kmap_local_page() addresses kmap_to_page() is used to get the page for a virtual address which may be kmap'ed. Unfortunately, kmap_local_page() stores mappings in a thread local array separate from kmap(). These mappings were not checked by the call. Check the kmap_local_page() mappings and return the page if found. Because it is intended to remove kmap_to_page() add a warn on once to the kmap checks to flag potential issues early. NOTE Due to 32bit x86 use of kmap local in iomap atmoic, KMAP_LOCAL does not require HIGHMEM to be set. Therefore the support calls required a new KMAP_LOCAL section to fix 0day build errors. [akpm@linux-foundation.org: fix warning] Link: https://lkml.kernel.org/r/20221006040555.1502679-1-ira.weiny@intel.com Signed-off-by: Ira Weiny Reported-by: Al Viro Reported-by: kernel test robot Cc: "Fabio M. De Francesco" Cc: Thomas Gleixner Cc: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/highmem.c | 43 +++++++++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/mm/highmem.c b/mm/highmem.c index c707d7202d5f7..db251e77f98f8 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -30,6 +30,17 @@ #include #include +#ifdef CONFIG_KMAP_LOCAL +static inline int kmap_local_calc_idx(int idx) +{ + return idx + KM_MAX_IDX * smp_processor_id(); +} + +#ifndef arch_kmap_local_map_idx +#define arch_kmap_local_map_idx(idx, pfn) kmap_local_calc_idx(idx) +#endif +#endif /* CONFIG_KMAP_LOCAL */ + /* * Virtual_count is not a pure "count". * 0 means that it is not mapped, and has not been mapped @@ -142,12 +153,29 @@ pte_t *pkmap_page_table; struct page *__kmap_to_page(void *vaddr) { + unsigned long base = (unsigned long) vaddr & PAGE_MASK; + struct kmap_ctrl *kctrl = ¤t->kmap_ctrl; unsigned long addr = (unsigned long)vaddr; + int i; + + /* kmap() mappings */ + if (WARN_ON_ONCE(addr >= PKMAP_ADDR(0) && + addr < PKMAP_ADDR(LAST_PKMAP))) + return pte_page(pkmap_page_table[PKMAP_NR(addr)]); - if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) { - int i = PKMAP_NR(addr); + /* kmap_local_page() mappings */ + if (WARN_ON_ONCE(base >= __fix_to_virt(FIX_KMAP_END) && + base < __fix_to_virt(FIX_KMAP_BEGIN))) { + for (i = 0; i < kctrl->idx; i++) { + unsigned long base_addr; + int idx; - return pte_page(pkmap_page_table[i]); + idx = arch_kmap_local_map_idx(i, pte_pfn(pteval)); + base_addr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + + if (base_addr == base) + return pte_page(kctrl->pteval[i]); + } } return virt_to_page(vaddr); @@ -462,10 +490,6 @@ static inline void kmap_local_idx_pop(void) # define arch_kmap_local_post_unmap(vaddr) do { } while (0) #endif -#ifndef arch_kmap_local_map_idx -#define arch_kmap_local_map_idx(idx, pfn) kmap_local_calc_idx(idx) -#endif - #ifndef arch_kmap_local_unmap_idx #define arch_kmap_local_unmap_idx(idx, vaddr) kmap_local_calc_idx(idx) #endif @@ -494,11 +518,6 @@ static inline bool kmap_high_unmap_local(unsigned long vaddr) return false; } -static inline int kmap_local_calc_idx(int idx) -{ - return idx + KM_MAX_IDX * smp_processor_id(); -} - static pte_t *__kmap_pte; static pte_t *kmap_get_pte(unsigned long vaddr, int idx)