From b808f629215685c1941b1cd567c7b7ccb3c90278 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Fri, 9 Aug 2024 13:25:11 +0500 Subject: [PATCH 01/17] selftests: mm: fix build errors on armhf The __NR_mmap isn't found on armhf. The mmap() is commonly available system call and its wrapper is present on all architectures. So it should be used directly. It solves problem for armhf and doesn't create problem for other architectures. Remove sys_mmap() functions as they aren't doing anything else other than calling mmap(). There is no need to set errno = 0 manually as glibc always resets it. For reference errors are as following: CC seal_elf seal_elf.c: In function 'sys_mmap': seal_elf.c:39:33: error: '__NR_mmap' undeclared (first use in this function) 39 | sret = (void *) syscall(__NR_mmap, addr, len, prot, | ^~~~~~~~~ mseal_test.c: In function 'sys_mmap': mseal_test.c:90:33: error: '__NR_mmap' undeclared (first use in this function) 90 | sret = (void *) syscall(__NR_mmap, addr, len, prot, | ^~~~~~~~~ Link: https://lkml.kernel.org/r/20240809082511.497266-1-usama.anjum@collabora.com Fixes: 4926c7a52de7 ("selftest mm/mseal memory sealing") Signed-off-by: Muhammad Usama Anjum Cc: Jeff Xu Cc: Kees Cook Cc: Liam R. Howlett Cc: Shuah Khan Cc: Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/mseal_test.c | 37 +++++++++---------------- tools/testing/selftests/mm/seal_elf.c | 13 +-------- 2 files changed, 14 insertions(+), 36 deletions(-) diff --git a/tools/testing/selftests/mm/mseal_test.c b/tools/testing/selftests/mm/mseal_test.c index a818f010de479..bfcea5cf9a484 100644 --- a/tools/testing/selftests/mm/mseal_test.c +++ b/tools/testing/selftests/mm/mseal_test.c @@ -81,17 +81,6 @@ static int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, return sret; } -static void *sys_mmap(void *addr, unsigned long len, unsigned long prot, - unsigned long flags, unsigned long fd, unsigned long offset) -{ - void *sret; - - errno = 0; - sret = (void *) syscall(__NR_mmap, addr, len, prot, - flags, fd, offset); - return sret; -} - static int sys_munmap(void *ptr, size_t size) { int sret; @@ -172,7 +161,7 @@ static void setup_single_address(int size, void **ptrOut) { void *ptr; - ptr = sys_mmap(NULL, size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + ptr = mmap(NULL, size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); *ptrOut = ptr; } @@ -181,7 +170,7 @@ static void setup_single_address_rw(int size, void **ptrOut) void *ptr; unsigned long mapflags = MAP_ANONYMOUS | MAP_PRIVATE; - ptr = sys_mmap(NULL, size, PROT_READ | PROT_WRITE, mapflags, -1, 0); + ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, mapflags, -1, 0); *ptrOut = ptr; } @@ -205,7 +194,7 @@ bool seal_support(void) void *ptr; unsigned long page_size = getpagesize(); - ptr = sys_mmap(NULL, page_size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + ptr = mmap(NULL, page_size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); if (ptr == (void *) -1) return false; @@ -481,8 +470,8 @@ static void test_seal_zero_address(void) int prot; /* use mmap to change protection. */ - ptr = sys_mmap(0, size, PROT_NONE, - MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + ptr = mmap(0, size, PROT_NONE, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); FAIL_TEST_IF_FALSE(ptr == 0); size = get_vma_size(ptr, &prot); @@ -1209,8 +1198,8 @@ static void test_seal_mmap_overwrite_prot(bool seal) } /* use mmap to change protection. 
*/ - ret2 = sys_mmap(ptr, size, PROT_NONE, - MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + ret2 = mmap(ptr, size, PROT_NONE, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); if (seal) { FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); FAIL_TEST_IF_FALSE(errno == EPERM); @@ -1240,8 +1229,8 @@ static void test_seal_mmap_expand(bool seal) } /* use mmap to expand. */ - ret2 = sys_mmap(ptr, size, PROT_READ, - MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + ret2 = mmap(ptr, size, PROT_READ, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); if (seal) { FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); FAIL_TEST_IF_FALSE(errno == EPERM); @@ -1268,8 +1257,8 @@ static void test_seal_mmap_shrink(bool seal) } /* use mmap to shrink. */ - ret2 = sys_mmap(ptr, 8 * page_size, PROT_READ, - MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + ret2 = mmap(ptr, 8 * page_size, PROT_READ, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); if (seal) { FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); FAIL_TEST_IF_FALSE(errno == EPERM); @@ -1650,7 +1639,7 @@ static void test_seal_discard_ro_anon_on_filebacked(bool seal) ret = fallocate(fd, 0, 0, size); FAIL_TEST_IF_FALSE(!ret); - ptr = sys_mmap(NULL, size, PROT_READ, mapflags, fd, 0); + ptr = mmap(NULL, size, PROT_READ, mapflags, fd, 0); FAIL_TEST_IF_FALSE(ptr != MAP_FAILED); if (seal) { @@ -1680,7 +1669,7 @@ static void test_seal_discard_ro_anon_on_shared(bool seal) int ret; unsigned long mapflags = MAP_ANONYMOUS | MAP_SHARED; - ptr = sys_mmap(NULL, size, PROT_READ, mapflags, -1, 0); + ptr = mmap(NULL, size, PROT_READ, mapflags, -1, 0); FAIL_TEST_IF_FALSE(ptr != (void *)-1); if (seal) { diff --git a/tools/testing/selftests/mm/seal_elf.c b/tools/testing/selftests/mm/seal_elf.c index 7aa1366063e4e..d9f8ba8d5050b 100644 --- a/tools/testing/selftests/mm/seal_elf.c +++ b/tools/testing/selftests/mm/seal_elf.c @@ -30,17 +30,6 @@ static int sys_mseal(void *start, size_t len) return sret; } -static void *sys_mmap(void *addr, unsigned long len, unsigned long prot, - unsigned long flags, unsigned long fd, unsigned long offset) -{ - void *sret; - - errno = 0; - sret = (void *) syscall(__NR_mmap, addr, len, prot, - flags, fd, offset); - return sret; -} - static inline int sys_mprotect(void *ptr, size_t size, unsigned long prot) { int sret; @@ -56,7 +45,7 @@ static bool seal_support(void) void *ptr; unsigned long page_size = getpagesize(); - ptr = sys_mmap(NULL, page_size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + ptr = mmap(NULL, page_size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); if (ptr == (void *) -1) return false; From 3e3de7947c751509027d26b679ecd243bc9db255 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 12 Aug 2024 18:16:06 +0100 Subject: [PATCH 02/17] mm: vmalloc: ensure vmap_block is initialised before adding to queue Commit 8c61291fd850 ("mm: fix incorrect vbq reference in purge_fragmented_block") extended the 'vmap_block' structure to contain a 'cpu' field which is set at allocation time to the id of the initialising CPU. When a new 'vmap_block' is being instantiated by new_vmap_block(), the partially initialised structure is added to the local 'vmap_block_queue' xarray before the 'cpu' field has been initialised. If another CPU is concurrently walking the xarray (e.g. via vm_unmap_aliases()), then it may perform an out-of-bounds access to the remote queue thanks to an uninitialised index. 
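A minimal userspace sketch of the ordering rule being restored here (illustration only; the struct, variable and function names below are invented, not the kernel's): an object must be fully initialised before it is published into a structure that lock-free readers may already be walking.

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct blk {
	int cpu;			/* stands in for vmap_block::cpu   */
};

static _Atomic(struct blk *) published;	/* stands in for the vb xarray     */

static void *reader(void *queues)
{
	struct blk *b = atomic_load(&published);

	/* A concurrent walker indexes per-CPU state with b->cpu; it must
	 * never observe the object before that field has been written. */
	if (b)
		((int *)queues)[b->cpu]++;
	return NULL;
}

int main(void)
{
	int queues[8] = { 0 };
	struct blk *b = malloc(sizeof(*b));
	pthread_t t;

	b->cpu = 3;			/* initialise first ...            */
	atomic_store(&published, b);	/* ... then publish - the same
					 * reordering as moving vb->cpu
					 * ahead of the xarray insertion   */
	pthread_create(&t, NULL, reader, queues);
	pthread_join(t, NULL);
	free(b);
	return 0;
}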
This has been observed as UBSAN errors in Android: | Internal error: UBSAN: array index out of bounds: 00000000f2005512 [#1] PREEMPT SMP | | Call trace: | purge_fragmented_block+0x204/0x21c | _vm_unmap_aliases+0x170/0x378 | vm_unmap_aliases+0x1c/0x28 | change_memory_common+0x1dc/0x26c | set_memory_ro+0x18/0x24 | module_enable_ro+0x98/0x238 | do_init_module+0x1b0/0x310 Move the initialisation of 'vb->cpu' in new_vmap_block() ahead of the addition to the xarray. Link: https://lkml.kernel.org/r/20240812171606.17486-1-will@kernel.org Fixes: 8c61291fd850 ("mm: fix incorrect vbq reference in purge_fragmented_block") Signed-off-by: Will Deacon Reviewed-by: Baoquan He Reviewed-by: Uladzislau Rezki (Sony) Cc: Zhaoyang Huang Cc: Hailong.Liu Cc: Christoph Hellwig Cc: Lorenzo Stoakes Cc: Thomas Gleixner Cc: Signed-off-by: Andrew Morton --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index af2de36549d60..ac53d46ac8a5c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2626,6 +2626,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) vb->dirty_max = 0; bitmap_set(vb->used_map, 0, (1UL << order)); INIT_LIST_HEAD(&vb->free_list); + vb->cpu = raw_smp_processor_id(); xa = addr_to_vb_xa(va->va_start); vb_idx = addr_to_vb_idx(va->va_start); @@ -2642,7 +2643,6 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) * integrity together with list_for_each_rcu from read * side. */ - vb->cpu = raw_smp_processor_id(); vbq = per_cpu_ptr(&vmap_block_queue, vb->cpu); spin_lock(&vbq->lock); list_add_tail_rcu(&vb->free_list, &vbq->free); From 71c186efc1b2cf1aeabfeff3b9bd5ac4c5ac14d8 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Tue, 13 Aug 2024 22:25:21 +0200 Subject: [PATCH 03/17] userfaultfd: fix checks for huge PMDs Patch series "userfaultfd: fix races around pmd_trans_huge() check", v2. The pmd_trans_huge() code in mfill_atomic() is wrong in three different ways depending on kernel version: 1. The pmd_trans_huge() check is racy and can lead to a BUG_ON() (if you hit the right two race windows) - I've tested this in a kernel build with some extra mdelay() calls. See the commit message for a description of the race scenario. On older kernels (before 6.5), I think the same bug can even theoretically lead to accessing transhuge page contents as a page table if you hit the right 5 narrow race windows (I haven't tested this case). 2. As pointed out by Qi Zheng, pmd_trans_huge() is not sufficient for detecting PMDs that don't point to page tables. On older kernels (before 6.5), you'd just have to win a single fairly wide race to hit this. I've tested this on 6.1 stable by racing migration (with a mdelay() patched into try_to_migrate()) against UFFDIO_ZEROPAGE - on my x86 VM, that causes a kernel oops in ptlock_ptr(). 3. On newer kernels (>=6.5), for shmem mappings, khugepaged is allowed to yank page tables out from under us (though I haven't tested that), so I think the BUG_ON() checks in mfill_atomic() are just wrong. I decided to write two separate fixes for these (one fix for bugs 1+2, one fix for bug 3), so that the first fix can be backported to kernels affected by bugs 1+2. This patch (of 2): This fixes two issues. I discovered that the following race can occur: mfill_atomic other thread ============ ============ pmdp_get_lockless() [reads none pmd] __pte_alloc [no-op] BUG_ON(pmd_none(*dst_pmd)) I have experimentally verified this in a kernel with extra mdelay() calls; the BUG_ON(pmd_none(*dst_pmd)) triggers. 
On kernels newer than commit 0d940a9b270b ("mm/pgtable: allow pte_offset_map[_lock]() to fail"), this can't lead to anything worse than a BUG_ON(), since the page table access helpers are actually designed to deal with page tables concurrently disappearing; but on older kernels (<=6.4), I think we could probably theoretically race past the two BUG_ON() checks and end up treating a hugepage as a page table. The second issue is that, as Qi Zheng pointed out, there are other types of huge PMDs that pmd_trans_huge() can't catch: devmap PMDs and swap PMDs (in particular, migration PMDs). On <=6.4, this is worse than the first issue: If mfill_atomic() runs on a PMD that contains a migration entry (which just requires winning a single, fairly wide race), it will pass the PMD to pte_offset_map_lock(), which assumes that the PMD points to a page table. Breakage follows: First, the kernel tries to take the PTE lock (which will crash or maybe worse if there is no "struct page" for the address bits in the migration entry PMD - I think at least on X86 there usually is no corresponding "struct page" thanks to the PTE inversion mitigation, amd64 looks different). If that didn't crash, the kernel would next try to write a PTE into what it wrongly thinks is a page table. As part of fixing these issues, get rid of the check for pmd_trans_huge() before __pte_alloc() - that's redundant, we're going to have to check for that after the __pte_alloc() anyway. Backport note: pmdp_get_lockless() is pmd_read_atomic() in older kernels. Link: https://lkml.kernel.org/r/20240813-uffd-thp-flip-fix-v2-0-5efa61078a41@google.com Link: https://lkml.kernel.org/r/20240813-uffd-thp-flip-fix-v2-1-5efa61078a41@google.com Fixes: c1a4de99fada ("userfaultfd: mcopy_atomic|mfill_zeropage: UFFDIO_COPY|UFFDIO_ZEROPAGE preparation") Signed-off-by: Jann Horn Acked-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Jann Horn Cc: Pavel Emelyanov Cc: Qi Zheng Cc: Signed-off-by: Andrew Morton --- mm/userfaultfd.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index e54e5c8907fa2..290b2a0d84ac5 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -787,21 +787,23 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, } dst_pmdval = pmdp_get_lockless(dst_pmd); - /* - * If the dst_pmd is mapped as THP don't - * override it and just be strict. - */ - if (unlikely(pmd_trans_huge(dst_pmdval))) { - err = -EEXIST; - break; - } if (unlikely(pmd_none(dst_pmdval)) && unlikely(__pte_alloc(dst_mm, dst_pmd))) { err = -ENOMEM; break; } - /* If an huge pmd materialized from under us fail */ - if (unlikely(pmd_trans_huge(*dst_pmd))) { + dst_pmdval = pmdp_get_lockless(dst_pmd); + /* + * If the dst_pmd is THP don't override it and just be strict. + * (This includes the case where the PMD used to be THP and + * changed back to none after __pte_alloc().) + */ + if (unlikely(!pmd_present(dst_pmdval) || pmd_trans_huge(dst_pmdval) || + pmd_devmap(dst_pmdval))) { + err = -EEXIST; + break; + } + if (unlikely(pmd_bad(dst_pmdval))) { err = -EFAULT; break; } From 4828d207dc5161dc7ddf9a4f6dcfd80c7dd7d20a Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Tue, 13 Aug 2024 22:25:22 +0200 Subject: [PATCH 04/17] userfaultfd: don't BUG_ON() if khugepaged yanks our page table Since khugepaged was changed to allow retracting page tables in file mappings without holding the mmap lock, these BUG_ON()s are wrong - get rid of them. 
We could also remove the preceding "if (unlikely(...))" block, but then we could reach pte_offset_map_lock() with transhuge pages not just for file mappings but also for anonymous mappings - which would probably be fine but I think is not necessarily expected. Link: https://lkml.kernel.org/r/20240813-uffd-thp-flip-fix-v2-2-5efa61078a41@google.com Fixes: 1d65b771bc08 ("mm/khugepaged: retract_page_tables() without mmap or vma lock") Signed-off-by: Jann Horn Reviewed-by: Qi Zheng Acked-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Pavel Emelyanov Cc: Signed-off-by: Andrew Morton --- mm/userfaultfd.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 290b2a0d84ac5..acc56c75ba994 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -807,9 +807,10 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, err = -EFAULT; break; } - - BUG_ON(pmd_none(*dst_pmd)); - BUG_ON(pmd_trans_huge(*dst_pmd)); + /* + * For shmem mappings, khugepaged is allowed to remove page + * tables under us; pte_offset_map_lock() will deal with that. + */ err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, src_addr, flags, &folio); From 683408258917541bdb294cd717c210a04381931e Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Sun, 11 Aug 2024 19:03:20 +0900 Subject: [PATCH 05/17] nilfs2: protect references to superblock parameters exposed in sysfs The superblock buffers of nilfs2 can not only be overwritten at runtime for modifications/repairs, but they are also regularly swapped, replaced during resizing, and even abandoned when degrading to one side due to backing device issues. So, accessing them requires mutual exclusion using the reader/writer semaphore "nilfs->ns_sem". Some sysfs attribute show methods read this superblock buffer without the necessary mutual exclusion, which can cause problems with pointer dereferencing and memory access, so fix it. 
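The pattern applied in the patch is the usual snapshot-under-reader-lock one. A hedged userspace analogue of roughly what the revision/device-size show methods now do (a pthread rwlock standing in for ns_sem; the struct and field names are invented):

#include <pthread.h>
#include <stdio.h>

struct super {				/* stands in for nilfs_super_block */
	unsigned int rev_major;
	unsigned int rev_minor;
};

static struct super *live_sb;		/* may be swapped/replaced by writers */
static pthread_rwlock_t sb_lock = PTHREAD_RWLOCK_INITIALIZER;

static int revision_show(char *buf, size_t len)
{
	unsigned int major, minor;

	/* Copy the needed fields while holding the reader lock ... */
	pthread_rwlock_rdlock(&sb_lock);
	major = live_sb->rev_major;
	minor = live_sb->rev_minor;
	pthread_rwlock_unlock(&sb_lock);

	/* ... and format only the snapshot after dropping it. */
	return snprintf(buf, len, "%u.%u\n", major, minor);
}

int main(void)
{
	static struct super sb = { .rev_major = 2, .rev_minor = 0 };
	char buf[32];

	live_sb = &sb;
	revision_show(buf, sizeof(buf));
	fputs(buf, stdout);
	return 0;
}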
Link: https://lkml.kernel.org/r/20240811100320.9913-1-konishi.ryusuke@gmail.com Fixes: da7141fb78db ("nilfs2: add /sys/fs/nilfs2/ group") Signed-off-by: Ryusuke Konishi Cc: Signed-off-by: Andrew Morton --- fs/nilfs2/sysfs.c | 43 +++++++++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c index a5569b7f47a39..14868a3dd592c 100644 --- a/fs/nilfs2/sysfs.c +++ b/fs/nilfs2/sysfs.c @@ -836,9 +836,15 @@ ssize_t nilfs_dev_revision_show(struct nilfs_dev_attr *attr, struct the_nilfs *nilfs, char *buf) { - struct nilfs_super_block **sbp = nilfs->ns_sbp; - u32 major = le32_to_cpu(sbp[0]->s_rev_level); - u16 minor = le16_to_cpu(sbp[0]->s_minor_rev_level); + struct nilfs_super_block *raw_sb; + u32 major; + u16 minor; + + down_read(&nilfs->ns_sem); + raw_sb = nilfs->ns_sbp[0]; + major = le32_to_cpu(raw_sb->s_rev_level); + minor = le16_to_cpu(raw_sb->s_minor_rev_level); + up_read(&nilfs->ns_sem); return sysfs_emit(buf, "%d.%d\n", major, minor); } @@ -856,8 +862,13 @@ ssize_t nilfs_dev_device_size_show(struct nilfs_dev_attr *attr, struct the_nilfs *nilfs, char *buf) { - struct nilfs_super_block **sbp = nilfs->ns_sbp; - u64 dev_size = le64_to_cpu(sbp[0]->s_dev_size); + struct nilfs_super_block *raw_sb; + u64 dev_size; + + down_read(&nilfs->ns_sem); + raw_sb = nilfs->ns_sbp[0]; + dev_size = le64_to_cpu(raw_sb->s_dev_size); + up_read(&nilfs->ns_sem); return sysfs_emit(buf, "%llu\n", dev_size); } @@ -879,9 +890,15 @@ ssize_t nilfs_dev_uuid_show(struct nilfs_dev_attr *attr, struct the_nilfs *nilfs, char *buf) { - struct nilfs_super_block **sbp = nilfs->ns_sbp; + struct nilfs_super_block *raw_sb; + ssize_t len; - return sysfs_emit(buf, "%pUb\n", sbp[0]->s_uuid); + down_read(&nilfs->ns_sem); + raw_sb = nilfs->ns_sbp[0]; + len = sysfs_emit(buf, "%pUb\n", raw_sb->s_uuid); + up_read(&nilfs->ns_sem); + + return len; } static @@ -889,10 +906,16 @@ ssize_t nilfs_dev_volume_name_show(struct nilfs_dev_attr *attr, struct the_nilfs *nilfs, char *buf) { - struct nilfs_super_block **sbp = nilfs->ns_sbp; + struct nilfs_super_block *raw_sb; + ssize_t len; + + down_read(&nilfs->ns_sem); + raw_sb = nilfs->ns_sbp[0]; + len = scnprintf(buf, sizeof(raw_sb->s_volume_name), "%s\n", + raw_sb->s_volume_name); + up_read(&nilfs->ns_sem); - return scnprintf(buf, sizeof(sbp[0]->s_volume_name), "%s\n", - sbp[0]->s_volume_name); + return len; } static const char dev_readme_str[] = From 5787fcaab9eb5930f5378d6a1dd03d916d146622 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Sat, 10 Aug 2024 15:52:42 +0900 Subject: [PATCH 06/17] nilfs2: fix missing cleanup on rollforward recovery error In an error injection test of a routine for mount-time recovery, KASAN found a use-after-free bug. It turned out that if data recovery was performed using partial logs created by dsync writes, but an error occurred before starting the log writer to create a recovered checkpoint, the inodes whose data had been recovered were left in the ns_dirty_files list of the nilfs object and were not freed. Fix this issue by cleaning up inodes that have read the recovery data if the recovery routine fails midway before the log writer starts. 
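The shape of the fix is the familiar "unwind whatever the successful steps created" error path. A generic, self-contained C sketch of that shape (the resource names are invented; the real cleanup iput()s the inodes left on ns_dirty_files):

#include <stdio.h>
#include <stdlib.h>

/* Toy stand-ins for "inodes that have read recovery data". */
static void *acquire(void)   { return malloc(16); }
static void  release(void *p) { free(p); }

static int recover(int nsteps, int fail_at)
{
	void *held[16] = { NULL };
	int i, err = 0;

	for (i = 0; i < nsteps; i++) {
		if (i == fail_at) {	/* failure before the log writer runs */
			err = -1;
			goto abort;
		}
		held[i] = acquire();
	}
	return 0;			/* log writer takes over from here    */

abort:
	/* The bug was returning here without this loop: everything collected
	 * so far must be released again, or it is leaked. */
	while (i-- > 0)
		release(held[i]);
	return err;
}

int main(void)
{
	printf("recover() -> %d\n", recover(8, 5));
	return 0;
}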
Link: https://lkml.kernel.org/r/20240810065242.3701-1-konishi.ryusuke@gmail.com Fixes: 0f3e1c7f23f8 ("nilfs2: recovery functions") Signed-off-by: Ryusuke Konishi Tested-by: Ryusuke Konishi Cc: Signed-off-by: Andrew Morton --- fs/nilfs2/recovery.c | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index b638dc06df2f7..61e25a980f73e 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -715,6 +715,33 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs, brelse(bh); } +/** + * nilfs_abort_roll_forward - cleaning up after a failed rollforward recovery + * @nilfs: nilfs object + */ +static void nilfs_abort_roll_forward(struct the_nilfs *nilfs) +{ + struct nilfs_inode_info *ii, *n; + LIST_HEAD(head); + + /* Abandon inodes that have read recovery data */ + spin_lock(&nilfs->ns_inode_lock); + list_splice_init(&nilfs->ns_dirty_files, &head); + spin_unlock(&nilfs->ns_inode_lock); + if (list_empty(&head)) + return; + + set_nilfs_purging(nilfs); + list_for_each_entry_safe(ii, n, &head, i_dirty) { + spin_lock(&nilfs->ns_inode_lock); + list_del_init(&ii->i_dirty); + spin_unlock(&nilfs->ns_inode_lock); + + iput(&ii->vfs_inode); + } + clear_nilfs_purging(nilfs); +} + /** * nilfs_salvage_orphan_logs - salvage logs written after the latest checkpoint * @nilfs: nilfs object @@ -773,15 +800,19 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs, if (unlikely(err)) { nilfs_err(sb, "error %d writing segment for recovery", err); - goto failed; + goto put_root; } nilfs_finish_roll_forward(nilfs, ri); } - failed: +put_root: nilfs_put_root(root); return err; + +failed: + nilfs_abort_roll_forward(nilfs); + goto put_root; } /** From 6576dd6695f2afca3f4954029ac4a64f82ba60ab Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Wed, 14 Aug 2024 19:11:19 +0900 Subject: [PATCH 07/17] nilfs2: fix state management in error path of log writing function After commit a694291a6211 ("nilfs2: separate wait function from nilfs_segctor_write") was applied, the log writing function nilfs_segctor_do_construct() was able to issue I/O requests continuously even if user data blocks were split into multiple logs across segments, but two potential flaws were introduced in its error handling. First, if nilfs_segctor_begin_construction() fails while creating the second or subsequent logs, the log writing function returns without calling nilfs_segctor_abort_construction(), so the writeback flag set on pages/folios will remain uncleared. This causes page cache operations to hang waiting for the writeback flag. For example, truncate_inode_pages_final(), which is called via nilfs_evict_inode() when an inode is evicted from memory, will hang. Second, the NILFS_I_COLLECTED flag set on normal inodes remain uncleared. As a result, if the next log write involves checkpoint creation, that's fine, but if a partial log write is performed that does not, inodes with NILFS_I_COLLECTED set are erroneously removed from the "sc_dirty_files" list, and their data and b-tree blocks may not be written to the device, corrupting the block mapping. Fix these issues by uniformly calling nilfs_segctor_abort_construction() on failure of each step in the loop in nilfs_segctor_do_construct(), having it clean up logs and segment usages according to progress, and correcting the conditions for calling nilfs_redirty_inodes() to ensure that the NILFS_I_COLLECTED flag is cleared. 
Link: https://lkml.kernel.org/r/20240814101119.4070-1-konishi.ryusuke@gmail.com Fixes: a694291a6211 ("nilfs2: separate wait function from nilfs_segctor_write") Signed-off-by: Ryusuke Konishi Tested-by: Ryusuke Konishi Cc: Signed-off-by: Andrew Morton --- fs/nilfs2/segment.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 0ca3110d63868..871ec35ea8e8a 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1812,6 +1812,9 @@ static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci, nilfs_abort_logs(&logs, ret ? : err); list_splice_tail_init(&sci->sc_segbufs, &logs); + if (list_empty(&logs)) + return; /* if the first segment buffer preparation failed */ + nilfs_cancel_segusage(&logs, nilfs->ns_sufile); nilfs_free_incomplete_logs(&logs, nilfs); @@ -2056,7 +2059,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) err = nilfs_segctor_begin_construction(sci, nilfs); if (unlikely(err)) - goto out; + goto failed; /* Update time stamp */ sci->sc_seg_ctime = ktime_get_real_seconds(); @@ -2120,10 +2123,9 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) return err; failed_to_write: - if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED) - nilfs_redirty_inodes(&sci->sc_dirty_files); - failed: + if (mode == SC_LSEG_SR && nilfs_sc_cstage_get(sci) >= NILFS_ST_IFILE) + nilfs_redirty_inodes(&sci->sc_dirty_files); if (nilfs_doing_gc()) nilfs_redirty_inodes(&sci->sc_gc_inodes); nilfs_segctor_abort_construction(sci, nilfs, err); From ab7ca09520e9c41c219a4427fe0dae24024bfe7f Mon Sep 17 00:00:00 2001 From: Hao Ge Date: Fri, 16 Aug 2024 09:33:36 +0800 Subject: [PATCH 08/17] mm/slub: add check for s->flags in the alloc_tagging_slab_free_hook When enable CONFIG_MEMCG & CONFIG_KFENCE & CONFIG_KMEMLEAK, the following warning always occurs,This is because the following call stack occurred: mem_pool_alloc kmem_cache_alloc_noprof slab_alloc_node kfence_alloc Once the kfence allocation is successful,slab->obj_exts will not be empty, because it has already been assigned a value in kfence_init_pool. Since in the prepare_slab_obj_exts_hook function,we perform a check for s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE),the alloc_tag_add function will not be called as a result.Therefore,ref->ct remains NULL. However,when we call mem_pool_free,since obj_ext is not empty, it eventually leads to the alloc_tag_sub scenario being invoked. This is where the warning occurs. So we should add corresponding checks in the alloc_tagging_slab_free_hook. For __GFP_NO_OBJ_EXT case,I didn't see the specific case where it's using kfence,so I won't add the corresponding check in alloc_tagging_slab_free_hook for now. 
[ 3.734349] ------------[ cut here ]------------ [ 3.734807] alloc_tag was not set [ 3.735129] WARNING: CPU: 4 PID: 40 at ./include/linux/alloc_tag.h:130 kmem_cache_free+0x444/0x574 [ 3.735866] Modules linked in: autofs4 [ 3.736211] CPU: 4 UID: 0 PID: 40 Comm: ksoftirqd/4 Tainted: G W 6.11.0-rc3-dirty #1 [ 3.736969] Tainted: [W]=WARN [ 3.737258] Hardware name: QEMU KVM Virtual Machine, BIOS unknown 2/2/2022 [ 3.737875] pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 3.738501] pc : kmem_cache_free+0x444/0x574 [ 3.738951] lr : kmem_cache_free+0x444/0x574 [ 3.739361] sp : ffff80008357bb60 [ 3.739693] x29: ffff80008357bb70 x28: 0000000000000000 x27: 0000000000000000 [ 3.740338] x26: ffff80008207f000 x25: ffff000b2eb2fd60 x24: ffff0000c0005700 [ 3.740982] x23: ffff8000804229e4 x22: ffff800082080000 x21: ffff800081756000 [ 3.741630] x20: fffffd7ff8253360 x19: 00000000000000a8 x18: ffffffffffffffff [ 3.742274] x17: ffff800ab327f000 x16: ffff800083398000 x15: ffff800081756df0 [ 3.742919] x14: 0000000000000000 x13: 205d344320202020 x12: 5b5d373038343337 [ 3.743560] x11: ffff80008357b650 x10: 000000000000005d x9 : 00000000ffffffd0 [ 3.744231] x8 : 7f7f7f7f7f7f7f7f x7 : ffff80008237bad0 x6 : c0000000ffff7fff [ 3.744907] x5 : ffff80008237ba78 x4 : ffff8000820bbad0 x3 : 0000000000000001 [ 3.745580] x2 : 68d66547c09f7800 x1 : 68d66547c09f7800 x0 : 0000000000000000 [ 3.746255] Call trace: [ 3.746530] kmem_cache_free+0x444/0x574 [ 3.746931] mem_pool_free+0x44/0xf4 [ 3.747306] free_object_rcu+0xc8/0xdc [ 3.747693] rcu_do_batch+0x234/0x8a4 [ 3.748075] rcu_core+0x230/0x3e4 [ 3.748424] rcu_core_si+0x14/0x1c [ 3.748780] handle_softirqs+0x134/0x378 [ 3.749189] run_ksoftirqd+0x70/0x9c [ 3.749560] smpboot_thread_fn+0x148/0x22c [ 3.749978] kthread+0x10c/0x118 [ 3.750323] ret_from_fork+0x10/0x20 [ 3.750696] ---[ end trace 0000000000000000 ]--- Link: https://lkml.kernel.org/r/20240816013336.17505-1-hao.ge@linux.dev Fixes: 4b8736964640 ("mm/slab: add allocation accounting into slab allocation and free paths") Signed-off-by: Hao Ge Cc: Christoph Lameter Cc: David Rientjes Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Joonsoo Kim Cc: Kees Cook Cc: Kent Overstreet Cc: Pekka Enberg Cc: Roman Gushchin Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- mm/slub.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/slub.c b/mm/slub.c index c9d8a2497fd65..a77f354f83251 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2116,6 +2116,10 @@ alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, if (!mem_alloc_profiling_enabled()) return; + /* slab->obj_exts might not be NULL if it was created for MEMCG accounting. */ + if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE)) + return; + obj_exts = slab_obj_exts(slab); if (!obj_exts) return; From 6dacd79d28842ff01f18b4900d897741aac5999e Mon Sep 17 00:00:00 2001 From: Petr Tesarik Date: Mon, 5 Aug 2024 17:07:50 +0200 Subject: [PATCH 09/17] kexec_file: fix elfcorehdr digest exclusion when CONFIG_CRASH_HOTPLUG=y Fix the condition to exclude the elfcorehdr segment from the SHA digest calculation. The j iterator is an index into the output sha_regions[] array, not into the input image->segment[] array. Once it reaches image->elfcorehdr_index, all subsequent segments are excluded. Besides, if the purgatory segment precedes the elfcorehdr segment, the elfcorehdr may be wrongly included in the calculation. 
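The bug and its fix boil down to which iterator identifies the segment to skip. A standalone illustration (the segment names and the excluded index are made up for the example):

#include <stdio.h>

int main(void)
{
	const char *segment[] = { "kernel", "initrd", "purgatory", "elfcorehdr" };
	const int elfcorehdr_index = 3;	/* input segment to exclude        */
	const char *sha_regions[4];
	int j = 0;			/* index into the *output* array   */

	for (int i = 0; i < 4; i++) {
		/*
		 * Correct: compare the input index i.  Comparing the output
		 * index j instead has two failure modes: once j reaches the
		 * excluded value it stops advancing and every remaining
		 * segment is skipped; and if an earlier segment was skipped
		 * for another reason (purgatory in the real code), j lags i
		 * and the elfcorehdr itself ends up being digested.
		 */
		if (i == elfcorehdr_index)
			continue;
		sha_regions[j++] = segment[i];
	}

	for (int k = 0; k < j; k++)
		printf("digest covers: %s\n", sha_regions[k]);
	return 0;
}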
Link: https://lkml.kernel.org/r/20240805150750.170739-1-petr.tesarik@suse.com Fixes: f7cc804a9fd4 ("kexec: exclude elfcorehdr from the segment digest") Signed-off-by: Petr Tesarik Acked-by: Baoquan He Cc: Eric Biederman Cc: Hari Bathini Cc: Sourabh Jain Cc: Eric DeVolder Cc: Signed-off-by: Andrew Morton --- kernel/kexec_file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 3d64290d24c9a..3eedb8c226ad8 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -752,7 +752,7 @@ static int kexec_calculate_store_digests(struct kimage *image) #ifdef CONFIG_CRASH_HOTPLUG /* Exclude elfcorehdr segment to allow future changes via hotplug */ - if (j == image->elfcorehdr_index) + if (i == image->elfcorehdr_index) continue; #endif From f806de88d8f7f8191afd0fd9b94db4cd058e7d4f Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 20 Aug 2024 13:54:17 -0400 Subject: [PATCH 10/17] maple_tree: remove rcu_read_lock() from mt_validate() The write lock should be held when validating the tree to avoid updates racing with checks. Holding the rcu read lock during a large tree validation may also cause a prolonged rcu read window and "rcu_preempt detected stalls" warnings. Link: https://lore.kernel.org/all/0000000000001d12d4062005aea1@google.com/ Link: https://lkml.kernel.org/r/20240820175417.2782532-1-Liam.Howlett@oracle.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Liam R. Howlett Reported-by: syzbot+036af2f0c7338a33b0cd@syzkaller.appspotmail.com Cc: Hillf Danton Cc: Matthew Wilcox Cc: "Paul E. McKenney" Cc: Signed-off-by: Andrew Morton --- lib/maple_tree.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index aa3a5df15b8ef..6df3a8b95808a 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -7566,14 +7566,14 @@ static void mt_validate_nulls(struct maple_tree *mt) * 2. The gap is correctly set in the parents */ void mt_validate(struct maple_tree *mt) + __must_hold(mas->tree->ma_lock) { unsigned char end; MA_STATE(mas, mt, 0, 0); - rcu_read_lock(); mas_start(&mas); if (!mas_is_active(&mas)) - goto done; + return; while (!mte_is_leaf(mas.node)) mas_descend(&mas); @@ -7594,9 +7594,6 @@ void mt_validate(struct maple_tree *mt) mas_dfs_postorder(&mas, ULONG_MAX); } mt_validate_nulls(mt); -done: - rcu_read_unlock(); - } EXPORT_SYMBOL_GPL(mt_validate); From bfe0857c20c663fcc1592fa4e3a61ca12b07dac9 Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Wed, 21 Aug 2024 20:26:07 +0100 Subject: [PATCH 11/17] Revert "mm: skip CMA pages when they are not available" This reverts commit 5da226dbfce3 ("mm: skip CMA pages when they are not available") and b7108d66318a ("Multi-gen LRU: skip CMA pages when they are not eligible"). lruvec->lru_lock is highly contended and is held when calling isolate_lru_folios. If the lru has a large number of CMA folios consecutively, while the allocation type requested is not MIGRATE_MOVABLE, isolate_lru_folios can hold the lock for a very long time while it skips those. For FIO workload, ~150million order=0 folios were skipped to isolate a few ZONE_DMA folios [1]. This can cause lockups [1] and high memory pressure for extended periods of time [2]. Remove skipping CMA for MGLRU as well, as it was introduced in sort_folio for the same resaon as 5da226dbfce3a2f44978c2c7cf88166e69a6788b. 
[1] https://lore.kernel.org/all/CAOUHufbkhMZYz20aM_3rHZ3OcK4m2puji2FGpUpn_-DevGk3Kg@mail.gmail.com/ [2] https://lore.kernel.org/all/ZrssOrcJIDy8hacI@gmail.com/ [usamaarif642@gmail.com: also revert b7108d66318a, per Johannes] Link: https://lkml.kernel.org/r/9060a32d-b2d7-48c0-8626-1db535653c54@gmail.com Link: https://lkml.kernel.org/r/357ac325-4c61-497a-92a3-bdbd230d5ec9@gmail.com Link: https://lkml.kernel.org/r/9060a32d-b2d7-48c0-8626-1db535653c54@gmail.com Fixes: 5da226dbfce3 ("mm: skip CMA pages when they are not available") Signed-off-by: Usama Arif Acked-by: Johannes Weiner Cc: Bharata B Rao Cc: Breno Leitao Cc: David Hildenbrand Cc: Matthew Wilcox Cc: Rik van Riel Cc: Vlastimil Babka Cc: Yu Zhao Cc: Zhaoyang Huang Cc: Zhaoyang Huang Cc: Signed-off-by: Andrew Morton --- mm/vmscan.c | 24 ++---------------------- 1 file changed, 2 insertions(+), 22 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index cfa839284b923..bd489c1af2289 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1604,25 +1604,6 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec, } -#ifdef CONFIG_CMA -/* - * It is waste of effort to scan and reclaim CMA pages if it is not available - * for current allocation context. Kswapd can not be enrolled as it can not - * distinguish this scenario by using sc->gfp_mask = GFP_KERNEL - */ -static bool skip_cma(struct folio *folio, struct scan_control *sc) -{ - return !current_is_kswapd() && - gfp_migratetype(sc->gfp_mask) != MIGRATE_MOVABLE && - folio_migratetype(folio) == MIGRATE_CMA; -} -#else -static bool skip_cma(struct folio *folio, struct scan_control *sc) -{ - return false; -} -#endif - /* * Isolating page from the lruvec to fill in @dst list by nr_to_scan times. * @@ -1669,8 +1650,7 @@ static unsigned long isolate_lru_folios(unsigned long nr_to_scan, nr_pages = folio_nr_pages(folio); total_scan += nr_pages; - if (folio_zonenum(folio) > sc->reclaim_idx || - skip_cma(folio, sc)) { + if (folio_zonenum(folio) > sc->reclaim_idx) { nr_skipped[folio_zonenum(folio)] += nr_pages; move_to = &folios_skipped; goto move; @@ -4320,7 +4300,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c } /* ineligible */ - if (zone > sc->reclaim_idx || skip_cma(folio, sc)) { + if (zone > sc->reclaim_idx) { gen = folio_inc_gen(lruvec, folio, false); list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]); return true; From a3f6a89c834a4cba0f881da21307b26de3796133 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 23 Aug 2024 17:38:50 +0100 Subject: [PATCH 12/17] scripts: fix gfp-translate after ___GFP_*_BITS conversion to an enum MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Richard reports that since 772dd0342727c ("mm: enumerate all gfp flags"), gfp-translate is broken, as the bit numbers are implicit, leaving the shell script unable to extract them. Even more, some bits are now at a variable location, making it double extra hard to parse using a simple shell script. Use a brute-force approach to the problem by generating a small C stub that will use the enum to dump the interesting bits. As an added bonus, we are now able to identify invalid bits for a given configuration. As an added drawback, we cannot parse include files that predate this change anymore. Tough luck. 
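For reference, a stripped-down, self-contained version of the kind of stub the script now generates; the bit names and positions below are purely illustrative and not taken from any real gfp_types.h:

#include <stdio.h>
#include <stdlib.h>

/* In the generated stub this table is filled in from the ___GFP_*_BIT
 * enumerators in gfp_types.h, with per-bit #if guards. */
static const char *masks[] = {
	[0] = "___GFP_DMA",
	[1] = "___GFP_HIGHMEM",
	[2] = "___GFP_DMA32",
	[3] = "___GFP_MOVABLE",
};

int main(int argc, char *argv[])
{
	unsigned long mask = argc > 1 ? strtoul(argv[1], NULL, 0) : 0;

	for (unsigned int i = 0; i < sizeof(masks) / sizeof(masks[0]); i++) {
		if (!(mask & (1UL << i)))
			continue;
		printf("bit %2u: %s\n", i, masks[i] ? masks[i] : "invalid bit");
	}
	return 0;
}

Compiled and run with a mask argument (e.g. ./stub 0xd against the made-up table above), it prints the names of the set bits, which is what the updated script automates against the configured kernel source.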
Link: https://lkml.kernel.org/r/20240823163850.3791201-1-maz@kernel.org Fixes: 772dd0342727 ("mm: enumerate all gfp flags") Signed-off-by: Marc Zyngier Reported-by: Richard Weinberger Cc: Petr Tesařík Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- scripts/gfp-translate | 66 ++++++++++++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 17 deletions(-) diff --git a/scripts/gfp-translate b/scripts/gfp-translate index 6c9aed17cf563..8385ae0d5af93 100755 --- a/scripts/gfp-translate +++ b/scripts/gfp-translate @@ -62,25 +62,57 @@ if [ "$GFPMASK" = "none" ]; then fi # Extract GFP flags from the kernel source -TMPFILE=`mktemp -t gfptranslate-XXXXXX` || exit 1 -grep -q ___GFP $SOURCE/include/linux/gfp_types.h -if [ $? -eq 0 ]; then - grep "^#define ___GFP" $SOURCE/include/linux/gfp_types.h | sed -e 's/u$//' | grep -v GFP_BITS > $TMPFILE -else - grep "^#define __GFP" $SOURCE/include/linux/gfp_types.h | sed -e 's/(__force gfp_t)//' | sed -e 's/u)/)/' | grep -v GFP_BITS | sed -e 's/)\//) \//' > $TMPFILE -fi +TMPFILE=`mktemp -t gfptranslate-XXXXXX.c` || exit 1 -# Parse the flags -IFS=" -" echo Source: $SOURCE echo Parsing: $GFPMASK -for LINE in `cat $TMPFILE`; do - MASK=`echo $LINE | awk '{print $3}'` - if [ $(($GFPMASK&$MASK)) -ne 0 ]; then - echo $LINE - fi -done -rm -f $TMPFILE +( + cat < +#include + +// Try to fool compiler.h into not including extra stuff +#define __ASSEMBLY__ 1 + +#include +#include + +static const char *masks[] = { +EOF + + sed -nEe 's/^[[:space:]]+(___GFP_.*)_BIT,.*$/\1/p' $SOURCE/include/linux/gfp_types.h | + while read b; do + cat < 0) + [${b}_BIT] = "$b", +#endif +EOF + done + + cat < $TMPFILE + +${CC:-gcc} -Wall -o ${TMPFILE}.bin -I $SOURCE/include $TMPFILE && ${TMPFILE}.bin + +rm -f $TMPFILE ${TMPFILE}.bin + exit 0 From e399257349098bf7c84343f99efb2bc9c22eb9fd Mon Sep 17 00:00:00 2001 From: Mike Yuan Date: Fri, 23 Aug 2024 16:27:06 +0000 Subject: [PATCH 13/17] mm/memcontrol: respect zswap.writeback setting from parent cg too MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, the behavior of zswap.writeback wrt. the cgroup hierarchy seems a bit odd. Unlike zswap.max, it doesn't honor the value from parent cgroups. This surfaced when people tried to globally disable zswap writeback, i.e. reserve physical swap space only for hibernation [1] - disabling zswap.writeback only for the root cgroup results in subcgroups with zswap.writeback=1 still performing writeback. The inconsistency became more noticeable after I introduced the MemoryZSwapWriteback= systemd unit setting [2] for controlling the knob. The patch assumed that the kernel would enforce the value of parent cgroups. It could probably be workarounded from systemd's side, by going up the slice unit tree and inheriting the value. Yet I think it's more sensible to make it behave consistently with zswap.max and friends. 
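A minimal sketch of the semantics this switches to (types and names invented for the example): the effective value is the AND of the cgroup's own knob and all of its ancestors', mirroring the loop added to mem_cgroup_zswap_writeback_enabled().

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

struct cg {
	struct cg *parent;
	bool zswap_writeback;		/* this cgroup's own knob */
};

/* Writeback is permitted only if nobody up the chain disabled it. */
static bool writeback_enabled(const struct cg *cg)
{
	for (; cg; cg = cg->parent)
		if (!cg->zswap_writeback)
			return false;
	return true;
}

int main(void)
{
	struct cg root  = { .parent = NULL,  .zswap_writeback = false };
	struct cg child = { .parent = &root, .zswap_writeback = true  };

	/* Previously the child's "1" won; now the parent's "0" does. */
	assert(!writeback_enabled(&child));
	return 0;
}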
[1] https://wiki.archlinux.org/title/Power_management/Suspend_and_hibernate#Disable_zswap_writeback_to_use_the_swap_space_only_for_hibernation [2] https://github.com/systemd/systemd/pull/31734 Link: https://lkml.kernel.org/r/20240823162506.12117-1-me@yhndnzj.com Fixes: 501a06fe8e4c ("zswap: memcontrol: implement zswap writeback disabling") Signed-off-by: Mike Yuan Reviewed-by: Nhat Pham Acked-by: Yosry Ahmed Cc: Johannes Weiner Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Cc: Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v2.rst | 7 ++++--- mm/memcontrol.c | 12 +++++++++--- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 86311c2907cd3..95c18bc170834 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1717,9 +1717,10 @@ The following nested keys are defined. entries fault back in or are written out to disk. memory.zswap.writeback - A read-write single value file. The default value is "1". The - initial value of the root cgroup is 1, and when a new cgroup is - created, it inherits the current value of its parent. + A read-write single value file. The default value is "1". + Note that this setting is hierarchical, i.e. the writeback would be + implicitly disabled for child cgroups if the upper hierarchy + does so. When this is set to 0, all swapping attempts to swapping devices are disabled. This included both zswap writebacks, and swapping due diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f29157288b7dd..d563fb515766b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3613,8 +3613,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) memcg1_soft_limit_reset(memcg); #ifdef CONFIG_ZSWAP memcg->zswap_max = PAGE_COUNTER_MAX; - WRITE_ONCE(memcg->zswap_writeback, - !parent || READ_ONCE(parent->zswap_writeback)); + WRITE_ONCE(memcg->zswap_writeback, true); #endif page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); if (parent) { @@ -5320,7 +5319,14 @@ void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size) bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg) { /* if zswap is disabled, do not block pages going to the swapping device */ - return !zswap_is_enabled() || !memcg || READ_ONCE(memcg->zswap_writeback); + if (!zswap_is_enabled()) + return true; + + for (; memcg; memcg = parent_mem_cgroup(memcg)) + if (!READ_ONCE(memcg->zswap_writeback)) + return false; + + return true; } static u64 zswap_current_read(struct cgroup_subsys_state *css, From 5e9784e997620af7c1399029282f5d6964b41942 Mon Sep 17 00:00:00 2001 From: Hao Ge Date: Mon, 26 Aug 2024 00:36:49 +0800 Subject: [PATCH 14/17] codetag: debug: mark codetags for poisoned page as empty When PG_hwpoison pages are freed they are treated differently in free_pages_prepare() and instead of being released they are isolated. Page allocation tag counters are decremented at this point since the page is considered not in use. 
Later on when such pages are released by unpoison_memory(), the allocation tag counters will be decremented again and the following warning gets reported: [ 113.930443][ T3282] ------------[ cut here ]------------ [ 113.931105][ T3282] alloc_tag was not set [ 113.931576][ T3282] WARNING: CPU: 2 PID: 3282 at ./include/linux/alloc_tag.h:130 pgalloc_tag_sub.part.66+0x154/0x164 [ 113.932866][ T3282] Modules linked in: hwpoison_inject fuse ip6t_rpfilter ip6t_REJECT nf_reject_ipv6 ipt_REJECT nf_reject_ipv4 xt_conntrack ebtable_nat ebtable_broute ip6table_nat ip6table_man4 [ 113.941638][ T3282] CPU: 2 UID: 0 PID: 3282 Comm: madvise11 Kdump: loaded Tainted: G W 6.11.0-rc4-dirty #18 [ 113.943003][ T3282] Tainted: [W]=WARN [ 113.943453][ T3282] Hardware name: QEMU KVM Virtual Machine, BIOS unknown 2/2/2022 [ 113.944378][ T3282] pstate: 40400005 (nZcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 113.945319][ T3282] pc : pgalloc_tag_sub.part.66+0x154/0x164 [ 113.946016][ T3282] lr : pgalloc_tag_sub.part.66+0x154/0x164 [ 113.946706][ T3282] sp : ffff800087093a10 [ 113.947197][ T3282] x29: ffff800087093a10 x28: ffff0000d7a9d400 x27: ffff80008249f0a0 [ 113.948165][ T3282] x26: 0000000000000000 x25: ffff80008249f2b0 x24: 0000000000000000 [ 113.949134][ T3282] x23: 0000000000000001 x22: 0000000000000001 x21: 0000000000000000 [ 113.950597][ T3282] x20: ffff0000c08fcad8 x19: ffff80008251e000 x18: ffffffffffffffff [ 113.952207][ T3282] x17: 0000000000000000 x16: 0000000000000000 x15: ffff800081746210 [ 113.953161][ T3282] x14: 0000000000000000 x13: 205d323832335420 x12: 5b5d353031313339 [ 113.954120][ T3282] x11: ffff800087093500 x10: 000000000000005d x9 : 00000000ffffffd0 [ 113.955078][ T3282] x8 : 7f7f7f7f7f7f7f7f x7 : ffff80008236ba90 x6 : c0000000ffff7fff [ 113.956036][ T3282] x5 : ffff000b34bf4dc8 x4 : ffff8000820aba90 x3 : 0000000000000001 [ 113.956994][ T3282] x2 : ffff800ab320f000 x1 : 841d1e35ac932e00 x0 : 0000000000000000 [ 113.957962][ T3282] Call trace: [ 113.958350][ T3282] pgalloc_tag_sub.part.66+0x154/0x164 [ 113.959000][ T3282] pgalloc_tag_sub+0x14/0x1c [ 113.959539][ T3282] free_unref_page+0xf4/0x4b8 [ 113.960096][ T3282] __folio_put+0xd4/0x120 [ 113.960614][ T3282] folio_put+0x24/0x50 [ 113.961103][ T3282] unpoison_memory+0x4f0/0x5b0 [ 113.961678][ T3282] hwpoison_unpoison+0x30/0x48 [hwpoison_inject] [ 113.962436][ T3282] simple_attr_write_xsigned.isra.34+0xec/0x1cc [ 113.963183][ T3282] simple_attr_write+0x38/0x48 [ 113.963750][ T3282] debugfs_attr_write+0x54/0x80 [ 113.964330][ T3282] full_proxy_write+0x68/0x98 [ 113.964880][ T3282] vfs_write+0xdc/0x4d0 [ 113.965372][ T3282] ksys_write+0x78/0x100 [ 113.965875][ T3282] __arm64_sys_write+0x24/0x30 [ 113.966440][ T3282] invoke_syscall+0x7c/0x104 [ 113.966984][ T3282] el0_svc_common.constprop.1+0x88/0x104 [ 113.967652][ T3282] do_el0_svc+0x2c/0x38 [ 113.968893][ T3282] el0_svc+0x3c/0x1b8 [ 113.969379][ T3282] el0t_64_sync_handler+0x98/0xbc [ 113.969980][ T3282] el0t_64_sync+0x19c/0x1a0 [ 113.970511][ T3282] ---[ end trace 0000000000000000 ]--- To fix this, clear the page tag reference after the page got isolated and accounted for. 
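Conceptually the fix makes releasing the allocation-tag accounting idempotent: the reference is cleared at the point where the accounting is dropped, so the later free from unpoison_memory() finds nothing left to subtract. A toy, self-contained model of that idea (all names are invented, not the kernel's):

#include <assert.h>
#include <stddef.h>

struct tag  { long bytes; };			/* stands in for the codetag  */
struct page { struct tag *ref; long size; };	/* ref stands in for obj_exts */

static void drop_accounting(struct page *p)
{
	if (!p->ref)			/* reference already cleared: no-op  */
		return;
	p->ref->bytes -= p->size;
	p->ref = NULL;			/* the fix: clear the reference right
					 * where the accounting is dropped   */
}

int main(void)
{
	struct tag t  = { .bytes = 4096 };
	struct page p = { .ref = &t, .size = 4096 };

	drop_accounting(&p);	/* free_pages_prepare() isolating the page  */
	drop_accounting(&p);	/* later release via unpoison_memory()      */

	assert(t.bytes == 0);	/* without the clear, this toy counter would
				 * be decremented twice - the double
				 * accounting the warning above reports     */
	return 0;
}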
Link: https://lkml.kernel.org/r/20240825163649.33294-1-hao.ge@linux.dev Fixes: d224eb0287fb ("codetag: debug: mark codetags for reserved pages as empty") Signed-off-by: Hao Ge Reviewed-by: Miaohe Lin Acked-by: Suren Baghdasaryan Cc: David Hildenbrand Cc: Hao Ge Cc: Kent Overstreet Cc: Naoya Horiguchi Cc: Pasha Tatashin Cc: [6.10+] Signed-off-by: Andrew Morton --- mm/page_alloc.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c565de8f48e9d..91ace8ca97e21 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1054,6 +1054,13 @@ __always_inline bool free_pages_prepare(struct page *page, reset_page_owner(page, order); page_table_check_free(page, order); pgalloc_tag_sub(page, 1 << order); + + /* + * The page is isolated and accounted for. + * Mark the codetag as empty to avoid accounting error + * when the page is freed by unpoison_memory(). + */ + clear_page_tag_ref(page); return false; } From 4f295229b279145bdc667c58f62e89f5968e12fb Mon Sep 17 00:00:00 2001 From: Jan Kuliga Date: Fri, 30 Aug 2024 11:56:58 +0200 Subject: [PATCH 15/17] mailmap: update entry for Jan Kuliga Soon I won't be able to use my current email address. Link: https://lkml.kernel.org/r/20240830095658.1203198-1-jankul@alatek.krakow.pl Signed-off-by: Jan Kuliga Cc: David S. Miller Cc: Matthieu Baerts (NGI0) Signed-off-by: Andrew Morton --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index caf46a652f154..a7969e2d590a5 100644 --- a/.mailmap +++ b/.mailmap @@ -269,6 +269,7 @@ James Ketrenos Jan Glauber Jan Glauber Jan Glauber +Jan Kuliga Jarkko Sakkinen Jarkko Sakkinen Jarkko Sakkinen From 409faf8c97d5abb0597ea43e99c8b3dd8dbe99e3 Mon Sep 17 00:00:00 2001 From: Adrian Huang Date: Thu, 29 Aug 2024 21:06:33 +0800 Subject: [PATCH 16/17] mm: vmalloc: optimize vmap_lazy_nr arithmetic when purging each vmap_area When running the vmalloc stress on a 448-core system, observe the average latency of purge_vmap_node() is about 2 seconds by using the eBPF/bcc 'funclatency.py' tool [1]. # /your-git-repo/bcc/tools/funclatency.py -u purge_vmap_node & pid1=$! && sleep 8 && modprobe test_vmalloc nr_threads=$(nproc) run_test_mask=0x7; kill -SIGINT $pid1 usecs : count distribution 0 -> 1 : 0 | | 2 -> 3 : 29 | | 4 -> 7 : 19 | | 8 -> 15 : 56 | | 16 -> 31 : 483 |**** | 32 -> 63 : 1548 |************ | 64 -> 127 : 2634 |********************* | 128 -> 255 : 2535 |********************* | 256 -> 511 : 1776 |************** | 512 -> 1023 : 1015 |******** | 1024 -> 2047 : 573 |**** | 2048 -> 4095 : 488 |**** | 4096 -> 8191 : 1091 |********* | 8192 -> 16383 : 3078 |************************* | 16384 -> 32767 : 4821 |****************************************| 32768 -> 65535 : 3318 |*************************** | 65536 -> 131071 : 1718 |************** | 131072 -> 262143 : 2220 |****************** | 262144 -> 524287 : 1147 |********* | 524288 -> 1048575 : 1179 |********* | 1048576 -> 2097151 : 822 |****** | 2097152 -> 4194303 : 906 |******* | 4194304 -> 8388607 : 2148 |***************** | 8388608 -> 16777215 : 4497 |************************************* | 16777216 -> 33554431 : 289 |** | avg = 2041714 usecs, total: 78381401772 usecs, count: 38390 The worst case is over 16-33 seconds, so soft lockup is triggered [2]. [Root Cause] 1) Each purge_list has the long list. The following shows the number of vmap_area is purged. crash> p vmap_nodes vmap_nodes = $27 = (struct vmap_node *) 0xff2de5a900100000 crash> vmap_node 0xff2de5a900100000 128 | grep nr_purged nr_purged = 663070 ... 
nr_purged = 821670 nr_purged = 692214 nr_purged = 726808 ... 2) atomic_long_sub() employs the 'lock' prefix to ensure the atomic operation when purging each vmap_area. However, the iteration is over 600000 vmap_area (See 'nr_purged' above). Here is objdump output: $ objdump -D vmlinux ffffffff813e8c80 : ... ffffffff813e8d70: f0 48 29 2d 68 0c bb lock sub %rbp,0x2bb0c68(%rip) ... Quote from "Instruction tables" pdf file [3]: Instructions with a LOCK prefix have a long latency that depends on cache organization and possibly RAM speed. If there are multiple processors or cores or direct memory access (DMA) devices, then all locked instructions will lock a cache line for exclusive access, which may involve RAM access. A LOCK prefix typically costs more than a hundred clock cycles, even on single-processor systems. That's why the latency of purge_vmap_node() dramatically increases on a many-core system: One core is busy on purging each vmap_area of the *long* purge_list and executing atomic_long_sub() for each vmap_area, while other cores free vmalloc allocations and execute atomic_long_add_return() in free_vmap_area_noflush(). [Solution] Employ a local variable to record the total purged pages, and execute atomic_long_sub() after the traversal of the purge_list is done. The experiment result shows the latency improvement is 99%. [Experiment Result] 1) System Configuration: Three servers (with HT-enabled) are tested. * 72-core server: 3rd Gen Intel Xeon Scalable Processor*1 * 192-core server: 5th Gen Intel Xeon Scalable Processor*2 * 448-core server: AMD Zen 4 Processor*2 2) Kernel Config * CONFIG_KASAN is disabled 3) The data in column "w/o patch" and "w/ patch" * Unit: micro seconds (us) * Each data is the average of 3-time measurements System w/o patch (us) w/ patch (us) Improvement (%) --------------- -------------- ------------- ------------- 72-core server 2194 14 99.36% 192-core server 143799 1139 99.21% 448-core server 1992122 6883 99.65% [1] https://github.com/iovisor/bcc/blob/master/tools/funclatency.py [2] https://gist.github.com/AdrianHuang/37c15f67b45407b83c2d32f918656c12 [3] https://www.agner.org/optimize/instruction_tables.pdf Link: https://lkml.kernel.org/r/20240829130633.2184-1-ahuang12@lenovo.com Signed-off-by: Adrian Huang Reviewed-by: Uladzislau Rezki (Sony) Cc: Christoph Hellwig Cc: Signed-off-by: Andrew Morton --- mm/vmalloc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ac53d46ac8a5c..a0df1e2e155a8 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2191,6 +2191,7 @@ static void purge_vmap_node(struct work_struct *work) { struct vmap_node *vn = container_of(work, struct vmap_node, purge_work); + unsigned long nr_purged_pages = 0; struct vmap_area *va, *n_va; LIST_HEAD(local_list); @@ -2208,7 +2209,7 @@ static void purge_vmap_node(struct work_struct *work) kasan_release_vmalloc(orig_start, orig_end, va->va_start, va->va_end); - atomic_long_sub(nr, &vmap_lazy_nr); + nr_purged_pages += nr; vn->nr_purged++; if (is_vn_id_valid(vn_id) && !vn->skip_populate) @@ -2219,6 +2220,8 @@ static void purge_vmap_node(struct work_struct *work) list_add(&va->list, &local_list); } + atomic_long_sub(nr_purged_pages, &vmap_lazy_nr); + reclaim_list_global(&local_list); } From 052a45c1cb1b32f05dd63a295d65496d8b403283 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Wed, 28 Aug 2024 16:15:36 -0700 Subject: [PATCH 17/17] alloc_tag: fix allocation tag reporting when CONFIG_MODULES=n codetag_module_init() is used to initialize sections 
containing allocation tags. This function is used to initialize module sections as well as core kernel sections, in which case the module parameter is set to NULL. This function has to be called even when CONFIG_MODULES=n to initialize core kernel allocation tag sections. When CONFIG_MODULES=n, this function is a NOP, which is wrong. This leads to /proc/allocinfo reported as empty. Fix this by making it independent of CONFIG_MODULES. Link: https://lkml.kernel.org/r/20240828231536.1770519-1-surenb@google.com Fixes: 916cc5167cc6 ("lib: code tagging framework") Signed-off-by: Suren Baghdasaryan Cc: David Hildenbrand Cc: Kees Cook Cc: Kent Overstreet Cc: Pasha Tatashin Cc: Sourav Panda Cc: Vlastimil Babka Cc: [6.10+] Signed-off-by: Andrew Morton --- lib/codetag.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/lib/codetag.c b/lib/codetag.c index 5ace625f2328f..afa8a2d4f3173 100644 --- a/lib/codetag.c +++ b/lib/codetag.c @@ -125,7 +125,6 @@ static inline size_t range_size(const struct codetag_type *cttype, cttype->desc.tag_size; } -#ifdef CONFIG_MODULES static void *get_symbol(struct module *mod, const char *prefix, const char *name) { DECLARE_SEQ_BUF(sb, KSYM_NAME_LEN); @@ -155,6 +154,15 @@ static struct codetag_range get_section_range(struct module *mod, }; } +static const char *get_mod_name(__maybe_unused struct module *mod) +{ +#ifdef CONFIG_MODULES + if (mod) + return mod->name; +#endif + return "(built-in)"; +} + static int codetag_module_init(struct codetag_type *cttype, struct module *mod) { struct codetag_range range; @@ -164,8 +172,7 @@ static int codetag_module_init(struct codetag_type *cttype, struct module *mod) range = get_section_range(mod, cttype->desc.section); if (!range.start || !range.stop) { pr_warn("Failed to load code tags of type %s from the module %s\n", - cttype->desc.section, - mod ? mod->name : "(built-in)"); + cttype->desc.section, get_mod_name(mod)); return -EINVAL; } @@ -199,6 +206,7 @@ static int codetag_module_init(struct codetag_type *cttype, struct module *mod) return 0; } +#ifdef CONFIG_MODULES void codetag_load_module(struct module *mod) { struct codetag_type *cttype; @@ -248,9 +256,6 @@ bool codetag_unload_module(struct module *mod) return unload_ok; } - -#else /* CONFIG_MODULES */ -static int codetag_module_init(struct codetag_type *cttype, struct module *mod) { return 0; } #endif /* CONFIG_MODULES */ struct codetag_type *