From 722eef348744d5badd0b29b518ba045d151375e9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 24 Aug 2021 09:58:51 +1000 Subject: [PATCH 001/365] mm/filemap.c: remove bogus VM_BUG_ON It is not safe to check page->index without holding the page lock. It can be changed if the page is moved between the swap cache and the page cache for a shmem file, for example. There is a VM_BUG_ON below which checks page->index is correct after taking the page lock. Link: https://lkml.kernel.org/r/20210818144932.940640-1-willy@infradead.org Fixes: 5c211ba29deb ("mm: add and use find_lock_entries") Signed-off-by: Matthew Wilcox (Oracle) Reported-by: Cc: Hugh Dickins Cc: Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/filemap.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index d1458ecf2f51e..34de0b14aaa90 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2038,7 +2038,6 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t start, if (!xa_is_value(page)) { if (page->index < start) goto put; - VM_BUG_ON_PAGE(page->index != xas.xa_index, page); if (page->index + thp_nr_pages(page) - 1 > end) goto put; if (!trylock_page(page)) From 688431b762a6fe965a8fcdf5be4a25b189e12b6b Mon Sep 17 00:00:00 2001 From: Toshiki Fukasawa Date: Tue, 24 Aug 2021 09:58:52 +1000 Subject: [PATCH 002/365] /proc/kpageflags: prevent an integer overflow in stable_page_flags() stable_page_flags() returns kpageflags info in u64, but it uses "1 << KPF_*" internally which is considered as int. This type mismatch causes no visible problem now, but it will if you set bit 32 or more as done in a subsequent patch. So use BIT_ULL in order to avoid future overflow issues. Link: http://lkml.kernel.org/r/20190725023100.31141-2-t-fukasawa@vx.jp.nec.com Signed-off-by: Toshiki Fukasawa Cc: Michal Hocko Cc: Dan Williams Cc: Alexey Dobriyan Cc: Christoph Hellwig Cc: Naoya Horiguchi Cc: Junichi Nomura Cc: Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/proc/page.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/fs/proc/page.c b/fs/proc/page.c index 9f1077d94cde1..265f4fca15e29 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -115,7 +115,7 @@ u64 stable_page_flags(struct page *page) * it differentiates a memory hole from a page with no flags */ if (!page) - return 1 << KPF_NOPAGE; + return BIT_ULL(KPF_NOPAGE); k = page->flags; u = 0; @@ -127,22 +127,22 @@ u64 stable_page_flags(struct page *page) * simple test in page_mapped() is not enough. */ if (!PageSlab(page) && page_mapped(page)) - u |= 1 << KPF_MMAP; + u |= BIT_ULL(KPF_MMAP); if (PageAnon(page)) - u |= 1 << KPF_ANON; + u |= BIT_ULL(KPF_ANON); if (PageKsm(page)) - u |= 1 << KPF_KSM; + u |= BIT_ULL(KPF_KSM); /* * compound pages: export both head/tail info * they together define a compound page's start/end pos and order */ if (PageHead(page)) - u |= 1 << KPF_COMPOUND_HEAD; + u |= BIT_ULL(KPF_COMPOUND_HEAD); if (PageTail(page)) - u |= 1 << KPF_COMPOUND_TAIL; + u |= BIT_ULL(KPF_COMPOUND_TAIL); if (PageHuge(page)) - u |= 1 << KPF_HUGE; + u |= BIT_ULL(KPF_HUGE); /* * PageTransCompound can be true for non-huge compound pages (slab * pages or pages allocated by drivers with __GFP_COMP) because it @@ -153,14 +153,13 @@ u64 stable_page_flags(struct page *page) struct page *head = compound_head(page); if (PageLRU(head) || PageAnon(head)) - u |= 1 << KPF_THP; + u |= BIT_ULL(KPF_THP); else if (is_huge_zero_page(head)) { - u |= 1 << KPF_ZERO_PAGE; - u |= 1 << KPF_THP; + u |= BIT_ULL(KPF_ZERO_PAGE); + u |= BIT_ULL(KPF_THP); } } else if (is_zero_pfn(page_to_pfn(page))) - u |= 1 << KPF_ZERO_PAGE; - + u |= BIT_ULL(KPF_ZERO_PAGE); /* * Caveats on high order pages: page->_refcount will only be set @@ -168,23 +167,23 @@ u64 stable_page_flags(struct page *page) * SLOB won't set PG_slab at all on compound pages. */ if (PageBuddy(page)) - u |= 1 << KPF_BUDDY; + u |= BIT_ULL(KPF_BUDDY); else if (page_count(page) == 0 && is_free_buddy_page(page)) - u |= 1 << KPF_BUDDY; + u |= BIT_ULL(KPF_BUDDY); if (PageOffline(page)) - u |= 1 << KPF_OFFLINE; + u |= BIT_ULL(KPF_OFFLINE); if (PageTable(page)) - u |= 1 << KPF_PGTABLE; + u |= BIT_ULL(KPF_PGTABLE); if (page_is_idle(page)) - u |= 1 << KPF_IDLE; + u |= BIT_ULL(KPF_IDLE); u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); if (PageTail(page) && PageSlab(compound_head(page))) - u |= 1 << KPF_SLAB; + u |= BIT_ULL(KPF_SLAB); u |= kpf_copy_bit(k, KPF_ERROR, PG_error); u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty); @@ -197,7 +196,7 @@ u64 stable_page_flags(struct page *page) u |= kpf_copy_bit(k, KPF_RECLAIM, PG_reclaim); if (PageSwapCache(page)) - u |= 1 << KPF_SWAPCACHE; + u |= BIT_ULL(KPF_SWAPCACHE); u |= kpf_copy_bit(k, KPF_SWAPBACKED, PG_swapbacked); u |= kpf_copy_bit(k, KPF_UNEVICTABLE, PG_unevictable); From 2e36383adbf563a7b943fc093b44e05c060e86cc Mon Sep 17 00:00:00 2001 From: Toshiki Fukasawa Date: Tue, 24 Aug 2021 09:58:52 +1000 Subject: [PATCH 003/365] /proc/kpageflags: do not use uninitialized struct pages A kernel panic was observed during reading /proc/kpageflags for first few pfns allocated by pmem namespace: BUG: unable to handle page fault for address: fffffffffffffffe [ 114.495280] #PF: supervisor read access in kernel mode [ 114.495738] #PF: error_code(0x0000) - not-present page [ 114.496203] PGD 17120e067 P4D 17120e067 PUD 171210067 PMD 0 [ 114.496713] Oops: 0000 [#1] SMP PTI [ 114.497037] CPU: 9 PID: 1202 Comm: page-types Not tainted 5.3.0-rc1 #1 [ 114.497621] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.0-0-g63451fca13-prebuilt.qemu-project.org 04/01/2014 [ 114.498706] RIP: 0010:stable_page_flags+0x27/0x3f0 [ 114.499142] Code: 82 66 90 66 66 66 66 90 48 85 ff 0f 84 d1 03 00 00 41 54 55 48 89 fd 53 48 8b 57 08 48 8b 1f 48 8d 42 ff 83 e2 01 48 0f 44 c7 <48> 8b 00 f6 c4 02 0f 84 57 03 00 00 45 31 e4 48 8b 55 08 48 89 ef [ 114.500788] RSP: 0018:ffffa5e601a0fe60 EFLAGS: 00010202 [ 114.501373] RAX: fffffffffffffffe RBX: ffffffffffffffff RCX: 0000000000000000 [ 114.502009] RDX: 0000000000000001 RSI: 00007ffca13a7310 RDI: ffffd07489000000 [ 114.502637] RBP: ffffd07489000000 R08: 0000000000000001 R09: 0000000000000000 [ 114.503270] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000240000 [ 114.503896] R13: 0000000000080000 R14: 00007ffca13a7310 R15: ffffa5e601a0ff08 [ 114.504530] FS: 00007f0266c7f540(0000) GS:ffff962dbbac0000(0000) knlGS:0000000000000000 [ 114.505245] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 114.505754] CR2: fffffffffffffffe CR3: 000000023a204000 CR4: 00000000000006e0 [ 114.506401] Call Trace: [ 114.506660] kpageflags_read+0xb1/0x130 [ 114.507051] proc_reg_read+0x39/0x60 [ 114.507387] vfs_read+0x8a/0x140 [ 114.507686] ksys_pread64+0x61/0xa0 [ 114.508021] do_syscall_64+0x5f/0x1a0 [ 114.508372] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 114.508844] RIP: 0033:0x7f0266ba426b The reason for the panic is that stable_page_flags() which parses the page flags uses uninitialized struct pages reserved by the ZONE_DEVICE driver. Earlier approach to fix this was discussed here: https://marc.info/?l=linux-mm&m=152964770000672&w=2 This is another approach. To avoid using the uninitialized struct page, immediately return with KPF_RESERVED at the beginning of stable_page_flags() if the page is reserved by ZONE_DEVICE driver. Dan said: : The nvdimm implementation uses vmem_altmap to arrange for the 'struct : page' array to be allocated from a reservation of a pmem namespace. A : namespace in this mode contains an info-block that consumes the first : 8K of the namespace capacity, capacity designated for page mapping, : capacity for padding the start of data to optionally 4K, 2MB, or 1GB : (on x86), and then the namespace data itself. The implementation : specifies a section aligned (now sub-section aligned) address to : arch_add_memory() to establish the linear mapping to map the metadata, : and then vmem_altmap indicates to memmap_init_zone() which pfns : represent data. The implementation only specifies enough 'struct page' : capacity for pfn_to_page() to operate on the data space, not the : namespace metadata space. : : The proposal to validate ZONE_DEVICE pfns against the altmap seems the : right approach to me. Link: http://lkml.kernel.org/r/20190725023100.31141-3-t-fukasawa@vx.jp.nec.com Signed-off-by: Toshiki Fukasawa Cc: Alexey Dobriyan Cc: Christoph Hellwig Cc: Dan Williams Cc: Junichi Nomura Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/proc/page.c | 3 +++ include/linux/memremap.h | 6 ++++++ mm/memremap.c | 20 ++++++++++++++++++++ 3 files changed, 29 insertions(+) diff --git a/fs/proc/page.c b/fs/proc/page.c index 265f4fca15e29..4dcbcd506cb6e 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -117,6 +117,9 @@ u64 stable_page_flags(struct page *page) if (!page) return BIT_ULL(KPF_NOPAGE); + if (pfn_zone_device_reserved(page_to_pfn(page))) + return BIT_ULL(KPF_RESERVED); + k = page->flags; u = 0; diff --git a/include/linux/memremap.h b/include/linux/memremap.h index c0e9d35889e8d..119f130ef8f10 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -131,6 +131,7 @@ static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap) } #ifdef CONFIG_ZONE_DEVICE +bool pfn_zone_device_reserved(unsigned long pfn); void *memremap_pages(struct dev_pagemap *pgmap, int nid); void memunmap_pages(struct dev_pagemap *pgmap); void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap); @@ -143,6 +144,11 @@ unsigned long vmem_altmap_offset(struct vmem_altmap *altmap); void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns); unsigned long memremap_compat_align(void); #else +static inline bool pfn_zone_device_reserved(unsigned long pfn) +{ + return false; +} + static inline void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) { diff --git a/mm/memremap.c b/mm/memremap.c index 15a074ffb8d73..805d761740c42 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -109,6 +109,26 @@ static unsigned long pfn_next(unsigned long pfn) return pfn + 1; } +/* + * This returns true if the page is reserved by ZONE_DEVICE driver. + */ +bool pfn_zone_device_reserved(unsigned long pfn) +{ + struct dev_pagemap *pgmap; + struct vmem_altmap *altmap; + bool ret = false; + + pgmap = get_dev_pagemap(pfn, NULL); + if (!pgmap) + return ret; + altmap = pgmap_altmap(pgmap); + if (altmap && pfn < (altmap->base_pfn + altmap->reserve)) + ret = true; + put_dev_pagemap(pgmap); + + return ret; +} + #define for_each_device_pfn(pfn, map, i) \ for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); pfn = pfn_next(pfn)) From 0a5b0e0c4efea3cbd71594b5e259efeeec306452 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Tue, 24 Aug 2021 09:58:52 +1000 Subject: [PATCH 004/365] procfs: prevent unpriveleged processes accessing fdinfo dir MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The file permissions on the fdinfo dir from were changed from S_IRUSR|S_IXUSR to S_IRUGO|S_IXUGO, and a PTRACE_MODE_READ check was added for opening the fdinfo files [1]. However, the ptrace permission check was not added to the directory, allowing anyone to get the open FD numbers by reading the fdinfo directory. Add the missing ptrace permission check for opening the fdinfo directory. [1] https://lkml.kernel.org/r/20210308170651.919148-1-kaleshsingh@google.com Link: https://lkml.kernel.org/r/20210713162008.1056986-1-kaleshsingh@google.com Fixes: 7bc3fa0172a4 ("procfs: allow reading fdinfo with PTRACE_MODE_READ") Signed-off-by: Kalesh Singh Cc: Kees Cook Cc: Eric W. Biederman Cc: Christian Brauner Cc: Christian König Cc: Suren Baghdasaryan Cc: Hridya Valsaraju Cc: Jann Horn Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/proc/fd.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 172c86270b312..913bef0d2a36c 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -72,7 +72,7 @@ static int seq_show(struct seq_file *m, void *v) return 0; } -static int seq_fdinfo_open(struct inode *inode, struct file *file) +static int proc_fdinfo_access_allowed(struct inode *inode) { bool allowed = false; struct task_struct *task = get_proc_task(inode); @@ -86,6 +86,16 @@ static int seq_fdinfo_open(struct inode *inode, struct file *file) if (!allowed) return -EACCES; + return 0; +} + +static int seq_fdinfo_open(struct inode *inode, struct file *file) +{ + int ret = proc_fdinfo_access_allowed(inode); + + if (ret) + return ret; + return single_open(file, seq_show, inode); } @@ -348,12 +358,23 @@ static int proc_readfdinfo(struct file *file, struct dir_context *ctx) proc_fdinfo_instantiate); } +static int proc_open_fdinfo(struct inode *inode, struct file *file) +{ + int ret = proc_fdinfo_access_allowed(inode); + + if (ret) + return ret; + + return 0; +} + const struct inode_operations proc_fdinfo_inode_operations = { .lookup = proc_lookupfdinfo, .setattr = proc_setattr, }; const struct file_operations proc_fdinfo_operations = { + .open = proc_open_fdinfo, .read = generic_read_dir, .iterate_shared = proc_readfdinfo, .llseek = generic_file_llseek, From 1849f90e238322c8beef058aecb03c7a8c45ba23 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 24 Aug 2021 09:58:52 +1000 Subject: [PATCH 005/365] ia64: fix typo in a comment s/when when/when/ Link: https://lkml.kernel.org/r/20210817112500.12848-1-wangborong@cdjrlc.com Signed-off-by: Jason Wang Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/ia64/kernel/acpi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index e2af6b172200e..96d13cb7c19f0 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -906,6 +906,6 @@ EXPORT_SYMBOL(acpi_unregister_ioapic); /* * acpi_suspend_lowlevel() - save kernel state and suspend. * - * TBD when when IA64 starts to support suspend... + * TBD when IA64 starts to support suspend... */ int acpi_suspend_lowlevel(void) { return 0; } From f143caa0a38852a1e41979f17c4d9a4159b4656e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 24 Aug 2021 09:58:52 +1000 Subject: [PATCH 006/365] ocfs2: remove an unnecessary condition The case where "tmp_oh" is NULL is handled at the start of the function. At this point we know it's non-NULL so this will always return 1. Link: https://lkml.kernel.org/r/YOcItgIXtisi3MaO@mwanda Signed-off-by: Dan Carpenter Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Cc: Larry Chen Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/ocfs2/dlmglue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 48fd369c29a4b..33fbdc823278c 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2721,7 +2721,7 @@ int ocfs2_inode_lock_tracker(struct inode *inode, return status; } } - return tmp_oh ? 1 : 0; + return 1; } void ocfs2_inode_unlock_tracker(struct inode *inode, From 009aa8da5b424d673013304659098f72797960ad Mon Sep 17 00:00:00 2001 From: Tuo Li Date: Tue, 24 Aug 2021 09:58:53 +1000 Subject: [PATCH 007/365] ocfs2: quota_local: fix possible uninitialized-variable access in ocfs2_local_read_info() A memory block is allocated through kmalloc(), and its return value is assigned to the pointer oinfo. However, oinfo->dqi_gqinode is not initialized but it is accessed in: iput(oinfo->dqi_gqinode); To fix this possible uninitialized-variable access, assign NULL to oinfo->dqi_gqinode, and add ocfs2_qinfo_lock_res_init() behind the assignment in ocfs2_local_read_info(). Remove ocfs2_qinfo_lock_res_init() in ocfs2_global_read_info(). Link: https://lkml.kernel.org/r/20210804031832.57154-1-islituo@gmail.com Signed-off-by: Tuo Li Reported-by: TOTE Robot Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/ocfs2/quota_global.c | 1 - fs/ocfs2/quota_local.c | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index eda83487c9ec7..f033de733adb3 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -357,7 +357,6 @@ int ocfs2_global_read_info(struct super_block *sb, int type) } oinfo->dqi_gi.dqi_sb = sb; oinfo->dqi_gi.dqi_type = type; - ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo); oinfo->dqi_gi.dqi_entry_size = sizeof(struct ocfs2_global_disk_dqblk); oinfo->dqi_gi.dqi_ops = &ocfs2_global_ops; oinfo->dqi_gqi_bh = NULL; diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index b1a8b046f4c22..0e4b16d4c037f 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -702,6 +702,8 @@ static int ocfs2_local_read_info(struct super_block *sb, int type) info->dqi_priv = oinfo; oinfo->dqi_type = type; INIT_LIST_HEAD(&oinfo->dqi_chunk); + oinfo->dqi_gqinode = NULL; + ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo); oinfo->dqi_rec = NULL; oinfo->dqi_lqi_bh = NULL; oinfo->dqi_libh = NULL; From febd5117e5ae79c4c24b0bc163d82d4b2ba881af Mon Sep 17 00:00:00 2001 From: Gang He Date: Tue, 24 Aug 2021 09:58:53 +1000 Subject: [PATCH 008/365] ocfs2: reflink deadlock when clone file to the same directory simultaneously Running reflink from multiple nodes simultaneously to clone a file to the same directory probably triggers a deadlock issue. For example, there is a three node ocfs2 cluster, each node mounts the ocfs2 file system to /mnt/shared, and run the reflink command from each node repeatedly, like reflink "/mnt/shared/test" \ "/mnt/shared/.snapshots/test.`date +%m%d%H%M%S`.`hostname`" then, reflink command process will be hung on each node, and you can't list this file system directory. The problematic reflink command process is blocked at one node, task:reflink state:D stack: 0 pid: 1283 ppid: 4154 Call Trace: __schedule+0x2fd/0x750 schedule+0x2f/0xa0 schedule_timeout+0x1cc/0x310 ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user] ? 0xffffffffc0e3e000 wait_for_completion+0xba/0x140 ? wake_up_q+0xa0/0xa0 __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2] ? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2] ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2] ocfs2_init_security_and_acl+0xbe/0x1d0 [ocfs2] ocfs2_reflink+0x436/0x4c0 [ocfs2] ? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2] ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2] ocfs2_ioctl+0x25e/0x670 [ocfs2] do_vfs_ioctl+0xa0/0x680 ksys_ioctl+0x70/0x80 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x5b/0x1e0 The other reflink command processes are blocked at other nodes, task:reflink state:D stack: 0 pid:29759 ppid: 4088 Call Trace: __schedule+0x2fd/0x750 schedule+0x2f/0xa0 schedule_timeout+0x1cc/0x310 ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user] ? 0xffffffffc0b19000 wait_for_completion+0xba/0x140 ? wake_up_q+0xa0/0xa0 __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2] ? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2] ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2] ocfs2_mv_orphaned_inode_to_new+0x87/0x7e0 [ocfs2] ocfs2_reflink+0x335/0x4c0 [ocfs2] ? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2] ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2] ocfs2_ioctl+0x25e/0x670 [ocfs2] do_vfs_ioctl+0xa0/0x680 ksys_ioctl+0x70/0x80 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x5b/0x1e0 or task:reflink state:D stack: 0 pid:18465 ppid: 4156 Call Trace: __schedule+0x302/0x940 ? usleep_range+0x80/0x80 schedule+0x46/0xb0 schedule_timeout+0xff/0x140 ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user] ? 0xffffffffc0c3b000 __wait_for_common+0xb9/0x170 __ocfs2_cluster_lock.constprop.0+0x1d6/0x860 [ocfs2] ? ocfs2_wait_for_recovery+0x49/0xd0 [ocfs2] ? ocfs2_inode_lock_full_nested+0x30f/0xa50 [ocfs2] ocfs2_inode_lock_full_nested+0x30f/0xa50 [ocfs2] ocfs2_inode_lock_tracker+0xf2/0x2b0 [ocfs2] ? dput+0x32/0x2f0 ocfs2_permission+0x45/0xe0 [ocfs2] inode_permission+0xcc/0x170 link_path_walk.part.0.constprop.0+0x2a2/0x380 ? path_init+0x2c1/0x3f0 path_parentat+0x3c/0x90 filename_parentat+0xc1/0x1d0 ? filename_lookup+0x138/0x1c0 filename_create+0x43/0x160 ocfs2_reflink_ioctl+0xe6/0x380 [ocfs2] ocfs2_ioctl+0x1ea/0x2c0 [ocfs2] ? do_sys_openat2+0x81/0x150 __x64_sys_ioctl+0x82/0xb0 do_syscall_64+0x61/0xb0 The deadlock is caused by multiple acquiring the destination directory inode dlm lock in ocfs2_reflink function, we should acquire this directory inode dlm lock at the beginning, and hold this dlm lock until end of the function. Link: https://lkml.kernel.org/r/20210729110230.18983-1-ghe@suse.com Signed-off-by: Gang He Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/ocfs2/namei.c | 32 +++++++++++++------------------- fs/ocfs2/namei.h | 2 ++ fs/ocfs2/refcounttree.c | 15 +++++++++++---- fs/ocfs2/xattr.c | 12 +----------- fs/ocfs2/xattr.h | 1 + 5 files changed, 28 insertions(+), 34 deletions(-) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 2c46ff6ba4ea2..f8bbb22cc60ba 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -2489,6 +2489,7 @@ static int ocfs2_prep_new_orphaned_file(struct inode *dir, } int ocfs2_create_inode_in_orphan(struct inode *dir, + struct buffer_head **dir_bh, int mode, struct inode **new_inode) { @@ -2597,13 +2598,16 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, brelse(new_di_bh); - if (!status) - *new_inode = inode; - ocfs2_free_dir_lookup_result(&orphan_insert); - ocfs2_inode_unlock(dir, 1); - brelse(parent_di_bh); + if (!status) { + *new_inode = inode; + *dir_bh = parent_di_bh; + } else { + ocfs2_inode_unlock(dir, 1); + brelse(parent_di_bh); + } + return status; } @@ -2760,11 +2764,11 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, } int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, + struct buffer_head *dir_bh, struct inode *inode, struct dentry *dentry) { int status = 0; - struct buffer_head *parent_di_bh = NULL; handle_t *handle = NULL; struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); struct ocfs2_dinode *dir_di, *di; @@ -2778,14 +2782,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, (unsigned long long)OCFS2_I(dir)->ip_blkno, (unsigned long long)OCFS2_I(inode)->ip_blkno); - status = ocfs2_inode_lock(dir, &parent_di_bh, 1); - if (status < 0) { - if (status != -ENOENT) - mlog_errno(status); - return status; - } - - dir_di = (struct ocfs2_dinode *) parent_di_bh->b_data; + dir_di = (struct ocfs2_dinode *) dir_bh->b_data; if (!dir_di->i_links_count) { /* can't make a file in a deleted directory. */ status = -ENOENT; @@ -2798,7 +2795,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, goto leave; /* get a spot inside the dir. */ - status = ocfs2_prepare_dir_for_insert(osb, dir, parent_di_bh, + status = ocfs2_prepare_dir_for_insert(osb, dir, dir_bh, dentry->d_name.name, dentry->d_name.len, &lookup); if (status < 0) { @@ -2862,7 +2859,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, ocfs2_journal_dirty(handle, di_bh); status = ocfs2_add_entry(handle, dentry, inode, - OCFS2_I(inode)->ip_blkno, parent_di_bh, + OCFS2_I(inode)->ip_blkno, dir_bh, &lookup); if (status < 0) { mlog_errno(status); @@ -2886,10 +2883,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, iput(orphan_dir_inode); leave: - ocfs2_inode_unlock(dir, 1); - brelse(di_bh); - brelse(parent_di_bh); brelse(orphan_dir_bh); ocfs2_free_dir_lookup_result(&lookup); diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h index 9cc891eb874e0..03a2c526e2c1b 100644 --- a/fs/ocfs2/namei.h +++ b/fs/ocfs2/namei.h @@ -24,6 +24,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, struct buffer_head *orphan_dir_bh, bool dio); int ocfs2_create_inode_in_orphan(struct inode *dir, + struct buffer_head **dir_bh, int mode, struct inode **new_inode); int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, @@ -32,6 +33,7 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, struct inode *inode, struct buffer_head *di_bh, int update_isize, loff_t end); int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, + struct buffer_head *dir_bh, struct inode *new_inode, struct dentry *new_dentry); diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 7f6355cbb5875..a9a0c7c37e8ed 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4250,7 +4250,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, { int error, had_lock; struct inode *inode = d_inode(old_dentry); - struct buffer_head *old_bh = NULL; + struct buffer_head *old_bh = NULL, *dir_bh = NULL; struct inode *new_orphan_inode = NULL; struct ocfs2_lock_holder oh; @@ -4258,7 +4258,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, return -EOPNOTSUPP; - error = ocfs2_create_inode_in_orphan(dir, inode->i_mode, + error = ocfs2_create_inode_in_orphan(dir, &dir_bh, inode->i_mode, &new_orphan_inode); if (error) { mlog_errno(error); @@ -4304,13 +4304,15 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, /* If the security isn't preserved, we need to re-initialize them. */ if (!preserve) { - error = ocfs2_init_security_and_acl(dir, new_orphan_inode, + error = ocfs2_init_security_and_acl(dir, dir_bh, + new_orphan_inode, &new_dentry->d_name); if (error) mlog_errno(error); } if (!error) { - error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode, + error = ocfs2_mv_orphaned_inode_to_new(dir, dir_bh, + new_orphan_inode, new_dentry); if (error) mlog_errno(error); @@ -4328,6 +4330,11 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, iput(new_orphan_inode); } + if (dir_bh) { + ocfs2_inode_unlock(dir, 1); + brelse(dir_bh); + } + return error; } diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index dd784eb0cd7c4..3f23e3a5018ce 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -7203,16 +7203,13 @@ int ocfs2_reflink_xattrs(struct inode *old_inode, /* * Initialize security and acl for a already created inode. * Used for reflink a non-preserve-security file. - * - * It uses common api like ocfs2_xattr_set, so the caller - * must not hold any lock expect i_mutex. */ int ocfs2_init_security_and_acl(struct inode *dir, + struct buffer_head *dir_bh, struct inode *inode, const struct qstr *qstr) { int ret = 0; - struct buffer_head *dir_bh = NULL; ret = ocfs2_init_security_get(inode, dir, qstr, NULL); if (ret) { @@ -7220,17 +7217,10 @@ int ocfs2_init_security_and_acl(struct inode *dir, goto leave; } - ret = ocfs2_inode_lock(dir, &dir_bh, 0); - if (ret) { - mlog_errno(ret); - goto leave; - } ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL); if (ret) mlog_errno(ret); - ocfs2_inode_unlock(dir, 0); - brelse(dir_bh); leave: return ret; } diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index 00308b57f64f1..b27fd8ba00196 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -83,6 +83,7 @@ int ocfs2_reflink_xattrs(struct inode *old_inode, struct buffer_head *new_bh, bool preserve_security); int ocfs2_init_security_and_acl(struct inode *dir, + struct buffer_head *dir_bh, struct inode *inode, const struct qstr *qstr); #endif /* OCFS2_XATTR_H */ From 146777b1249fd4a91323558be0ba037968d821f7 Mon Sep 17 00:00:00 2001 From: Wangyan Date: Tue, 24 Aug 2021 09:58:53 +1000 Subject: [PATCH 009/365] ocfs2: clear links count in ocfs2_mknod() if an error occurs In this condition, the inode can not be wiped when error happened. ocfs2_mkdir() ->ocfs2_mknod() ->ocfs2_mknod_locked() ->__ocfs2_mknod_locked() ->ocfs2_set_links_count() // i_links_count is 2 -> ... // an error accrue, goto roll_back or leave. ->ocfs2_commit_trans() ->iput(inode) ->evict() ->ocfs2_evict_inode() ->ocfs2_delete_inode() ->ocfs2_inode_lock() ->ocfs2_inode_lock_update() ->ocfs2_refresh_inode() ->set_nlink(); // inode->i_nlink is 2 now. /* if wipe is 0, it will goto bail_unlock_inode */ ->ocfs2_query_inode_wipe() ->if (inode->i_nlink) return; // wipe is 0. /* inode can not be wiped */ ->ocfs2_wipe_inode() So, we need clear links before the transaction committed. Link: http://lkml.kernel.org/r/d8147c41-fb2b-bdf7-b660-1f3c8448c33f@huawei.com Signed-off-by: Yan Wang Reviewed-by: Jun Piao Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/ocfs2/namei.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index f8bbb22cc60ba..a5c975715aa05 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -453,8 +453,12 @@ static int ocfs2_mknod(struct user_namespace *mnt_userns, leave: if (status < 0 && did_quota_inode) dquot_free_inode(inode); - if (handle) + if (handle) { + if (status < 0 && new_fe_bh != NULL) + ocfs2_set_links_count((struct ocfs2_dinode *) + new_fe_bh->b_data, 0); ocfs2_commit_trans(osb, handle); + } ocfs2_inode_unlock(dir, 1); if (did_block_signals) @@ -598,6 +602,8 @@ static int __ocfs2_mknod_locked(struct inode *dir, leave: if (status < 0) { if (*new_fe_bh) { + if (fe) + ocfs2_set_links_count(fe, 0); brelse(*new_fe_bh); *new_fe_bh = NULL; } @@ -2027,8 +2033,12 @@ static int ocfs2_symlink(struct user_namespace *mnt_userns, ocfs2_clusters_to_bytes(osb->sb, 1)); if (status < 0 && did_quota_inode) dquot_free_inode(inode); - if (handle) + if (handle) { + if (status < 0 && new_fe_bh != NULL) + ocfs2_set_links_count((struct ocfs2_dinode *) + new_fe_bh->b_data, 0); ocfs2_commit_trans(osb, handle); + } ocfs2_inode_unlock(dir, 1); if (did_block_signals) From d08957d072cdb3dbb1b4500b0b8438589c5a0a3c Mon Sep 17 00:00:00 2001 From: Wangyan Date: Tue, 24 Aug 2021 09:58:53 +1000 Subject: [PATCH 010/365] ocfs2: fix ocfs2 corrupt when iputting an inode In this condition, it will cause an bug on error. ocfs2_mkdir() ->ocfs2_mknod() ->ocfs2_mknod_locked() ->__ocfs2_mknod_locked() //Assume inode->i_generation is genN. ->inode->i_generation = osb->s_next_generation++; // The inode lockres has been initialized. ->ocfs2_populate_inode() ->ocfs2_create_new_inode_locks() ->An error happened, returned value is non-zero // free the start_bit x in bg_blkno ->ocfs2_free_suballoc_bits() ->... /* Another process execute mkdir success in this place, and it occupied the start_bit x in bg_blkno which has been freed before. Its inode->i_generation is genN + 1 */ ->iput(inode) ->evict() ->ocfs2_evict_inode() ->ocfs2_delete_inode() ->ocfs2_inode_lock() ->ocfs2_inode_lock_update() /* Bug on here, genN != genN + 1 */ ->mlog_bug_on_msg(inode->i_generation != le32_to_cpu(fe->i_generation)) So, we need not to reclaim the inode when the inode->ip_inode_lockres has been initialized. It will be freed in iput(). Link: http://lkml.kernel.org/r/ef080ca3-5d74-e276-17a1-d9e7c7e662c9@huawei.com Fixes: b1529a41f777 ("ocfs2: should reclaim the inode if '__ocfs2_mknod_locked' returns an error") Signed-off-by: Yan Wang Reviewed-by: Jun Piao Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/ocfs2/namei.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index a5c975715aa05..0939186f010e3 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -640,7 +640,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, status = __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh, parent_fe_bh, handle, inode_ac, fe_blkno, suballoc_loc, suballoc_bit); - if (status < 0) { + if (status < 0 && !(OCFS2_I(inode)->ip_inode_lockres.l_flags & + OCFS2_LOCK_INITIALIZED)) { u64 bg_blkno = ocfs2_which_suballoc_group(fe_blkno, suballoc_bit); int tmp = ocfs2_free_suballoc_bits(handle, inode_ac->ac_inode, inode_ac->ac_bh, suballoc_bit, bg_blkno, 1); From 07e83239e15a694fcf9a37bb11573f035a8ec960 Mon Sep 17 00:00:00 2001 From: kernel test robot Date: Tue, 24 Aug 2021 09:58:53 +1000 Subject: [PATCH 011/365] arch/csky/kernel/probes/kprobes.c: fix bugon.cocci warnings Use BUG_ON instead of a if condition followed by BUG. Generated by: scripts/coccinelle/misc/bugon.cocci Link: https://lkml.kernel.org/r/alpine.DEB.2.22.394.2107061049150.7197@hadrien Fixes: 7d37cb2c912d ("lib: fix kconfig dependency on ARCH_WANT_FRAME_POINTERS") Signed-off-by: kernel test robot Signed-off-by: Julia Lawall Reported-by: kernel test robot Cc: Julian Braha Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/csky/kernel/probes/kprobes.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/csky/kernel/probes/kprobes.c b/arch/csky/kernel/probes/kprobes.c index 68b22b499aebf..8fffa34d4e1c5 100644 --- a/arch/csky/kernel/probes/kprobes.c +++ b/arch/csky/kernel/probes/kprobes.c @@ -283,8 +283,7 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, unsigned int trapnr) * normal page fault. */ regs->pc = (unsigned long) cur->addr; - if (!instruction_pointer(regs)) - BUG(); + BUG_ON(!instruction_pointer(regs)); if (kcb->kprobe_status == KPROBE_REENTER) restore_previous_kprobe(kcb); From f41e44d7b5e34039006e6cadf29cb4252c40a43c Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 24 Aug 2021 09:58:54 +1000 Subject: [PATCH 012/365] mm, slub: don't call flush_all() from slab_debug_trace_open() Patch series "SLUB: reduce irq disabled scope and make it RT compatible", v4. This series was initially inspired by Mel's pcplist local_lock rewrite, and also interest to better understand SLUB's locking and the new primitives and RT variants and implications. It should make SLUB more preemption-friendly, especially for RT, hopefully without noticeable regressions, as the fast paths are not affected. The RFC/v1 version got basic performance screening by Mel that didn't show major regressions. Mike's testing with hackbench of v2 on !RT reported negligible differences [6]: virgin(ish) tip 5.13.0.g60ab3ed-tip 7,320.67 msec task-clock # 7.792 CPUs utilized ( +- 0.31% ) 221,215 context-switches # 0.030 M/sec ( +- 3.97% ) 16,234 cpu-migrations # 0.002 M/sec ( +- 4.07% ) 13,233 page-faults # 0.002 M/sec ( +- 0.91% ) 27,592,205,252 cycles # 3.769 GHz ( +- 0.32% ) 8,309,495,040 instructions # 0.30 insn per cycle ( +- 0.37% ) 1,555,210,607 branches # 212.441 M/sec ( +- 0.42% ) 5,484,209 branch-misses # 0.35% of all branches ( +- 2.13% ) 0.93949 +- 0.00423 seconds time elapsed ( +- 0.45% ) 0.94608 +- 0.00384 seconds time elapsed ( +- 0.41% ) (repeat) 0.94422 +- 0.00410 seconds time elapsed ( +- 0.43% ) 5.13.0.g60ab3ed-tip +slub-local-lock-v2r3 7,343.57 msec task-clock # 7.776 CPUs utilized ( +- 0.44% ) 223,044 context-switches # 0.030 M/sec ( +- 3.02% ) 16,057 cpu-migrations # 0.002 M/sec ( +- 4.03% ) 13,164 page-faults # 0.002 M/sec ( +- 0.97% ) 27,684,906,017 cycles # 3.770 GHz ( +- 0.45% ) 8,323,273,871 instructions # 0.30 insn per cycle ( +- 0.28% ) 1,556,106,680 branches # 211.901 M/sec ( +- 0.31% ) 5,463,468 branch-misses # 0.35% of all branches ( +- 1.33% ) 0.94440 +- 0.00352 seconds time elapsed ( +- 0.37% ) 0.94830 +- 0.00228 seconds time elapsed ( +- 0.24% ) (repeat) 0.93813 +- 0.00440 seconds time elapsed ( +- 0.47% ) (repeat) RT configs showed some throughput regressions, but that's expected tradeoff for the preemption improvements through the RT mutex. It didn't prevent the v2 to be incorporated to the 5.13 RT tree [7], leading to testing exposure and bugfixes. Before the series, SLUB is lockless in both allocation and free fast paths, but elsewhere, it's disabling irqs for considerable periods of time - especially in allocation slowpath and the bulk allocation, where IRQs are re-enabled only when a new page from the page allocator is needed, and the context allows blocking. The irq disabled sections can then include deactivate_slab() which walks a full freelist and frees the slab back to page allocator or unfreeze_partials() going through a list of percpu partial slabs. The RT tree currently has some patches mitigating these, but we can do much better in mainline too. Patches 1-6 are straightforward improvements or cleanups that could exist outside of this series too, but are prerequsities. Patches 7-10 are also preparatory code changes without functional changes, but not so useful without the rest of the series. Patch 11 simplifies the fast paths on systems with preemption, based on (hopefully correct) observation that the current loops to verify tid are unnecessary. Patches 12-21 focus on reducing irq disabled scope in the allocation slowpath. Patch 12 moves disabling of irqs into ___slab_alloc() from its callers, which are the allocation slowpath, and bulk allocation. Instead these callers only disable preemption to stabilize the cpu. The following patches then gradually reduce the scope of disabled irqs in ___slab_alloc() and the functions called from there. As of patch 15, the re-enabling of irqs based on gfp flags before calling the page allocator is removed from allocate_slab(). As of patch 18, it's possible to reach the page allocator (in case of existing slabs depleted) without disabling and re-enabling irqs a single time. Pathces 22-27 reduce the scope of disabled irqs in functions related to unfreezing percpu partial slab. Patch 28 is preparatory. Patch 29 is adopted from the RT tree and converts the flushing of percpu slabs on all cpus from using IPI to workqueue, so that the processing isn't happening with irqs disabled in the IPI handler. The flushing is not performance critical so it should be acceptable. Patch 30 also comes from RT tree and makes object_map_lock RT compatible. Patches 31-32 make slab_lock irq-safe on RT where we cannot rely on having irq disabled from the list_lock spin lock usage. Patch 33 changes kmem_cache_cpu->partial handling in put_cpu_partial() from cmpxchg loop to a short irq disabled section, which is used by all other code modifying the field. This addresses a theoretical race scenario pointed out by Jann, and makes the critical section safe wrt with RT local_lock semantics after the conversion in patch 35. Patch 34 changes preempt disable to migrate disable, so that the nested list_lock spinlock is safe to take on RT. Because migrate_disable() is a function call even on !RT, a small set of private wrappers is introduced to keep using the cheaper preempt_disable() on !PREEMPT_RT configurations. As of this patch, SLUB should be compatible with RT's lock semantics, to the best of my knowledge. Finally, patch 35 changes irq disabled sections that protect kmem_cache_cpu fields in the slow paths, with a local lock. However on PREEMPT_RT it means the lockless fast paths can now preempt slow paths which don't expect that, so the local lock has to be taken also in the fast paths and they are no longer lockless. It's up to RT folks to decide if this is a good tradeoff. The patch also updates the locking documentation in the file's comment. The main results of this series: * irq disabling is only done for minimum amount of time needed to protect the kmem_cache_cpu data and as part of spin lock, local lock and bit lock operations to make them irq-safe * SLUB should be fully PREEMPT_RT compatible This should have obvious implications for better preemptibility, especially on RT. Some details are different than how the previous SLUB RT tree patches were implemented: mm: sl[au]b: Change list_lock to raw_spinlock_t [2] - the SLAB part can be dropped as a different patch restricts RT to SLUB anyway. And after this series the list_lock in SLUB is never used with irqs disabled before taking the lock so it doesn't have to be converted to raw_spinlock_t. mm: slub: Move discard_slab() invocations out of IRQ-off sections [3] should be unnecessary as this series does move these invocations outside irq disabled sections in a different way. The remaining patches to upstream from the RT tree are small ones related to KConfig. The patch that restricts PREEMPT_RT to SLUB (not SLAB or SLOB) makes sense. The patch that disables CONFIG_SLUB_CPU_PARTIAL with PREEMPT_RT could perhaps be re-evaluated as the series addresses some latency issues with it. [1] https://lore.kernel.org/lkml/20210524233946.20352-1-vbabka@suse.cz/ [2] https://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git/tree/patches/0001-mm-sl-au-b-Change-list_lock-to-raw_spinlock_t.patch?h=linux-5.12.y-rt-patches [3] https://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git/tree/patches/0004-mm-slub-Move-discard_slab-invocations-out-of-IRQ-off.patch?h=linux-5.12.y-rt-patches [4] https://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git/tree/patches/0005-mm-slub-Move-flush_cpu_slab-invocations-__free_slab-.patch?h=linux-5.12.y-rt-patches [5] https://lore.kernel.org/lkml/20210609113903.1421-1-vbabka@suse.cz/ [6] https://lore.kernel.org/lkml/891dc24e38106f8542f4c72831d52dc1a1863ae8.camel@gmx.de [7] https://lore.kernel.org/linux-rt-users/87tul5p2fa.ffs@nanos.tec.linutronix.de/ [8] https://lore.kernel.org/lkml/20210729132132.19691-1-vbabka@suse.cz/ [9] https://lore.kernel.org/lkml/20210804120522.GD6464@techsingularity.net/ This patch (of 35: slab_debug_trace_open() can only be called on caches with SLAB_STORE_USER flag and as with all slub debugging flags, such caches avoid cpu or percpu partial slabs altogether, so there's nothing to flush. Link: https://lkml.kernel.org/r/20210805152000.12817-1-vbabka@suse.cz Link: https://lkml.kernel.org/r/20210805152000.12817-2-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Christoph Lameter Cc: David Rientjes Cc: Pekka Enberg Cc: Joonsoo Kim Cc: Mike Galbraith Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Cc: Mel Gorman Cc: Jesper Dangaard Brouer Cc: Jann Horn Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/slub.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index f77d8cd79ef7f..f6063ec97a554 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5825,9 +5825,6 @@ static int slab_debug_trace_open(struct inode *inode, struct file *filep) if (!alloc_loc_track(t, PAGE_SIZE / sizeof(struct location), GFP_KERNEL)) return -ENOMEM; - /* Push back cpu slabs */ - flush_all(s); - for_each_kmem_cache_node(s, node, n) { unsigned long flags; struct page *page; From 05afc1c2dc86a66c3a73a623707431f534b17e81 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 24 Aug 2021 09:58:54 +1000 Subject: [PATCH 013/365] mm, slub: allocate private object map for debugfs listings Slub has a static spinlock protected bitmap for marking which objects are on freelist when it wants to list them, for situations where dynamically allocating such map can lead to recursion or locking issues, and on-stack bitmap would be too large. The handlers of debugfs files alloc_traces and free_traces also currently use this shared bitmap, but their syscall context makes it straightforward to allocate a private map before entering locked sections, so switch these processing paths to use a private bitmap. Link: https://lkml.kernel.org/r/20210805152000.12817-3-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Christoph Lameter Acked-by: Mel Gorman Cc: David Rientjes Cc: Jann Horn Cc: Jesper Dangaard Brouer Cc: Joonsoo Kim Cc: Mike Galbraith Cc: Pekka Enberg Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/slub.c | 44 +++++++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index f6063ec97a554..fb603fdf58cbc 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -454,6 +454,18 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)]; static DEFINE_SPINLOCK(object_map_lock); +static void __fill_map(unsigned long *obj_map, struct kmem_cache *s, + struct page *page) +{ + void *addr = page_address(page); + void *p; + + bitmap_zero(obj_map, page->objects); + + for (p = page->freelist; p; p = get_freepointer(s, p)) + set_bit(__obj_to_index(s, addr, p), obj_map); +} + #if IS_ENABLED(CONFIG_KUNIT) static bool slab_add_kunit_errors(void) { @@ -483,17 +495,11 @@ static inline bool slab_add_kunit_errors(void) { return false; } static unsigned long *get_map(struct kmem_cache *s, struct page *page) __acquires(&object_map_lock) { - void *p; - void *addr = page_address(page); - VM_BUG_ON(!irqs_disabled()); spin_lock(&object_map_lock); - bitmap_zero(object_map, page->objects); - - for (p = page->freelist; p; p = get_freepointer(s, p)) - set_bit(__obj_to_index(s, addr, p), object_map); + __fill_map(object_map, s, page); return object_map; } @@ -4879,17 +4885,17 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, } static void process_slab(struct loc_track *t, struct kmem_cache *s, - struct page *page, enum track_item alloc) + struct page *page, enum track_item alloc, + unsigned long *obj_map) { void *addr = page_address(page); void *p; - unsigned long *map; - map = get_map(s, page); + __fill_map(obj_map, s, page); + for_each_object(p, s, addr, page->objects) - if (!test_bit(__obj_to_index(s, addr, p), map)) + if (!test_bit(__obj_to_index(s, addr, p), obj_map)) add_location(t, s, get_track(s, p, alloc)); - put_map(map); } #endif /* CONFIG_DEBUG_FS */ #endif /* CONFIG_SLUB_DEBUG */ @@ -5816,14 +5822,21 @@ static int slab_debug_trace_open(struct inode *inode, struct file *filep) struct loc_track *t = __seq_open_private(filep, &slab_debugfs_sops, sizeof(struct loc_track)); struct kmem_cache *s = file_inode(filep)->i_private; + unsigned long *obj_map; + + obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL); + if (!obj_map) + return -ENOMEM; if (strcmp(filep->f_path.dentry->d_name.name, "alloc_traces") == 0) alloc = TRACK_ALLOC; else alloc = TRACK_FREE; - if (!alloc_loc_track(t, PAGE_SIZE / sizeof(struct location), GFP_KERNEL)) + if (!alloc_loc_track(t, PAGE_SIZE / sizeof(struct location), GFP_KERNEL)) { + bitmap_free(obj_map); return -ENOMEM; + } for_each_kmem_cache_node(s, node, n) { unsigned long flags; @@ -5834,12 +5847,13 @@ static int slab_debug_trace_open(struct inode *inode, struct file *filep) spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry(page, &n->partial, slab_list) - process_slab(t, s, page, alloc); + process_slab(t, s, page, alloc, obj_map); list_for_each_entry(page, &n->full, slab_list) - process_slab(t, s, page, alloc); + process_slab(t, s, page, alloc, obj_map); spin_unlock_irqrestore(&n->list_lock, flags); } + bitmap_free(obj_map); return 0; } From 0b058c7e7169c3690707280bda463cf8a720f2d8 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 24 Aug 2021 09:58:54 +1000 Subject: [PATCH 014/365] mm, slub: allocate private object map for validate_slab_cache() validate_slab_cache() is called either to handle a sysfs write, or from a self-test context. In both situations it's straightforward to preallocate a private object bitmap instead of grabbing the shared static one meant for critical sections, so let's do that. Link: https://lkml.kernel.org/r/20210805152000.12817-4-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Christoph Lameter Acked-by: Mel Gorman Cc: David Rientjes Cc: Jann Horn Cc: Jesper Dangaard Brouer Cc: Joonsoo Kim Cc: Mike Galbraith Cc: Pekka Enberg Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/slub.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index fb603fdf58cbc..4697280130f21 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4679,11 +4679,11 @@ static int count_total(struct page *page) #endif #ifdef CONFIG_SLUB_DEBUG -static void validate_slab(struct kmem_cache *s, struct page *page) +static void validate_slab(struct kmem_cache *s, struct page *page, + unsigned long *obj_map) { void *p; void *addr = page_address(page); - unsigned long *map; slab_lock(page); @@ -4691,21 +4691,20 @@ static void validate_slab(struct kmem_cache *s, struct page *page) goto unlock; /* Now we know that a valid freelist exists */ - map = get_map(s, page); + __fill_map(obj_map, s, page); for_each_object(p, s, addr, page->objects) { - u8 val = test_bit(__obj_to_index(s, addr, p), map) ? + u8 val = test_bit(__obj_to_index(s, addr, p), obj_map) ? SLUB_RED_INACTIVE : SLUB_RED_ACTIVE; if (!check_object(s, page, p, val)) break; } - put_map(map); unlock: slab_unlock(page); } static int validate_slab_node(struct kmem_cache *s, - struct kmem_cache_node *n) + struct kmem_cache_node *n, unsigned long *obj_map) { unsigned long count = 0; struct page *page; @@ -4714,7 +4713,7 @@ static int validate_slab_node(struct kmem_cache *s, spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry(page, &n->partial, slab_list) { - validate_slab(s, page); + validate_slab(s, page, obj_map); count++; } if (count != n->nr_partial) { @@ -4727,7 +4726,7 @@ static int validate_slab_node(struct kmem_cache *s, goto out; list_for_each_entry(page, &n->full, slab_list) { - validate_slab(s, page); + validate_slab(s, page, obj_map); count++; } if (count != atomic_long_read(&n->nr_slabs)) { @@ -4746,10 +4745,17 @@ long validate_slab_cache(struct kmem_cache *s) int node; unsigned long count = 0; struct kmem_cache_node *n; + unsigned long *obj_map; + + obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL); + if (!obj_map) + return -ENOMEM; flush_all(s); for_each_kmem_cache_node(s, node, n) - count += validate_slab_node(s, n); + count += validate_slab_node(s, n, obj_map); + + bitmap_free(obj_map); return count; } From ece120a019809984db4867a97297730b51f3dd93 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 24 Aug 2021 09:58:54 +1000 Subject: [PATCH 015/365] mm, slub: don't disable irq for debug_check_no_locks_freed() In slab_free_hook() we disable irqs around the debug_check_no_locks_freed() call, which is unnecessary, as irqs are already being disabled inside the call. This seems to be leftover from the past where there were more calls inside the irq disabled sections. Remove the irq disable/enable operations. Mel noted: > Looks like it was needed for kmemcheck which went away back in 4.15 Link: https://lkml.kernel.org/r/20210805152000.12817-5-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Cc: Christoph Lameter Cc: David Rientjes Cc: Jann Horn Cc: Jesper Dangaard Brouer Cc: Joonsoo Kim Cc: Mike Galbraith Cc: Pekka Enberg Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/slub.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 4697280130f21..fee093db2bfd7 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1591,20 +1591,8 @@ static __always_inline bool slab_free_hook(struct kmem_cache *s, { kmemleak_free_recursive(x, s->flags); - /* - * Trouble is that we may no longer disable interrupts in the fast path - * So in order to make the debug calls that expect irqs to be - * disabled we need to disable interrupts temporarily. - */ -#ifdef CONFIG_LOCKDEP - { - unsigned long flags; + debug_check_no_locks_freed(x, s->object_size); - local_irq_save(flags); - debug_check_no_locks_freed(x, s->object_size); - local_irq_restore(flags); - } -#endif if (!(s->flags & SLAB_DEBUG_OBJECTS)) debug_check_no_obj_freed(x, s->object_size); From 932dab59a0e9741a03a151d7d03e3eaf674c90df Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 24 Aug 2021 09:58:54 +1000 Subject: [PATCH 016/365] mm, slub: remove redundant unfreeze_partials() from put_cpu_partial() Commit d6e0b7fa1186 ("slub: make dead caches discard free slabs immediately") introduced cpu partial flushing for kmemcg caches, based on setting the target cpu_partial to 0 and adding a flushing check in put_cpu_partial(). This code that sets cpu_partial to 0 was later moved by c9fc586403e7 ("slab: introduce __kmemcg_cache_deactivate()") and ultimately removed by 9855609bde03 ("mm: memcg/slab: use a single set of kmem_caches for all accounted allocations"). However the check and flush in put_cpu_partial() was never removed, although it's effectively a dead code. So this patch removes it. Note that d6e0b7fa1186 also added preempt_disable()/enable() to unfreeze_partials() which could be thus also considered unnecessary. But further patches will rely on it, so keep it. Link: https://lkml.kernel.org/r/20210805152000.12817-6-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Christoph Lameter Cc: David Rientjes Cc: Jann Horn Cc: Jesper Dangaard Brouer Cc: Joonsoo Kim Cc: Mel Gorman Cc: Mike Galbraith Cc: Pekka Enberg Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/slub.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index fee093db2bfd7..79e53303844c3 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2466,13 +2466,6 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage); - if (unlikely(!slub_cpu_partial(s))) { - unsigned long flags; - - local_irq_save(flags); - unfreeze_partials(s, this_cpu_ptr(s->cpu_slab)); - local_irq_restore(flags); - } preempt_enable(); #endif /* CONFIG_SLUB_CPU_PARTIAL */ } From 2625ffdf3276758c6e50bb713a46c6e6aa84953f Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 24 Aug 2021 09:58:55 +1000 Subject: [PATCH 017/365] mm, slub: unify cmpxchg_double_slab() and __cmpxchg_double_slab() These functions differ only in irq disabling in the slow path. We can create a common function with an extra bool parameter to control the irq disabling. As the functions are inline and the parameter compile-time constant, there will be no runtime overhead due to this change. Also change the DEBUG_VM based irqs disable assert to the more standard lockdep_assert based one. Link: https://lkml.kernel.org/r/20210805152000.12817-7-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Christoph Lameter Cc: David Rientjes Cc: Jann Horn Cc: Jesper Dangaard Brouer Cc: Joonsoo Kim Cc: Mel Gorman Cc: Mike Galbraith Cc: Pekka Enberg Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/slub.c | 62 +++++++++++++++++++++---------------------------------- 1 file changed, 24 insertions(+), 38 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 79e53303844c3..e1c4e934c6203 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -371,13 +371,13 @@ static __always_inline void slab_unlock(struct page *page) __bit_spin_unlock(PG_locked, &page->flags); } -/* Interrupts must be disabled (for the fallback code to work right) */ -static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page, +static inline bool ___cmpxchg_double_slab(struct kmem_cache *s, struct page *page, void *freelist_old, unsigned long counters_old, void *freelist_new, unsigned long counters_new, - const char *n) + const char *n, bool disable_irqs) { - VM_BUG_ON(!irqs_disabled()); + if (!disable_irqs) + lockdep_assert_irqs_disabled(); #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) if (s->flags & __CMPXCHG_DOUBLE) { @@ -388,15 +388,23 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page } else #endif { + unsigned long flags; + + if (disable_irqs) + local_irq_save(flags); slab_lock(page); if (page->freelist == freelist_old && page->counters == counters_old) { page->freelist = freelist_new; page->counters = counters_new; slab_unlock(page); + if (disable_irqs) + local_irq_restore(flags); return true; } slab_unlock(page); + if (disable_irqs) + local_irq_restore(flags); } cpu_relax(); @@ -409,45 +417,23 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page return false; } -static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, +/* Interrupts must be disabled (for the fallback code to work right) */ +static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page, void *freelist_old, unsigned long counters_old, void *freelist_new, unsigned long counters_new, const char *n) { -#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ - defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) - if (s->flags & __CMPXCHG_DOUBLE) { - if (cmpxchg_double(&page->freelist, &page->counters, - freelist_old, counters_old, - freelist_new, counters_new)) - return true; - } else -#endif - { - unsigned long flags; - - local_irq_save(flags); - slab_lock(page); - if (page->freelist == freelist_old && - page->counters == counters_old) { - page->freelist = freelist_new; - page->counters = counters_new; - slab_unlock(page); - local_irq_restore(flags); - return true; - } - slab_unlock(page); - local_irq_restore(flags); - } - - cpu_relax(); - stat(s, CMPXCHG_DOUBLE_FAIL); - -#ifdef SLUB_DEBUG_CMPXCHG - pr_info("%s %s: cmpxchg double redo ", n, s->name); -#endif + return ___cmpxchg_double_slab(s, page, freelist_old, counters_old, + freelist_new, counters_new, n, false); +} - return false; +static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, + void *freelist_old, unsigned long counters_old, + void *freelist_new, unsigned long counters_new, + const char *n) +{ + return ___cmpxchg_double_slab(s, page, freelist_old, counters_old, + freelist_new, counters_new, n, true); } #ifdef CONFIG_SLUB_DEBUG From 7a92444d2898b1c7c0c3e631dbae4b823be32a41 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 24 Aug 2021 09:58:55 +1000 Subject: [PATCH 018/365] mm, slub: extract get_partial() from new_slab_objects() The later patches will need more fine grained control over individual actions in ___slab_alloc(), the only caller of new_slab_objects(), so this is a first preparatory step with no functional change. This adds a goto label that appears unnecessary at this point, but will be useful for later changes. Link: https://lkml.kernel.org/r/20210805152000.12817-8-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Christoph Lameter Cc: David Rientjes Cc: Jann Horn Cc: Jesper Dangaard Brouer Cc: Joonsoo Kim Cc: Mel Gorman Cc: Mike Galbraith Cc: Pekka Enberg Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/slub.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index e1c4e934c6203..75c532307897b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2599,17 +2599,12 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, int node, struct kmem_cache_cpu **pc) { - void *freelist; + void *freelist = NULL; struct kmem_cache_cpu *c = *pc; struct page *page; WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO)); - freelist = get_partial(s, flags, node, c); - - if (freelist) - return freelist; - page = new_slab(s, flags, node); if (page) { c = raw_cpu_ptr(s->cpu_slab); @@ -2773,6 +2768,10 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, goto redo; } + freelist = get_partial(s, gfpflags, node, c); + if (freelist) + goto check_new_page; + freelist = new_slab_objects(s, gfpflags, node, &c); if (unlikely(!freelist)) { @@ -2780,6 +2779,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, return NULL; } +check_new_page: page = c->page; if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags))) goto load_freelist; From e3155f535ff5dcc0a4bcd9dc6ae9429fd8f7e668 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 24 Aug 2021 09:58:55 +1000 Subject: [PATCH 019/365] mm, slub: dissolve new_slab_objects() into ___slab_alloc() The later patches will need more fine grained control over individual actions in ___slab_alloc(), the only caller of new_slab_objects(), so dissolve it there. This is a preparatory step with no functional change. The only minor change is moving WARN_ON_ONCE() for using a constructor together with __GFP_ZERO to new_slab(), which makes it somewhat less frequent, but still able to catch a development change introducing a systematic misuse. Link: https://lkml.kernel.org/r/20210805152000.12817-9-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Christoph Lameter Acked-by: Mel Gorman Cc: David Rientjes Cc: Jann Horn Cc: Jesper Dangaard Brouer Cc: Joonsoo Kim Cc: Mike Galbraith Cc: Pekka Enberg Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/slub.c | 50 ++++++++++++++++++-------------------------------- 1 file changed, 18 insertions(+), 32 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 75c532307897b..c8f5b2ca1ad6e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1871,6 +1871,8 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) if (unlikely(flags & GFP_SLAB_BUG_MASK)) flags = kmalloc_fix_flags(flags); + WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO)); + return allocate_slab(s, flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); } @@ -2596,36 +2598,6 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) #endif } -static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, - int node, struct kmem_cache_cpu **pc) -{ - void *freelist = NULL; - struct kmem_cache_cpu *c = *pc; - struct page *page; - - WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO)); - - page = new_slab(s, flags, node); - if (page) { - c = raw_cpu_ptr(s->cpu_slab); - if (c->page) - flush_slab(s, c); - - /* - * No other reference to the page yet so we can - * muck around with it freely without cmpxchg - */ - freelist = page->freelist; - page->freelist = NULL; - - stat(s, ALLOC_SLAB); - c->page = page; - *pc = c; - } - - return freelist; -} - static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags) { if (unlikely(PageSlabPfmemalloc(page))) @@ -2772,13 +2744,27 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, if (freelist) goto check_new_page; - freelist = new_slab_objects(s, gfpflags, node, &c); + page = new_slab(s, gfpflags, node); - if (unlikely(!freelist)) { + if (unlikely(!page)) { slab_out_of_memory(s, gfpflags, node); return NULL; } + c = raw_cpu_ptr(s->cpu_slab); + if (c->page) + flush_slab(s, c); + + /* + * No other reference to the page yet so we can + * muck around with it freely without cmpxchg + */ + freelist = page->freelist; + page->freelist = NULL; + + stat(s, ALLOC_SLAB); + c->page = page; + check_new_page: page = c->page; if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags))) From da17192d4b24e611131cf349278aca6ecca8463f Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 24 Aug 2021 09:58:55 +1000 Subject: [PATCH 020/365] mm, slub: return slab page from get_partial() and set c->page afterwards The function get_partial() finds a suitable page on a partial list, acquires and returns its freelist and assigns the page pointer to kmem_cache_cpu. In later patch we will need more control over the kmem_cache_cpu.page assignment, so instead of passing a kmem_cache_cpu pointer, pass a pointer to a pointer to a page that get_partial() can fill and the caller can assign the kmem_cache_cpu.page pointer. No functional change as all of this still happens with disabled IRQs. Link: https://lkml.kernel.org/r/20210805152000.12817-10-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Christoph Lameter Cc: David Rientjes Cc: Jann Horn Cc: Jesper Dangaard Brouer Cc: Joonsoo Kim Cc: Mel Gorman Cc: Mike Galbraith Cc: Pekka Enberg Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/slub.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index c8f5b2ca1ad6e..fd9f5481b422d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2003,7 +2003,7 @@ static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags); * Try to allocate a partial slab from a specific node. */ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, - struct kmem_cache_cpu *c, gfp_t flags) + struct page **ret_page, gfp_t flags) { struct page *page, *page2; void *object = NULL; @@ -2032,7 +2032,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, available += objects; if (!object) { - c->page = page; + *ret_page = page; stat(s, ALLOC_FROM_PARTIAL); object = t; } else { @@ -2052,7 +2052,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, * Get a page from somewhere. Search in increasing NUMA distances. */ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, - struct kmem_cache_cpu *c) + struct page **ret_page) { #ifdef CONFIG_NUMA struct zonelist *zonelist; @@ -2094,7 +2094,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, if (n && cpuset_zone_allowed(zone, flags) && n->nr_partial > s->min_partial) { - object = get_partial_node(s, n, c, flags); + object = get_partial_node(s, n, ret_page, flags); if (object) { /* * Don't check read_mems_allowed_retry() @@ -2116,7 +2116,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, * Get a partial page, lock it and return it. */ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, - struct kmem_cache_cpu *c) + struct page **ret_page) { void *object; int searchnode = node; @@ -2124,11 +2124,11 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, if (node == NUMA_NO_NODE) searchnode = numa_mem_id(); - object = get_partial_node(s, get_node(s, searchnode), c, flags); + object = get_partial_node(s, get_node(s, searchnode), ret_page, flags); if (object || node != NUMA_NO_NODE) return object; - return get_any_partial(s, flags, c); + return get_any_partial(s, flags, ret_page); } #ifdef CONFIG_PREEMPTION @@ -2740,9 +2740,11 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, goto redo; } - freelist = get_partial(s, gfpflags, node, c); - if (freelist) + freelist = get_partial(s, gfpflags, node, &page); + if (freelist) { + c->page = page; goto check_new_page; + } page = new_slab(s, gfpflags, node); @@ -2766,7 +2768,6 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, c->page = page; check_new_page: - page = c->page; if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags))) goto load_freelist; From c9456e920885921db0ab2a14829e0bed8f13f52e Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 24 Aug 2021 09:58:55 +1000 Subject: [PATCH 021/365] mm, slub: restructure new page checks in ___slab_alloc() When we allocate slab object from a newly acquired page (from node's partial list or page allocator), we usually also retain the page as a new percpu slab. There are two exceptions - when pfmemalloc status of the page doesn't match our gfp flags, or when the cache has debugging enabled. The current code for these decisions is not easy to follow, so restructure it and add comments. The new structure will also help with the following changes. No functional change. Link: https://lkml.kernel.org/r/20210805152000.12817-11-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Cc: Christoph Lameter Cc: David Rientjes Cc: Jann Horn Cc: Jesper Dangaard Brouer Cc: Joonsoo Kim Cc: Mike Galbraith Cc: Pekka Enberg Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/slub.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index fd9f5481b422d..b13424444caef 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2768,13 +2768,29 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, c->page = page; check_new_page: - if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags))) - goto load_freelist; - /* Only entered in the debug case */ - if (kmem_cache_debug(s) && - !alloc_debug_processing(s, page, freelist, addr)) - goto new_slab; /* Slab failed checks. Next slab needed */ + if (kmem_cache_debug(s)) { + if (!alloc_debug_processing(s, page, freelist, addr)) + /* Slab failed checks. Next slab needed */ + goto new_slab; + else + /* + * For debug case, we don't load freelist so that all + * allocations go through alloc_debug_processing() + */ + goto return_single; + } + + if (unlikely(!pfmemalloc_match(page, gfpflags))) + /* + * For !pfmemalloc_match() case we don't load freelist so that + * we don't make further mismatched allocations easier. + */ + goto return_single; + + goto load_freelist; + +return_single: deactivate_slab(s, page, get_freepointer(s, freelist), c); return freelist; From 60b7e4e7ed33abfc726b7b5ceeef772da0ec5c43 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 24 Aug 2021 09:58:56 +1000 Subject: [PATCH 022/365] mm, slub: simplify kmem_cache_cpu and tid setup In slab_alloc_node() and do_slab_free() fastpaths we need to guarantee that our kmem_cache_cpu pointer is from the same cpu as the tid value. Currently that's done by reading the tid first using this_cpu_read(), then the kmem_cache_cpu pointer and verifying we read the same tid using the pointer and plain READ_ONCE(). This can be simplified to just fetching kmem_cache_cpu pointer and then reading tid using the pointer. That guarantees they are from the same cpu. We don't need to read the tid using this_cpu_read() because the value will be validated by this_cpu_cmpxchg_double(), making sure we are on the correct cpu and the freelist didn't change by anyone preempting us since reading the tid. Link: https://lkml.kernel.org/r/20210805152000.12817-12-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Cc: Christoph Lameter Cc: David Rientjes Cc: Jann Horn Cc: Jesper Dangaard Brouer Cc: Joonsoo Kim Cc: Mike Galbraith Cc: Pekka Enberg Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/slub.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index b13424444caef..1f65d75077ceb 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2868,15 +2868,14 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, * reading from one cpu area. That does not matter as long * as we end up on the original cpu again when doing the cmpxchg. * - * We should guarantee that tid and kmem_cache are retrieved on - * the same cpu. It could be different if CONFIG_PREEMPTION so we need - * to check if it is matched or not. + * We must guarantee that tid and kmem_cache_cpu are retrieved on the + * same cpu. We read first the kmem_cache_cpu pointer and use it to read + * the tid. If we are preempted and switched to another cpu between the + * two reads, it's OK as the two are still associated with the same cpu + * and cmpxchg later will validate the cpu. */ - do { - tid = this_cpu_read(s->cpu_slab->tid); - c = raw_cpu_ptr(s->cpu_slab); - } while (IS_ENABLED(CONFIG_PREEMPTION) && - unlikely(tid != READ_ONCE(c->tid))); + c = raw_cpu_ptr(s->cpu_slab); + tid = READ_ONCE(c->tid); /* * Irqless object alloc/free algorithm used here depends on sequence @@ -3150,11 +3149,8 @@ static __always_inline void do_slab_free(struct kmem_cache *s, * data is retrieved via this pointer. If we are on the same cpu * during the cmpxchg then the free will succeed. */ - do { - tid = this_cpu_read(s->cpu_slab->tid); - c = raw_cpu_ptr(s->cpu_slab); - } while (IS_ENABLED(CONFIG_PREEMPTION) && - unlikely(tid != READ_ONCE(c->tid))); + c = raw_cpu_ptr(s->cpu_slab); + tid = READ_ONCE(c->tid); /* Same with comment on barrier() in slab_alloc_node() */ barrier(); From cc976b129501de05bd138a85d9efb7848dd5c1e2 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 24 Aug 2021 09:58:56 +1000 Subject: [PATCH 023/365] mm, slub: move disabling/enabling irqs to ___slab_alloc() Currently __slab_alloc() disables irqs around the whole ___slab_alloc(). This includes cases where this is not needed, such as when the allocation ends up in the page allocator and has to awkwardly enable irqs back based on gfp flags. Also the whole kmem_cache_alloc_bulk() is executed with irqs disabled even when it hits the __slab_alloc() slow path, and long periods with disabled interrupts are undesirable. As a first step towards reducing irq disabled periods, move irq handling into ___slab_alloc(). Callers will instead prevent the s->cpu_slab percpu pointer from becoming invalid via get_cpu_ptr(), thus preempt_disable(). This does not protect against modification by an irq handler, which is still done by disabled irq for most of ___slab_alloc(). As a small immediate benefit, slab_out_of_memory() from ___slab_alloc() is now called with irqs enabled. kmem_cache_alloc_bulk() disables irqs for its fastpath and then re-enables them before calling ___slab_alloc(), which then disables them at its discretion. The whole kmem_cache_alloc_bulk() operation also disables preemption. When ___slab_alloc() calls new_slab() to allocate a new page, re-enable preemption, because new_slab() will re-enable interrupts in contexts that allow blocking (this will be improved by later patches). The patch itself will thus increase overhead a bit due to disabled preemption (on configs where it matters) and increased disabling/enabling irqs in kmem_cache_alloc_bulk(), but that will be gradually improved in the following patches. Note in __slab_alloc() we need to change the #ifdef CONFIG_PREEMPT guard to CONFIG_PREEMPT_COUNT to make sure preempt disable/enable is properly paired in all configurations. On configs without involuntary preemption and debugging the re-read of kmem_cache_cpu pointer is still compiled out as it was before. [efault@gmx.de: fix kmem_cache_alloc_bulk() error path] Link: https://lkml.kernel.org/r/20210805152000.12817-13-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Christoph Lameter Cc: David Rientjes Cc: Jann Horn Cc: Jesper Dangaard Brouer Cc: Joonsoo Kim Cc: Mel Gorman Cc: Pekka Enberg Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/slub.c | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 1f65d75077ceb..31f946e038232 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2656,7 +2656,7 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page) * we need to allocate a new slab. This is the slowest path since it involves * a call to the page allocator and the setup of a new slab. * - * Version of __slab_alloc to use when we know that interrupts are + * Version of __slab_alloc to use when we know that preemption is * already disabled (which is the case for bulk allocation). */ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, @@ -2664,9 +2664,11 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, { void *freelist; struct page *page; + unsigned long flags; stat(s, ALLOC_SLOWPATH); + local_irq_save(flags); page = c->page; if (!page) { /* @@ -2729,6 +2731,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, VM_BUG_ON(!c->page->frozen); c->freelist = get_freepointer(s, freelist); c->tid = next_tid(c->tid); + local_irq_restore(flags); return freelist; new_slab: @@ -2746,14 +2749,16 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, goto check_new_page; } + put_cpu_ptr(s->cpu_slab); page = new_slab(s, gfpflags, node); + c = get_cpu_ptr(s->cpu_slab); if (unlikely(!page)) { + local_irq_restore(flags); slab_out_of_memory(s, gfpflags, node); return NULL; } - c = raw_cpu_ptr(s->cpu_slab); if (c->page) flush_slab(s, c); @@ -2793,31 +2798,33 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, return_single: deactivate_slab(s, page, get_freepointer(s, freelist), c); + local_irq_restore(flags); return freelist; } /* - * Another one that disabled interrupt and compensates for possible - * cpu changes by refetching the per cpu area pointer. + * A wrapper for ___slab_alloc() for contexts where preemption is not yet + * disabled. Compensates for possible cpu changes by refetching the per cpu area + * pointer. */ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr, struct kmem_cache_cpu *c) { void *p; - unsigned long flags; - local_irq_save(flags); -#ifdef CONFIG_PREEMPTION +#ifdef CONFIG_PREEMPT_COUNT /* * We may have been preempted and rescheduled on a different - * cpu before disabling interrupts. Need to reload cpu area + * cpu before disabling preemption. Need to reload cpu area * pointer. */ - c = this_cpu_ptr(s->cpu_slab); + c = get_cpu_ptr(s->cpu_slab); #endif p = ___slab_alloc(s, gfpflags, node, addr, c); - local_irq_restore(flags); +#ifdef CONFIG_PREEMPT_COUNT + put_cpu_ptr(s->cpu_slab); +#endif return p; } @@ -3345,8 +3352,8 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, * IRQs, which protects against PREEMPT and interrupts * handlers invoking normal fastpath. */ + c = get_cpu_ptr(s->cpu_slab); local_irq_disable(); - c = this_cpu_ptr(s->cpu_slab); for (i = 0; i < size; i++) { void *object = kfence_alloc(s, s->object_size, flags); @@ -3367,6 +3374,8 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, */ c->tid = next_tid(c->tid); + local_irq_enable(); + /* * Invoking slow path likely have side-effect * of re-populating per CPU c->freelist @@ -3379,6 +3388,8 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, c = this_cpu_ptr(s->cpu_slab); maybe_wipe_obj_freeptr(s, p[i]); + local_irq_disable(); + continue; /* goto for-loop */ } c->freelist = get_freepointer(s, object); @@ -3387,6 +3398,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, } c->tid = next_tid(c->tid); local_irq_enable(); + put_cpu_ptr(s->cpu_slab); /* * memcg and kmem_cache debug support and memory initialization. @@ -3396,7 +3408,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, slab_want_init_on_alloc(flags, s)); return i; error: - local_irq_enable(); + put_cpu_ptr(s->cpu_slab); slab_post_alloc_hook(s, objcg, flags, i, p, false); __kmem_cache_free_bulk(s, i, p); return 0; From e9c2a5f0ca9c0f8475c92aab8a350e02d0479e67 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 24 Aug 2021 09:58:56 +1000 Subject: [PATCH 024/365] mm, slub: do initial checks in ___slab_alloc() with irqs enabled As another step of shortening irq disabled sections in ___slab_alloc(), delay disabling irqs until we pass the initial checks if there is a cached percpu slab and it's suitable for our allocation. Now we have to recheck c->page after actually disabling irqs as an allocation in irq handler might have replaced it. Because we call pfmemalloc_match() as one of the checks, we might hit VM_BUG_ON_PAGE(!PageSlab(page)) in PageSlabPfmemalloc in case we get interrupted and the page is freed. Thus introduce a pfmemalloc_match_unsafe() variant that lacks the PageSlab check. Link: https://lkml.kernel.org/r/20210805152000.12817-14-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Cc: Christoph Lameter Cc: David Rientjes Cc: Jann Horn Cc: Jesper Dangaard Brouer Cc: Joonsoo Kim Cc: Mike Galbraith Cc: Pekka Enberg Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/slub.c | 41 ++++++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 31f946e038232..d1d39135d9f99 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2668,8 +2668,9 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, stat(s, ALLOC_SLOWPATH); - local_irq_save(flags); - page = c->page; +reread_page: + + page = READ_ONCE(c->page); if (!page) { /* * if the node is not online or has no normal memory, just @@ -2678,6 +2679,11 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, if (unlikely(node != NUMA_NO_NODE && !node_isset(node, slab_nodes))) node = NUMA_NO_NODE; + local_irq_save(flags); + if (unlikely(c->page)) { + local_irq_restore(flags); + goto reread_page; + } goto new_slab; } redo: @@ -2692,8 +2698,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, goto redo; } else { stat(s, ALLOC_NODE_MISMATCH); - deactivate_slab(s, page, c->freelist, c); - goto new_slab; + goto deactivate_slab; } } @@ -2702,12 +2707,15 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, * PFMEMALLOC but right now, we are losing the pfmemalloc * information when the page leaves the per-cpu allocator */ - if (unlikely(!pfmemalloc_match(page, gfpflags))) { - deactivate_slab(s, page, c->freelist, c); - goto new_slab; - } + if (unlikely(!pfmemalloc_match(page, gfpflags))) + goto deactivate_slab; - /* must check again c->freelist in case of cpu migration or IRQ */ + /* must check again c->page in case IRQ handler changed it */ + local_irq_save(flags); + if (unlikely(page != c->page)) { + local_irq_restore(flags); + goto reread_page; + } freelist = c->freelist; if (freelist) goto load_freelist; @@ -2723,6 +2731,9 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, stat(s, ALLOC_REFILL); load_freelist: + + lockdep_assert_irqs_disabled(); + /* * freelist is pointing to the list of objects to be used. * page is pointing to the page from which the objects are obtained. @@ -2734,11 +2745,23 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, local_irq_restore(flags); return freelist; +deactivate_slab: + + local_irq_save(flags); + if (page != c->page) { + local_irq_restore(flags); + goto reread_page; + } + deactivate_slab(s, page, c->freelist, c); + new_slab: + lockdep_assert_irqs_disabled(); + if (slub_percpu_partial(c)) { page = c->page = slub_percpu_partial(c); slub_set_percpu_partial(c, page); + local_irq_restore(flags); stat(s, CPU_PARTIAL_ALLOC); goto redo; } From 7455d22082891a0d8a797aafb92f18e98e46d6eb Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 24 Aug 2021 09:58:56 +1000 Subject: [PATCH 025/365] mm, slub: prevent VM_BUG_ON in PageSlabPfmemalloc from ___slab_alloc Clark Williams reported [1] a VM_BUG_ON in PageSlabPfmemalloc: page:000000009ac5dd73 refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x1ab3db flags: 0x17ffffc0000000(node=0|zone=2|lastcpupid=0x1fffff) raw: 0017ffffc0000000 ffffee1286aceb88 ffffee1287b66288 0000000000000000 raw: 0000000000000000 0000000000100000 00000000ffffffff 0000000000000000 page dumped because: VM_BUG_ON_PAGE(!PageSlab(page)) ------------[ cut here ]------------ kernel BUG at include/linux/page-flags.h:814! invalid opcode: 0000 [#1] PREEMPT_RT SMP PTI CPU: 3 PID: 12345 Comm: hackbench Not tainted 5.14.0-rc5-rt8+ #12 Hardware name: /NUC5i7RYB, BIOS RYBDWi35.86A.0359.2016.0906.1028 09/06/2016 RIP: 0010:___slab_alloc+0x340/0x940 Code: c6 48 0f a3 05 b1 7b 57 03 72 99 c7 85 78 ff ff ff ff ff ff ff 48 8b 7d 88 e9 8d fd ff ff 48 c7 c6 50 5a 7c b0 e> RSP: 0018:ffffba1c4a8b7ab0 EFLAGS: 00010293 RAX: 0000000000000000 RBX: 0000000000000002 RCX: ffff9bb765118000 RDX: 0000000000000000 RSI: ffffffffaf426050 RDI: 00000000ffffffff RBP: ffffba1c4a8b7b70 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffff9bb7410d3600 R13: 0000000000400cc0 R14: 00000000001f7770 R15: ffff9bbe76df7770 FS: 00007f474b1be740(0000) GS:ffff9bbe76c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f60c04bdaf8 CR3: 0000000124f3a003 CR4: 00000000003706e0 Call Trace: ? __alloc_skb+0x1db/0x270 ? __alloc_skb+0x1db/0x270 ? kmem_cache_alloc_node+0xa4/0x2b0 kmem_cache_alloc_node+0xa4/0x2b0 __alloc_skb+0x1db/0x270 alloc_skb_with_frags+0x64/0x250 sock_alloc_send_pskb+0x260/0x2b0 ? bpf_lsm_socket_getpeersec_dgram+0xa/0x10 unix_stream_sendmsg+0x27c/0x550 ? unix_seqpacket_recvmsg+0x60/0x60 sock_sendmsg+0xbd/0xd0 sock_write_iter+0xb9/0x120 new_sync_write+0x175/0x200 vfs_write+0x3c4/0x510 ksys_write+0xc9/0x110 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae The problem is that we are opportunistically checking flags on a page in irq enabled section. If we are interrupted and the page is freed, it's not an issue as we detect it after disabling irqs. But on kernels with CONFIG_DEBUG_VM. The check for PageSlab flag in PageSlabPfmemalloc() can fail. Fix this by creating an "unsafe" version of the check that doesn't check PageSlab. [1] https://lore.kernel.org/lkml/20210812151803.52f84aaf@theseus.lan/ Link: https://lkml.kernel.org/r/f4756ee5-a7e9-ab02-3aba-1355f77b7c79@suse.cz Signed-off-by: Vlastimil Babka Reported-by: Clark Williams Tested-by: Mike Galbraith Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/page-flags.h | 9 +++++++++ mm/slub.c | 15 ++++++++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5922031ffab6e..7fda4fb85bdca 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -815,6 +815,15 @@ static inline int PageSlabPfmemalloc(struct page *page) return PageActive(page); } +/* + * A version of PageSlabPfmemalloc() for opportunistic checks where the page + * might have been freed under us and not be a PageSlab anymore. + */ +static inline int __PageSlabPfmemalloc(struct page *page) +{ + return PageActive(page); +} + static inline void SetPageSlabPfmemalloc(struct page *page) { VM_BUG_ON_PAGE(!PageSlab(page), page); diff --git a/mm/slub.c b/mm/slub.c index d1d39135d9f99..38458e3478103 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2606,6 +2606,19 @@ static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags) return true; } +/* + * A variant of pfmemalloc_match() that tests page flags without asserting + * PageSlab. Intended for opportunistic checks before taking a lock and + * rechecking that nobody else freed the page under us. + */ +static inline bool pfmemalloc_match_unsafe(struct page *page, gfp_t gfpflags) +{ + if (unlikely(__PageSlabPfmemalloc(page))) + return gfp_pfmemalloc_allowed(gfpflags); + + return true; +} + /* * Check the page->freelist of a page and either transfer the freelist to the * per cpu freelist or deactivate the page. @@ -2707,7 +2720,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, * PFMEMALLOC but right now, we are losing the pfmemalloc * information when the page leaves the per-cpu allocator */ - if (unlikely(!pfmemalloc_match(page, gfpflags))) + if (unlikely(!try_pfmemalloc_match(page, gfpflags))) goto deactivate_slab; /* must check again c->page in case IRQ handler changed it */ From dfdb1b514bf800cc64c72fe577b63f7799697b84 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 24 Aug 2021 09:58:57 +1000 Subject: [PATCH 026/365] mm-slub-do-initial-checks-in-___slab_alloc-with-irqs-enabled-fix-fix fix renaming snafu Link: https://lkml.kernel.org/r/ec98bce0-fef4-0fbc-2067-e358510e0321@suse.cz Signed-off-by: Vlastimil Babka Cc: Clark Williams Cc: Mike Galbraith Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index 38458e3478103..fcc38638c645b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2720,7 +2720,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, * PFMEMALLOC but right now, we are losing the pfmemalloc * information when the page leaves the per-cpu allocator */ - if (unlikely(!try_pfmemalloc_match(page, gfpflags))) + if (unlikely(!pfmemalloc_match_unsafe(page, gfpflags))) goto deactivate_slab; /* must check again c->page in case IRQ handler changed it */ From d1e19a041272049a7668567ad955ddc12982282b Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 24 Aug 2021 09:58:57 +1000 Subject: [PATCH 027/365] mm, slub: move disabling irqs closer to get_partial() in ___slab_alloc() Continue reducing the irq disabled scope. Check for per-cpu partial slabs with first with irqs enabled and then recheck with irqs disabled before grabbing the slab page. Mostly preparatory for the following patches. Link: https://lkml.kernel.org/r/20210805152000.12817-15-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Christoph Lameter Cc: David Rientjes Cc: Jann Horn Cc: Jesper Dangaard Brouer Cc: Joonsoo Kim Cc: Mel Gorman Cc: Mike Galbraith Cc: Pekka Enberg Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/slub.c | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index fcc38638c645b..a437730d7ae24 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2692,11 +2692,6 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, if (unlikely(node != NUMA_NO_NODE && !node_isset(node, slab_nodes))) node = NUMA_NO_NODE; - local_irq_save(flags); - if (unlikely(c->page)) { - local_irq_restore(flags); - goto reread_page; - } goto new_slab; } redo: @@ -2737,6 +2732,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, if (!freelist) { c->page = NULL; + local_irq_restore(flags); stat(s, DEACTIVATE_BYPASS); goto new_slab; } @@ -2766,12 +2762,19 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, goto reread_page; } deactivate_slab(s, page, c->freelist, c); + local_irq_restore(flags); new_slab: - lockdep_assert_irqs_disabled(); - if (slub_percpu_partial(c)) { + local_irq_save(flags); + if (unlikely(c->page)) { + local_irq_restore(flags); + goto reread_page; + } + if (unlikely(!slub_percpu_partial(c))) + goto new_objects; /* stolen by an IRQ handler */ + page = c->page = slub_percpu_partial(c); slub_set_percpu_partial(c, page); local_irq_restore(flags); @@ -2779,6 +2782,16 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, goto redo; } + local_irq_save(flags); + if (unlikely(c->page)) { + local_irq_restore(flags); + goto reread_page; + } + +new_objects: + + lockdep_assert_irqs_disabled(); + freelist = get_partial(s, gfpflags, node, &page); if (freelist) { c->page = page; @@ -2811,15 +2824,18 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, check_new_page: if (kmem_cache_debug(s)) { - if (!alloc_debug_processing(s, page, freelist, addr)) + if (!alloc_debug_processing(s, page, freelist, addr)) { /* Slab failed checks. Next slab needed */ + c->page = NULL; + local_irq_restore(flags); goto new_slab; - else + } else { /* * For debug case, we don't load freelist so that all * allocations go through alloc_debug_processing() */ goto return_single; + } } if (unlikely(!pfmemalloc_match(page, gfpflags))) From 941f410a941824515f4f2d64517b648f214fb959 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 24 Aug 2021 09:58:57 +1000 Subject: [PATCH 028/365] mm, slub: restore irqs around calling new_slab() allocate_slab() currently re-enables irqs before calling to the page allocator. It depends on gfpflags_allow_blocking() to determine if it's safe to do so. Now we can instead simply restore irq before calling it through new_slab(). The other caller early_kmem_cache_node_alloc() is unaffected by this. Link: https://lkml.kernel.org/r/20210805152000.12817-16-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Christoph Lameter Cc: David Rientjes Cc: Jann Horn Cc: Jesper Dangaard Brouer Cc: Joonsoo Kim Cc: Mel Gorman Cc: Mike Galbraith Cc: Pekka Enberg Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/slub.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index a437730d7ae24..8b240eb10a1d2 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1795,9 +1795,6 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) flags &= gfp_allowed_mask; - if (gfpflags_allow_blocking(flags)) - local_irq_enable(); - flags |= s->allocflags; /* @@ -1856,8 +1853,6 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) page->frozen = 1; out: - if (gfpflags_allow_blocking(flags)) - local_irq_disable(); if (!page) return NULL; @@ -2798,16 +2793,17 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, goto check_new_page; } + local_irq_restore(flags); put_cpu_ptr(s->cpu_slab); page = new_slab(s, gfpflags, node); c = get_cpu_ptr(s->cpu_slab); if (unlikely(!page)) { - local_irq_restore(flags); slab_out_of_memory(s, gfpflags, node); return NULL; } + local_irq_save(flags); if (c->page) flush_slab(s, c); From 36b1fc32ce132d6962c9aa5da297c48775936708 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 24 Aug 2021 09:58:58 +1000 Subject: [PATCH 029/365] mm, slub: validate slab from partial list or page allocator before making it cpu slab When we obtain a new slab page from node partial list or page allocator, we assign it to kmem_cache_cpu, perform some checks, and if they fail, we undo the assignment. In order to allow doing the checks without irq disabled, restructure the code so that the checks are done first, and kmem_cache_cpu.page assignment only after they pass. Link: https://lkml.kernel.org/r/20210805152000.12817-17-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Christoph Lameter Cc: David Rientjes Cc: Jann Horn Cc: Jesper Dangaard Brouer Cc: Joonsoo Kim Cc: Mel Gorman Cc: Mike Galbraith Cc: Pekka Enberg Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/slub.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 8b240eb10a1d2..fddfb0629b8ee 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2788,10 +2788,8 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, lockdep_assert_irqs_disabled(); freelist = get_partial(s, gfpflags, node, &page); - if (freelist) { - c->page = page; + if (freelist) goto check_new_page; - } local_irq_restore(flags); put_cpu_ptr(s->cpu_slab); @@ -2804,9 +2802,6 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, } local_irq_save(flags); - if (c->page) - flush_slab(s, c); - /* * No other reference to the page yet so we can * muck around with it freely without cmpxchg @@ -2815,14 +2810,12 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, page->freelist = NULL; stat(s, ALLOC_SLAB); - c->page = page; check_new_page: if (kmem_cache_debug(s)) { if (!alloc_debug_processing(s, page, freelist, addr)) { /* Slab failed checks. Next slab needed */ - c->page = NULL; local_irq_restore(flags); goto new_slab; } else { @@ -2841,10 +2834,18 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, */ goto return_single; + if (unlikely(c->page)) + flush_slab(s, c); + c->page = page; + goto load_freelist; return_single: + if (unlikely(c->page)) + flush_slab(s, c); + c->page = page; + deactivate_slab(s, page, get_freepointer(s, freelist), c); local_irq_restore(flags); return freelist; From 1567f70ea4187aa68b3ba3f9ff0b3dac47b01eb6 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 24 Aug 2021 09:58:58 +1000 Subject: [PATCH 030/365] mm, slub: check new pages with restored irqs Building on top of the previous patch, re-enable irqs before checking new pages. alloc_debug_processing() is now called with enabled irqs so we need to remove VM_BUG_ON(!irqs_disabled()); in check_slab() - there doesn't seem to be a need for it anyway. Link: https://lkml.kernel.org/r/20210805152000.12817-18-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Christoph Lameter Cc: David Rientjes Cc: Jann Horn Cc: Jesper Dangaard Brouer Cc: Joonsoo Kim Cc: Mel Gorman Cc: Mike Galbraith Cc: Pekka Enberg Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/slub.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index fddfb0629b8ee..d5ac278084cfb 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -995,8 +995,6 @@ static int check_slab(struct kmem_cache *s, struct page *page) { int maxobj; - VM_BUG_ON(!irqs_disabled()); - if (!PageSlab(page)) { slab_err(s, page, "Not a valid slab page"); return 0; @@ -2788,10 +2786,10 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, lockdep_assert_irqs_disabled(); freelist = get_partial(s, gfpflags, node, &page); + local_irq_restore(flags); if (freelist) goto check_new_page; - local_irq_restore(flags); put_cpu_ptr(s->cpu_slab); page = new_slab(s, gfpflags, node); c = get_cpu_ptr(s->cpu_slab); @@ -2801,7 +2799,6 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, return NULL; } - local_irq_save(flags); /* * No other reference to the page yet so we can * muck around with it freely without cmpxchg @@ -2816,7 +2813,6 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, if (kmem_cache_debug(s)) { if (!alloc_debug_processing(s, page, freelist, addr)) { /* Slab failed checks. Next slab needed */ - local_irq_restore(flags); goto new_slab; } else { /* @@ -2834,6 +2830,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, */ goto return_single; + local_irq_save(flags); if (unlikely(c->page)) flush_slab(s, c); c->page = page; @@ -2842,6 +2839,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, return_single: + local_irq_save(flags); if (unlikely(c->page)) flush_slab(s, c); c->page = page; From c66f238aa17b8f2880ddf12d9df591a326f9f010 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 24 Aug 2021 09:58:58 +1000 Subject: [PATCH 031/365] mm, slub: stop disabling irqs around get_partial() The function get_partial() does not need to have irqs disabled as a whole. It's sufficient to convert spin_lock operations to their irq saving/restoring versions. As a result, it's now possible to reach the page allocator from the slab allocator without disabling and re-enabling interrupts on the way. Link: https://lkml.kernel.org/r/20210805152000.12817-19-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Christoph Lameter Cc: David Rientjes Cc: Jann Horn Cc: Jesper Dangaard Brouer Cc: Joonsoo Kim Cc: Mel Gorman Cc: Mike Galbraith Cc: Pekka Enberg Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/slub.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index d5ac278084cfb..ad2c58a31af5a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1996,11 +1996,12 @@ static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags); * Try to allocate a partial slab from a specific node. */ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, - struct page **ret_page, gfp_t flags) + struct page **ret_page, gfp_t gfpflags) { struct page *page, *page2; void *object = NULL; unsigned int available = 0; + unsigned long flags; int objects; /* @@ -2012,11 +2013,11 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, if (!n || !n->nr_partial) return NULL; - spin_lock(&n->list_lock); + spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry_safe(page, page2, &n->partial, slab_list) { void *t; - if (!pfmemalloc_match(page, flags)) + if (!pfmemalloc_match(page, gfpflags)) continue; t = acquire_slab(s, n, page, object == NULL, &objects); @@ -2037,7 +2038,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, break; } - spin_unlock(&n->list_lock); + spin_unlock_irqrestore(&n->list_lock, flags); return object; } @@ -2765,8 +2766,10 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, local_irq_restore(flags); goto reread_page; } - if (unlikely(!slub_percpu_partial(c))) + if (unlikely(!slub_percpu_partial(c))) { + local_irq_restore(flags); goto new_objects; /* stolen by an IRQ handler */ + } page = c->page = slub_percpu_partial(c); slub_set_percpu_partial(c, page); @@ -2775,18 +2778,9 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, goto redo; } - local_irq_save(flags); - if (unlikely(c->page)) { - local_irq_restore(flags); - goto reread_page; - } - new_objects: - lockdep_assert_irqs_disabled(); - freelist = get_partial(s, gfpflags, node, &page); - local_irq_restore(flags); if (freelist) goto check_new_page; From 66d2a24680a1db9a5edd2374a07c747feaa6f014 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 24 Aug 2021 09:58:58 +1000 Subject: [PATCH 032/365] mm, slub: move reset of c->page and freelist out of deactivate_slab() deactivate_slab() removes the cpu slab by merging the cpu freelist with slab's freelist and putting the slab on the proper node's list. It also sets the respective kmem_cache_cpu pointers to NULL. By extracting the kmem_cache_cpu operations from the function, we can make it not dependent on disabled irqs. Also if we return a single free pointer from ___slab_alloc, we no longer have to assign kmem_cache_cpu.page before deactivation or care if somebody preempted us and assigned a different page to our kmem_cache_cpu in the process. Link: https://lkml.kernel.org/r/20210805152000.12817-20-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Christoph Lameter Cc: David Rientjes Cc: Jann Horn Cc: Jesper Dangaard Brouer Cc: Joonsoo Kim Cc: Mel Gorman Cc: Mike Galbraith Cc: Pekka Enberg Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/slub.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index ad2c58a31af5a..ef36c94f27e2b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2195,10 +2195,13 @@ static void init_kmem_cache_cpus(struct kmem_cache *s) } /* - * Remove the cpu slab + * Finishes removing the cpu slab. Merges cpu's freelist with page's freelist, + * unfreezes the slabs and puts it on the proper list. + * Assumes the slab has been already safely taken away from kmem_cache_cpu + * by the caller. */ static void deactivate_slab(struct kmem_cache *s, struct page *page, - void *freelist, struct kmem_cache_cpu *c) + void *freelist) { enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; struct kmem_cache_node *n = get_node(s, page_to_nid(page)); @@ -2327,9 +2330,6 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, discard_slab(s, page); stat(s, FREE_SLAB); } - - c->page = NULL; - c->freelist = NULL; } /* @@ -2454,10 +2454,16 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) { - stat(s, CPUSLAB_FLUSH); - deactivate_slab(s, c->page, c->freelist, c); + void *freelist = c->freelist; + struct page *page = c->page; + c->page = NULL; + c->freelist = NULL; c->tid = next_tid(c->tid); + + deactivate_slab(s, page, freelist); + + stat(s, CPUSLAB_FLUSH); } /* @@ -2755,7 +2761,10 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, local_irq_restore(flags); goto reread_page; } - deactivate_slab(s, page, c->freelist, c); + freelist = c->freelist; + c->page = NULL; + c->freelist = NULL; + deactivate_slab(s, page, freelist); local_irq_restore(flags); new_slab: @@ -2834,11 +2843,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, return_single: local_irq_save(flags); - if (unlikely(c->page)) - flush_slab(s, c); - c->page = page; - - deactivate_slab(s, page, get_freepointer(s, freelist), c); + deactivate_slab(s, page, get_freepointer(s, freelist)); local_irq_restore(flags); return freelist; } From 25e4f9ada2f653b9aab60393aeda83d4f07684d1 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 24 Aug 2021 09:58:59 +1000 Subject: [PATCH 033/365] mm, slub: make locking in deactivate_slab() irq-safe dectivate_slab() now no longer touches the kmem_cache_cpu structure, so it will be possible to call it with irqs enabled. Just convert the spin_lock calls to their irq saving/restoring variants to make it irq-safe. Note we now have to use cmpxchg_double_slab() for irq-safe slab_lock(), because in some situations we don't take the list_lock, which would disable irqs. Link: https://lkml.kernel.org/r/20210805152000.12817-21-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Christoph Lameter Cc: David Rientjes Cc: Jann Horn Cc: Jesper Dangaard Brouer Cc: Joonsoo Kim Cc: Mel Gorman Cc: Mike Galbraith Cc: Pekka Enberg Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/slub.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index ef36c94f27e2b..00a96e9030e50 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2209,6 +2209,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, enum slab_modes l = M_NONE, m = M_NONE; void *nextfree, *freelist_iter, *freelist_tail; int tail = DEACTIVATE_TO_HEAD; + unsigned long flags = 0; struct page new; struct page old; @@ -2284,7 +2285,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, * that acquire_slab() will see a slab page that * is frozen */ - spin_lock(&n->list_lock); + spin_lock_irqsave(&n->list_lock, flags); } } else { m = M_FULL; @@ -2295,7 +2296,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, * slabs from diagnostic functions will not see * any frozen slabs. */ - spin_lock(&n->list_lock); + spin_lock_irqsave(&n->list_lock, flags); } } @@ -2312,14 +2313,14 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, } l = m; - if (!__cmpxchg_double_slab(s, page, + if (!cmpxchg_double_slab(s, page, old.freelist, old.counters, new.freelist, new.counters, "unfreezing slab")) goto redo; if (lock) - spin_unlock(&n->list_lock); + spin_unlock_irqrestore(&n->list_lock, flags); if (m == M_PARTIAL) stat(s, tail); From a7d52f9726e76a8f0f40f1b6d3798414f2f71e2b Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 24 Aug 2021 09:58:59 +1000 Subject: [PATCH 034/365] mm, slub: call deactivate_slab() without disabling irqs The function is now safe to be called with irqs enabled, so move the calls outside of irq disabled sections. When called from ___slab_alloc() -> flush_slab() we have irqs disabled, so to reenable them before deactivate_slab() we need to open-code flush_slab() in ___slab_alloc() and reenable irqs after modifying the kmem_cache_cpu fields. But that means a IRQ handler meanwhile might have assigned a new page to kmem_cache_cpu.page so we have to retry the whole check. The remaining callers of flush_slab() are the IPI handler which has disabled irqs anyway, and slub_cpu_dead() which will be dealt with in the following patch. Link: https://lkml.kernel.org/r/20210805152000.12817-22-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Christoph Lameter Cc: David Rientjes Cc: Jann Horn Cc: Jesper Dangaard Brouer Cc: Joonsoo Kim Cc: Mel Gorman Cc: Mike Galbraith Cc: Pekka Enberg Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/slub.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 00a96e9030e50..44c4de4b37551 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2765,8 +2765,8 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, freelist = c->freelist; c->page = NULL; c->freelist = NULL; - deactivate_slab(s, page, freelist); local_irq_restore(flags); + deactivate_slab(s, page, freelist); new_slab: @@ -2834,18 +2834,32 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, */ goto return_single; +retry_load_page: + local_irq_save(flags); - if (unlikely(c->page)) - flush_slab(s, c); + if (unlikely(c->page)) { + void *flush_freelist = c->freelist; + struct page *flush_page = c->page; + + c->page = NULL; + c->freelist = NULL; + c->tid = next_tid(c->tid); + + local_irq_restore(flags); + + deactivate_slab(s, flush_page, flush_freelist); + + stat(s, CPUSLAB_FLUSH); + + goto retry_load_page; + } c->page = page; goto load_freelist; return_single: - local_irq_save(flags); deactivate_slab(s, page, get_freepointer(s, freelist)); - local_irq_restore(flags); return freelist; } From 84a3db0c52be61c2ca4d3943ae8ab627fc2da24b Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 24 Aug 2021 09:58:59 +1000 Subject: [PATCH 035/365] mm, slub: move irq control into unfreeze_partials() unfreeze_partials() can be optimized so that it doesn't need irqs disabled for the whole time. As the first step, move irq control into the function and remove it from the put_cpu_partial() caller. Link: https://lkml.kernel.org/r/20210805152000.12817-23-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Christoph Lameter Cc: David Rientjes Cc: Jann Horn Cc: Jesper Dangaard Brouer Cc: Joonsoo Kim Cc: Mel Gorman Cc: Mike Galbraith Cc: Pekka Enberg Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/slub.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 44c4de4b37551..b1c120d3d461a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2336,9 +2336,8 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, /* * Unfreeze all the cpu partial slabs. * - * This function must be called with interrupts disabled - * for the cpu using c (or some other guarantee must be there - * to guarantee no concurrent accesses). + * This function must be called with preemption or migration + * disabled with c local to the cpu. */ static void unfreeze_partials(struct kmem_cache *s, struct kmem_cache_cpu *c) @@ -2346,6 +2345,9 @@ static void unfreeze_partials(struct kmem_cache *s, #ifdef CONFIG_SLUB_CPU_PARTIAL struct kmem_cache_node *n = NULL, *n2 = NULL; struct page *page, *discard_page = NULL; + unsigned long flags; + + local_irq_save(flags); while ((page = slub_percpu_partial(c))) { struct page new; @@ -2398,6 +2400,8 @@ static void unfreeze_partials(struct kmem_cache *s, discard_slab(s, page); stat(s, FREE_SLAB); } + + local_irq_restore(flags); #endif /* CONFIG_SLUB_CPU_PARTIAL */ } @@ -2425,14 +2429,11 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) pobjects = oldpage->pobjects; pages = oldpage->pages; if (drain && pobjects > slub_cpu_partial(s)) { - unsigned long flags; /* * partial array is full. Move the existing * set to the per node partial list. */ - local_irq_save(flags); unfreeze_partials(s, this_cpu_ptr(s->cpu_slab)); - local_irq_restore(flags); oldpage = NULL; pobjects = 0; pages = 0; From c686ea1bcf84512d555f6aafbaf54631b023574c Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 24 Aug 2021 09:59:00 +1000 Subject: [PATCH 036/365] mm, slub: discard slabs in unfreeze_partials() without irqs disabled No need for disabled irqs when discarding slabs, so restore them before discarding. Link: https://lkml.kernel.org/r/20210805152000.12817-24-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Christoph Lameter Cc: David Rientjes Cc: Jann Horn Cc: Jesper Dangaard Brouer Cc: Joonsoo Kim Cc: Mel Gorman Cc: Mike Galbraith Cc: Pekka Enberg Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/slub.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index b1c120d3d461a..240b22328212b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2392,6 +2392,8 @@ static void unfreeze_partials(struct kmem_cache *s, if (n) spin_unlock(&n->list_lock); + local_irq_restore(flags); + while (discard_page) { page = discard_page; discard_page = discard_page->next; @@ -2401,7 +2403,6 @@ static void unfreeze_partials(struct kmem_cache *s, stat(s, FREE_SLAB); } - local_irq_restore(flags); #endif /* CONFIG_SLUB_CPU_PARTIAL */ } From c9c73fec8db3c827d0d0d4c1587ef180f18b9412 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 24 Aug 2021 09:59:00 +1000 Subject: [PATCH 037/365] mm, slub: detach whole partial list at once in unfreeze_partials() Instead of iterating through the live percpu partial list, detach it from the kmem_cache_cpu at once. This is simpler and will allow further optimization. Link: https://lkml.kernel.org/r/20210805152000.12817-25-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Christoph Lameter Cc: David Rientjes Cc: Jann Horn Cc: Jesper Dangaard Brouer Cc: Joonsoo Kim Cc: Mel Gorman Cc: Mike Galbraith Cc: Pekka Enberg Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/slub.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 240b22328212b..d8bfc41dc1f0f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2344,16 +2344,20 @@ static void unfreeze_partials(struct kmem_cache *s, { #ifdef CONFIG_SLUB_CPU_PARTIAL struct kmem_cache_node *n = NULL, *n2 = NULL; - struct page *page, *discard_page = NULL; + struct page *page, *partial_page, *discard_page = NULL; unsigned long flags; local_irq_save(flags); - while ((page = slub_percpu_partial(c))) { + partial_page = slub_percpu_partial(c); + c->partial = NULL; + + while (partial_page) { struct page new; struct page old; - slub_set_percpu_partial(c, page); + page = partial_page; + partial_page = page->next; n2 = get_node(s, page_to_nid(page)); if (n != n2) { From c60f86b7f775cdf7603175681609e1a9bc387c3a Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 24 Aug 2021 09:59:00 +1000 Subject: [PATCH 038/365] mm, slub: separate detaching of partial list in unfreeze_partials() from unfreezing Unfreezing partial list can be split to two phases - detaching the list from struct kmem_cache_cpu, and processing the list. The whole operation does not need to be protected by disabled irqs. Restructure the code to separate the detaching (with disabled irqs) and unfreezing (with irq disabling to be reduced in the next patch). Also, unfreeze_partials() can be called from another cpu on behalf of a cpu that is being offlined, where disabling irqs on the local cpu has no sense, so restructure the code as follows: - __unfreeze_partials() is the bulk of unfreeze_partials() that processes the detached percpu partial list - unfreeze_partials() detaches list from current cpu with irqs disabled and calls __unfreeze_partials() - unfreeze_partials_cpu() is to be called for the offlined cpu so it needs no irq disabling, and is called from __flush_cpu_slab() - flush_cpu_slab() is for the local cpu thus it needs to call unfreeze_partials(). So it can't simply call __flush_cpu_slab(smp_processor_id()) anymore and we have to open-code the proper calls. Link: https://lkml.kernel.org/r/20210805152000.12817-26-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Christoph Lameter Cc: David Rientjes Cc: Jann Horn Cc: Jesper Dangaard Brouer Cc: Joonsoo Kim Cc: Mel Gorman Cc: Mike Galbraith Cc: Pekka Enberg Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/slub.c | 73 ++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 51 insertions(+), 22 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index d8bfc41dc1f0f..5f57e36e93ef9 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2333,25 +2333,15 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, } } -/* - * Unfreeze all the cpu partial slabs. - * - * This function must be called with preemption or migration - * disabled with c local to the cpu. - */ -static void unfreeze_partials(struct kmem_cache *s, - struct kmem_cache_cpu *c) -{ #ifdef CONFIG_SLUB_CPU_PARTIAL +static void __unfreeze_partials(struct kmem_cache *s, struct page *partial_page) +{ struct kmem_cache_node *n = NULL, *n2 = NULL; - struct page *page, *partial_page, *discard_page = NULL; + struct page *page, *discard_page = NULL; unsigned long flags; local_irq_save(flags); - partial_page = slub_percpu_partial(c); - c->partial = NULL; - while (partial_page) { struct page new; struct page old; @@ -2406,10 +2396,45 @@ static void unfreeze_partials(struct kmem_cache *s, discard_slab(s, page); stat(s, FREE_SLAB); } +} -#endif /* CONFIG_SLUB_CPU_PARTIAL */ +/* + * Unfreeze all the cpu partial slabs. + */ +static void unfreeze_partials(struct kmem_cache *s) +{ + struct page *partial_page; + unsigned long flags; + + local_irq_save(flags); + partial_page = this_cpu_read(s->cpu_slab->partial); + this_cpu_write(s->cpu_slab->partial, NULL); + local_irq_restore(flags); + + if (partial_page) + __unfreeze_partials(s, partial_page); } +static void unfreeze_partials_cpu(struct kmem_cache *s, + struct kmem_cache_cpu *c) +{ + struct page *partial_page; + + partial_page = slub_percpu_partial(c); + c->partial = NULL; + + if (partial_page) + __unfreeze_partials(s, partial_page); +} + +#else /* CONFIG_SLUB_CPU_PARTIAL */ + +static inline void unfreeze_partials(struct kmem_cache *s) { } +static inline void unfreeze_partials_cpu(struct kmem_cache *s, + struct kmem_cache_cpu *c) { } + +#endif /* CONFIG_SLUB_CPU_PARTIAL */ + /* * Put a page that was just frozen (in __slab_free|get_partial_node) into a * partial page slot if available. @@ -2438,7 +2463,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) * partial array is full. Move the existing * set to the per node partial list. */ - unfreeze_partials(s, this_cpu_ptr(s->cpu_slab)); + unfreeze_partials(s); oldpage = NULL; pobjects = 0; pages = 0; @@ -2473,11 +2498,6 @@ static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) stat(s, CPUSLAB_FLUSH); } -/* - * Flush cpu slab. - * - * Called from IPI handler with interrupts disabled. - */ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); @@ -2485,14 +2505,23 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) if (c->page) flush_slab(s, c); - unfreeze_partials(s, c); + unfreeze_partials_cpu(s, c); } +/* + * Flush cpu slab. + * + * Called from IPI handler with interrupts disabled. + */ static void flush_cpu_slab(void *d) { struct kmem_cache *s = d; + struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); - __flush_cpu_slab(s, smp_processor_id()); + if (c->page) + flush_slab(s, c); + + unfreeze_partials(s); } static bool has_cpu_slab(int cpu, void *info) From 09b2bd32eff32a5dc6156565445a232f02182aa0 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 24 Aug 2021 09:59:00 +1000 Subject: [PATCH 039/365] mm, slub: only disable irq with spin_lock in __unfreeze_partials() __unfreeze_partials() no longer needs to have irqs disabled, except for making the spin_lock operations irq-safe, so convert the spin_locks operations and remove the separate irq handling. Link: https://lkml.kernel.org/r/20210805152000.12817-27-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Christoph Lameter Cc: David Rientjes Cc: Jann Horn Cc: Jesper Dangaard Brouer Cc: Joonsoo Kim Cc: Mel Gorman Cc: Mike Galbraith Cc: Pekka Enberg Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/slub.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 5f57e36e93ef9..fc09ec3563b4d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2338,9 +2338,7 @@ static void __unfreeze_partials(struct kmem_cache *s, struct page *partial_page) { struct kmem_cache_node *n = NULL, *n2 = NULL; struct page *page, *discard_page = NULL; - unsigned long flags; - - local_irq_save(flags); + unsigned long flags = 0; while (partial_page) { struct page new; @@ -2352,10 +2350,10 @@ static void __unfreeze_partials(struct kmem_cache *s, struct page *partial_page) n2 = get_node(s, page_to_nid(page)); if (n != n2) { if (n) - spin_unlock(&n->list_lock); + spin_unlock_irqrestore(&n->list_lock, flags); n = n2; - spin_lock(&n->list_lock); + spin_lock_irqsave(&n->list_lock, flags); } do { @@ -2384,9 +2382,7 @@ static void __unfreeze_partials(struct kmem_cache *s, struct page *partial_page) } if (n) - spin_unlock(&n->list_lock); - - local_irq_restore(flags); + spin_unlock_irqrestore(&n->list_lock, flags); while (discard_page) { page = discard_page; From 1e571e786dd50492d99586f8bbc48dfa86b7a08d Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 24 Aug 2021 09:59:01 +1000 Subject: [PATCH 040/365] mm, slub: don't disable irqs in slub_cpu_dead() slub_cpu_dead() cleans up for an offlined cpu from another cpu and calls only functions that are now irq safe, so we don't need to disable irqs anymore. Link: https://lkml.kernel.org/r/20210805152000.12817-28-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Christoph Lameter Cc: David Rientjes Cc: Jann Horn Cc: Jesper Dangaard Brouer Cc: Joonsoo Kim Cc: Mel Gorman Cc: Mike Galbraith Cc: Pekka Enberg Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/slub.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index fc09ec3563b4d..ddcaddc5e8017 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2540,14 +2540,10 @@ static void flush_all(struct kmem_cache *s) static int slub_cpu_dead(unsigned int cpu) { struct kmem_cache *s; - unsigned long flags; mutex_lock(&slab_mutex); - list_for_each_entry(s, &slab_caches, list) { - local_irq_save(flags); + list_for_each_entry(s, &slab_caches, list) __flush_cpu_slab(s, cpu); - local_irq_restore(flags); - } mutex_unlock(&slab_mutex); return 0; } From 71699265dd7f1fe3ab36d19bbae4cb47da4f2736 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 24 Aug 2021 09:59:01 +1000 Subject: [PATCH 041/365] mm, slab: make flush_slab() possible to call with irqs enabled Currently flush_slab() is always called with disabled IRQs if it's needed, but the following patches will change that, so add a parameter to control IRQ disabling within the function, which only protects the kmem_cache_cpu manipulation and not the call to deactivate_slab() which doesn't need it. Link: https://lkml.kernel.org/r/20210805152000.12817-29-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Christoph Lameter Cc: David Rientjes Cc: Jann Horn Cc: Jesper Dangaard Brouer Cc: Joonsoo Kim Cc: Mel Gorman Cc: Mike Galbraith Cc: Pekka Enberg Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/slub.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index ddcaddc5e8017..bb7e055437389 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2480,16 +2480,28 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) #endif /* CONFIG_SLUB_CPU_PARTIAL */ } -static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) +static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c, + bool lock) { - void *freelist = c->freelist; - struct page *page = c->page; + unsigned long flags; + void *freelist; + struct page *page; + + if (lock) + local_irq_save(flags); + + freelist = c->freelist; + page = c->page; c->page = NULL; c->freelist = NULL; c->tid = next_tid(c->tid); - deactivate_slab(s, page, freelist); + if (lock) + local_irq_restore(flags); + + if (page) + deactivate_slab(s, page, freelist); stat(s, CPUSLAB_FLUSH); } @@ -2499,7 +2511,7 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); if (c->page) - flush_slab(s, c); + flush_slab(s, c, false); unfreeze_partials_cpu(s, c); } @@ -2515,7 +2527,7 @@ static void flush_cpu_slab(void *d) struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); if (c->page) - flush_slab(s, c); + flush_slab(s, c, false); unfreeze_partials(s); } From 36600d98972e9d7eb1a8c1ce1acad423ada2b20e Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 24 Aug 2021 09:59:01 +1000 Subject: [PATCH 042/365] mm: slub: move flush_cpu_slab() invocations __free_slab() invocations out of IRQ context flush_all() flushes a specific SLAB cache on each CPU (where the cache is present). The deactivate_slab()/__free_slab() invocation happens within IPI handler and is problematic for PREEMPT_RT. The flush operation is not a frequent operation or a hot path. The per-CPU flush operation can be moved to within a workqueue. [vbabka@suse.cz: adapt to new SLUB changes] Link: https://lkml.kernel.org/r/20210805152000.12817-30-vbabka@suse.cz Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Vlastimil Babka Cc: Christoph Lameter Cc: David Rientjes Cc: Jann Horn Cc: Jesper Dangaard Brouer Cc: Joonsoo Kim Cc: Mel Gorman Cc: Mike Galbraith Cc: Pekka Enberg Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/slub.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 48 insertions(+), 8 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index bb7e055437389..0da7074f4c9a4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2516,33 +2516,73 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) unfreeze_partials_cpu(s, c); } +struct slub_flush_work { + struct work_struct work; + struct kmem_cache *s; + bool skip; +}; + /* * Flush cpu slab. * - * Called from IPI handler with interrupts disabled. + * Called from CPU work handler with migration disabled. */ -static void flush_cpu_slab(void *d) +static void flush_cpu_slab(struct work_struct *w) { - struct kmem_cache *s = d; - struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); + struct kmem_cache *s; + struct kmem_cache_cpu *c; + struct slub_flush_work *sfw; + + sfw = container_of(w, struct slub_flush_work, work); + + s = sfw->s; + c = this_cpu_ptr(s->cpu_slab); if (c->page) - flush_slab(s, c, false); + flush_slab(s, c, true); unfreeze_partials(s); } -static bool has_cpu_slab(int cpu, void *info) +static bool has_cpu_slab(int cpu, struct kmem_cache *s) { - struct kmem_cache *s = info; struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); return c->page || slub_percpu_partial(c); } +static DEFINE_MUTEX(flush_lock); +static DEFINE_PER_CPU(struct slub_flush_work, slub_flush); + static void flush_all(struct kmem_cache *s) { - on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1); + struct slub_flush_work *sfw; + unsigned int cpu; + + mutex_lock(&flush_lock); + cpus_read_lock(); + + for_each_online_cpu(cpu) { + sfw = &per_cpu(slub_flush, cpu); + if (!has_cpu_slab(cpu, s)) { + sfw->skip = true; + continue; + } + INIT_WORK(&sfw->work, flush_cpu_slab); + sfw->skip = false; + sfw->s = s; + schedule_work_on(cpu, &sfw->work); + } + + for_each_online_cpu(cpu) { + sfw = &per_cpu(slub_flush, cpu); + if (sfw->skip) + continue; + flush_work(&sfw->work); + } + + cpus_read_unlock(); + mutex_unlock(&flush_lock); } /* From cc2ef29c3d30a886348e79941ef093a463157473 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 24 Aug 2021 09:59:01 +1000 Subject: [PATCH 043/365] mm, slub: fix memory and cpu hotplug related lock ordering issues Qian Cai reported [1] a lockdep splat on memory offline. [ 91.374541] WARNING: possible circular locking dependency detected [ 91.381411] 5.14.0-rc5-next-20210809+ #84 Not tainted [ 91.387149] ------------------------------------------------------ [ 91.394016] lsbug/1523 is trying to acquire lock: [ 91.399406] ffff800018e76530 (flush_lock){+.+.}-{3:3}, at: flush_all+0x50/0x1c8 [ 91.407425] but task is already holding lock: [ 91.414638] ffff800018e48468 (slab_mutex){+.+.}-{3:3}, at: slab_memory_callback+0x44/0x280 [ 91.423603] which lock already depends on the new lock. To fix it, we need to change the order in flush_all() so that cpus_read_lock() is first and mutex_lock(&flush_lock) second. Also when called from slab_mem_going_offline_callback() we are already under cpus_read_lock() and cannot take it again, so create a flush_all_cpus_locked() variant and decouple flushing from actual shrinking for this call path. Additionally, Mike Galbraith reported [2] wrong order of cpus_read_lock() and slab_mutex in kmem_cache_destroy() path and proposed a fix to reverse it. This patch is a fixup for the mmotm patch mm-slub-move-flush_cpu_slab-invocations-__free_slab-invocations-out-of-irq-context.patch [1] https://lore.kernel.org/lkml/0b36128c-3e12-77df-85fe-a153a714569b@quicinc.com/ [2] https://lore.kernel.org/lkml/2eb3cf340716c40f03a0a342ab40219b3d1de195.camel@gmx.de/ Link: https://lkml.kernel.org/r/50fe26ba-450b-af57-506d-438f67cfbce3@suse.cz Reported-by: Qian Cai Reported-by: Mike Galbraith Signed-off-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/slab_common.c | 2 ++ mm/slub.c | 29 +++++++++++++++++++++-------- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index 1c673c323baf2..ec2bb0beed757 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -502,6 +502,7 @@ void kmem_cache_destroy(struct kmem_cache *s) if (unlikely(!s)) return; + cpus_read_lock(); mutex_lock(&slab_mutex); s->refcount--; @@ -516,6 +517,7 @@ void kmem_cache_destroy(struct kmem_cache *s) } out_unlock: mutex_unlock(&slab_mutex); + cpus_read_unlock(); } EXPORT_SYMBOL(kmem_cache_destroy); diff --git a/mm/slub.c b/mm/slub.c index 0da7074f4c9a4..5795e423483b3 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2554,13 +2554,13 @@ static bool has_cpu_slab(int cpu, struct kmem_cache *s) static DEFINE_MUTEX(flush_lock); static DEFINE_PER_CPU(struct slub_flush_work, slub_flush); -static void flush_all(struct kmem_cache *s) +static void flush_all_cpus_locked(struct kmem_cache *s) { struct slub_flush_work *sfw; unsigned int cpu; + lockdep_assert_cpus_held(); mutex_lock(&flush_lock); - cpus_read_lock(); for_each_online_cpu(cpu) { sfw = &per_cpu(slub_flush, cpu); @@ -2581,10 +2581,16 @@ static void flush_all(struct kmem_cache *s) flush_work(&sfw->work); } - cpus_read_unlock(); mutex_unlock(&flush_lock); } +static void flush_all(struct kmem_cache *s) +{ + cpus_read_lock(); + flush_all_cpus_locked(s); + cpus_read_unlock(); +} + /* * Use the cpu notifier to insure that the cpu slabs are flushed when * necessary. @@ -4127,7 +4133,7 @@ int __kmem_cache_shutdown(struct kmem_cache *s) int node; struct kmem_cache_node *n; - flush_all(s); + flush_all_cpus_locked(s); /* Attempt to free all objects */ for_each_kmem_cache_node(s, node, n) { free_partial(s, n); @@ -4403,7 +4409,7 @@ EXPORT_SYMBOL(kfree); * being allocated from last increasing the chance that the last objects * are freed in them. */ -int __kmem_cache_shrink(struct kmem_cache *s) +int __kmem_cache_do_shrink(struct kmem_cache *s) { int node; int i; @@ -4415,7 +4421,6 @@ int __kmem_cache_shrink(struct kmem_cache *s) unsigned long flags; int ret = 0; - flush_all(s); for_each_kmem_cache_node(s, node, n) { INIT_LIST_HEAD(&discard); for (i = 0; i < SHRINK_PROMOTE_MAX; i++) @@ -4465,13 +4470,21 @@ int __kmem_cache_shrink(struct kmem_cache *s) return ret; } +int __kmem_cache_shrink(struct kmem_cache *s) +{ + flush_all(s); + return __kmem_cache_do_shrink(s); +} + static int slab_mem_going_offline_callback(void *arg) { struct kmem_cache *s; mutex_lock(&slab_mutex); - list_for_each_entry(s, &slab_caches, list) - __kmem_cache_shrink(s); + list_for_each_entry(s, &slab_caches, list) { + flush_all_cpus_locked(s); + __kmem_cache_do_shrink(s); + } mutex_unlock(&slab_mutex); return 0; From d2acb8e8d240643dc6c4684b40cb687dbd3c9590 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 24 Aug 2021 09:59:01 +1000 Subject: [PATCH 044/365] mm, slub: fix memory and cpu hotplug related lock ordering issues - fix Make __kmem_cache_do_shrink static to silence "no previous prototype" warning. Link: https://lkml.kernel.org/r/60e50d8c-ccfa-1036-5a03-2b1a9d25f958@suse.cz Signed-off-by: Vlastimil Babka Reported-by: kernel test robot Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index 5795e423483b3..29d71ae20c90c 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4409,7 +4409,7 @@ EXPORT_SYMBOL(kfree); * being allocated from last increasing the chance that the last objects * are freed in them. */ -int __kmem_cache_do_shrink(struct kmem_cache *s) +static int __kmem_cache_do_shrink(struct kmem_cache *s) { int node; int i; From 12429452d98acb9421335aa34d54a62e186498a0 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 24 Aug 2021 09:59:02 +1000 Subject: [PATCH 045/365] mm: slub: make object_map_lock a raw_spinlock_t The variable object_map is protected by object_map_lock. The lock is always acquired in debug code and within already atomic context Make object_map_lock a raw_spinlock_t. Link: https://lkml.kernel.org/r/20210805152000.12817-31-vbabka@suse.cz Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Vlastimil Babka Cc: Christoph Lameter Cc: David Rientjes Cc: Jann Horn Cc: Jesper Dangaard Brouer Cc: Joonsoo Kim Cc: Mel Gorman Cc: Mike Galbraith Cc: Pekka Enberg Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/slub.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 29d71ae20c90c..b4e0efbcb7092 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -438,7 +438,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, #ifdef CONFIG_SLUB_DEBUG static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)]; -static DEFINE_SPINLOCK(object_map_lock); +static DEFINE_RAW_SPINLOCK(object_map_lock); static void __fill_map(unsigned long *obj_map, struct kmem_cache *s, struct page *page) @@ -483,7 +483,7 @@ static unsigned long *get_map(struct kmem_cache *s, struct page *page) { VM_BUG_ON(!irqs_disabled()); - spin_lock(&object_map_lock); + raw_spin_lock(&object_map_lock); __fill_map(object_map, s, page); @@ -493,7 +493,7 @@ static unsigned long *get_map(struct kmem_cache *s, struct page *page) static void put_map(unsigned long *map) __releases(&object_map_lock) { VM_BUG_ON(map != object_map); - spin_unlock(&object_map_lock); + raw_spin_unlock(&object_map_lock); } static inline unsigned int size_from_object(struct kmem_cache *s) From 6f62eab387d6f2f814d39d32a8f2ee15b0493af9 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 24 Aug 2021 09:59:02 +1000 Subject: [PATCH 046/365] mm, slub: optionally save/restore irqs in slab_[un]lock()/ For PREEMPT_RT we will need to disable irqs for this bit spinlock. As a preparation, add a flags parameter, and an internal version that takes additional bool parameter to control irq saving/restoring (the flags parameter is compile-time unused if the bool is a constant false). Convert ___cmpxchg_double_slab(), which also comes with the same bool parameter, to use the internal version. Link: https://lkml.kernel.org/r/20210805152000.12817-32-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Christoph Lameter Cc: David Rientjes Cc: Jann Horn Cc: Jesper Dangaard Brouer Cc: Joonsoo Kim Cc: Mel Gorman Cc: Mike Galbraith Cc: Pekka Enberg Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/slub.c | 52 +++++++++++++++++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 19 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index b4e0efbcb7092..59722a4bb5468 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -359,16 +359,33 @@ static inline unsigned int oo_objects(struct kmem_cache_order_objects x) /* * Per slab locking using the pagelock */ -static __always_inline void slab_lock(struct page *page) +static __always_inline void +__slab_lock(struct page *page, unsigned long *flags, bool disable_irqs) { VM_BUG_ON_PAGE(PageTail(page), page); + if (disable_irqs) + local_irq_save(*flags); bit_spin_lock(PG_locked, &page->flags); } -static __always_inline void slab_unlock(struct page *page) +static __always_inline void +__slab_unlock(struct page *page, unsigned long *flags, bool disable_irqs) { VM_BUG_ON_PAGE(PageTail(page), page); __bit_spin_unlock(PG_locked, &page->flags); + if (disable_irqs) + local_irq_restore(*flags); +} + +static __always_inline void +slab_lock(struct page *page, unsigned long *flags) +{ + __slab_lock(page, flags, false); +} + +static __always_inline void slab_unlock(struct page *page, unsigned long *flags) +{ + __slab_unlock(page, flags, false); } static inline bool ___cmpxchg_double_slab(struct kmem_cache *s, struct page *page, @@ -388,23 +405,18 @@ static inline bool ___cmpxchg_double_slab(struct kmem_cache *s, struct page *pag } else #endif { - unsigned long flags; + /* init to 0 to prevent spurious warnings */ + unsigned long flags = 0; - if (disable_irqs) - local_irq_save(flags); - slab_lock(page); + __slab_lock(page, &flags, disable_irqs); if (page->freelist == freelist_old && page->counters == counters_old) { page->freelist = freelist_new; page->counters = counters_new; - slab_unlock(page); - if (disable_irqs) - local_irq_restore(flags); + __slab_unlock(page, &flags, disable_irqs); return true; } - slab_unlock(page); - if (disable_irqs) - local_irq_restore(flags); + __slab_unlock(page, &flags, disable_irqs); } cpu_relax(); @@ -1255,11 +1267,11 @@ static noinline int free_debug_processing( struct kmem_cache_node *n = get_node(s, page_to_nid(page)); void *object = head; int cnt = 0; - unsigned long flags; + unsigned long flags, flags2; int ret = 0; spin_lock_irqsave(&n->list_lock, flags); - slab_lock(page); + slab_lock(page, &flags2); if (s->flags & SLAB_CONSISTENCY_CHECKS) { if (!check_slab(s, page)) @@ -1292,7 +1304,7 @@ static noinline int free_debug_processing( slab_err(s, page, "Bulk freelist count(%d) invalid(%d)\n", bulk_cnt, cnt); - slab_unlock(page); + slab_unlock(page, &flags2); spin_unlock_irqrestore(&n->list_lock, flags); if (!ret) slab_fix(s, "Object at 0x%p not freed", object); @@ -4070,9 +4082,10 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, void *addr = page_address(page); unsigned long *map; void *p; + unsigned long flags; slab_err(s, page, text, s->name); - slab_lock(page); + slab_lock(page, &flags); map = get_map(s, page); for_each_object(p, s, addr, page->objects) { @@ -4083,7 +4096,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, } } put_map(map); - slab_unlock(page); + slab_unlock(page, &flags); #endif } @@ -4815,8 +4828,9 @@ static void validate_slab(struct kmem_cache *s, struct page *page, { void *p; void *addr = page_address(page); + unsigned long flags; - slab_lock(page); + slab_lock(page, &flags); if (!check_slab(s, page) || !on_freelist(s, page, NULL)) goto unlock; @@ -4831,7 +4845,7 @@ static void validate_slab(struct kmem_cache *s, struct page *page, break; } unlock: - slab_unlock(page); + slab_unlock(page, &flags); } static int validate_slab_node(struct kmem_cache *s, From 368fe299c2f5a9d459936b932c82a28e23feb288 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 24 Aug 2021 09:59:02 +1000 Subject: [PATCH 047/365] mm, slub: make slab_lock() disable irqs with PREEMPT_RT We need to disable irqs around slab_lock() (a bit spinlock) to make it irq-safe. The calls to slab_lock() are nested under spin_lock_irqsave() which doesn't disable irqs on PREEMPT_RT, so add explicit disabling with PREEMPT_RT. We also distinguish cmpxchg_double_slab() where we do the disabling explicitly and __cmpxchg_double_slab() for contexts with already disabled irqs. However these context are also typically spin_lock_irqsave() thus insufficient on PREEMPT_RT. Thus, change __cmpxchg_double_slab() to be same as cmpxchg_double_slab() on PREEMPT_RT. Link: https://lkml.kernel.org/r/20210805152000.12817-33-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Christoph Lameter Cc: David Rientjes Cc: Jann Horn Cc: Jesper Dangaard Brouer Cc: Joonsoo Kim Cc: Mel Gorman Cc: Mike Galbraith Cc: Pekka Enberg Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/slub.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 59722a4bb5468..3286defe4857d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -380,12 +380,12 @@ __slab_unlock(struct page *page, unsigned long *flags, bool disable_irqs) static __always_inline void slab_lock(struct page *page, unsigned long *flags) { - __slab_lock(page, flags, false); + __slab_lock(page, flags, IS_ENABLED(CONFIG_PREEMPT_RT)); } static __always_inline void slab_unlock(struct page *page, unsigned long *flags) { - __slab_unlock(page, flags, false); + __slab_unlock(page, flags, IS_ENABLED(CONFIG_PREEMPT_RT)); } static inline bool ___cmpxchg_double_slab(struct kmem_cache *s, struct page *page, @@ -429,14 +429,19 @@ static inline bool ___cmpxchg_double_slab(struct kmem_cache *s, struct page *pag return false; } -/* Interrupts must be disabled (for the fallback code to work right) */ +/* + * Interrupts must be disabled (for the fallback code to work right), typically + * by an _irqsave() lock variant. Except on PREEMPT_RT where locks are different + * so we disable interrupts explicitly here. + */ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page, void *freelist_old, unsigned long counters_old, void *freelist_new, unsigned long counters_new, const char *n) { return ___cmpxchg_double_slab(s, page, freelist_old, counters_old, - freelist_new, counters_new, n, false); + freelist_new, counters_new, n, + IS_ENABLED(CONFIG_PREEMPT_RT)); } static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, From 181314aade9e8cb57ea05532a4a109bef12906a9 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 24 Aug 2021 09:59:02 +1000 Subject: [PATCH 048/365] mm, slub: protect put_cpu_partial() with disabled irqs instead of cmpxchg Jann Horn reported [1] the following theoretically possible race: task A: put_cpu_partial() calls preempt_disable() task A: oldpage = this_cpu_read(s->cpu_slab->partial) interrupt: kfree() reaches unfreeze_partials() and discards the page task B (on another CPU): reallocates page as page cache task A: reads page->pages and page->pobjects, which are actually halves of the pointer page->lru.prev task B (on another CPU): frees page interrupt: allocates page as SLUB page and places it on the percpu partial list task A: this_cpu_cmpxchg() succeeds which would cause page->pages and page->pobjects to end up containing halves of pointers that would then influence when put_cpu_partial() happens and show up in root-only sysfs files. Maybe that's acceptable, I don't know. But there should probably at least be a comment for now to point out that we're reading union fields of a page that might be in a completely different state. Additionally, the this_cpu_cmpxchg() approach in put_cpu_partial() is only safe against s->cpu_slab->partial manipulation in ___slab_alloc() if the latter disables irqs, otherwise a __slab_free() in an irq handler could call put_cpu_partial() in the middle of ___slab_alloc() manipulating ->partial and corrupt it. This becomes an issue on RT after a local_lock is introduced in later patch. The fix means taking the local_lock also in put_cpu_partial() on RT. After debugging this issue, Mike Galbraith suggested [2] that to avoid different locking schemes on RT and !RT, we can just protect put_cpu_partial() with disabled irqs (to be converted to local_lock_irqsave() later) everywhere. This should be acceptable as it's not a fast path, and moving the actual partial unfreezing outside of the irq disabled section makes it short, and with the retry loop gone the code can be also simplified. In addition, the race reported by Jann should no longer be possible. [1] https://lore.kernel.org/lkml/CAG48ez1mvUuXwg0YPH5ANzhQLpbphqk-ZS+jbRz+H66fvm4FcA@mail.gmail.com/ [2] https://lore.kernel.org/linux-rt-users/e3470ab357b48bccfbd1f5133b982178a7d2befb.camel@gmx.de/ Link: https://lkml.kernel.org/r/20210805152000.12817-34-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reported-by: Jann Horn Suggested-by: Mike Galbraith Cc: Christoph Lameter Cc: David Rientjes Cc: Jesper Dangaard Brouer Cc: Joonsoo Kim Cc: Mel Gorman Cc: Pekka Enberg Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/slub.c | 81 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 44 insertions(+), 37 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 3286defe4857d..2362efab53df4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2006,7 +2006,12 @@ static inline void *acquire_slab(struct kmem_cache *s, return freelist; } +#ifdef CONFIG_SLUB_CPU_PARTIAL static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain); +#else +static inline void put_cpu_partial(struct kmem_cache *s, struct page *page, + int drain) { } +#endif static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags); /* @@ -2440,14 +2445,6 @@ static void unfreeze_partials_cpu(struct kmem_cache *s, __unfreeze_partials(s, partial_page); } -#else /* CONFIG_SLUB_CPU_PARTIAL */ - -static inline void unfreeze_partials(struct kmem_cache *s) { } -static inline void unfreeze_partials_cpu(struct kmem_cache *s, - struct kmem_cache_cpu *c) { } - -#endif /* CONFIG_SLUB_CPU_PARTIAL */ - /* * Put a page that was just frozen (in __slab_free|get_partial_node) into a * partial page slot if available. @@ -2457,46 +2454,56 @@ static inline void unfreeze_partials_cpu(struct kmem_cache *s, */ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) { -#ifdef CONFIG_SLUB_CPU_PARTIAL struct page *oldpage; - int pages; - int pobjects; + struct page *page_to_unfreeze = NULL; + unsigned long flags; + int pages = 0; + int pobjects = 0; - preempt_disable(); - do { - pages = 0; - pobjects = 0; - oldpage = this_cpu_read(s->cpu_slab->partial); + local_irq_save(flags); + + oldpage = this_cpu_read(s->cpu_slab->partial); - if (oldpage) { + if (oldpage) { + if (drain && oldpage->pobjects > slub_cpu_partial(s)) { + /* + * Partial array is full. Move the existing set to the + * per node partial list. Postpone the actual unfreezing + * outside of the critical section. + */ + page_to_unfreeze = oldpage; + oldpage = NULL; + } else { pobjects = oldpage->pobjects; pages = oldpage->pages; - if (drain && pobjects > slub_cpu_partial(s)) { - /* - * partial array is full. Move the existing - * set to the per node partial list. - */ - unfreeze_partials(s); - oldpage = NULL; - pobjects = 0; - pages = 0; - stat(s, CPU_PARTIAL_DRAIN); - } } + } - pages++; - pobjects += page->objects - page->inuse; + pages++; + pobjects += page->objects - page->inuse; - page->pages = pages; - page->pobjects = pobjects; - page->next = oldpage; + page->pages = pages; + page->pobjects = pobjects; + page->next = oldpage; - } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) - != oldpage); - preempt_enable(); -#endif /* CONFIG_SLUB_CPU_PARTIAL */ + this_cpu_write(s->cpu_slab->partial, page); + + local_irq_restore(flags); + + if (page_to_unfreeze) { + __unfreeze_partials(s, page_to_unfreeze); + stat(s, CPU_PARTIAL_DRAIN); + } } +#else /* CONFIG_SLUB_CPU_PARTIAL */ + +static inline void unfreeze_partials(struct kmem_cache *s) { } +static inline void unfreeze_partials_cpu(struct kmem_cache *s, + struct kmem_cache_cpu *c) { } + +#endif /* CONFIG_SLUB_CPU_PARTIAL */ + static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c, bool lock) { From 546a2c420cfa3d08a562f4cc1568fe892ab2fa7b Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 24 Aug 2021 09:59:02 +1000 Subject: [PATCH 049/365] mm, slub: use migrate_disable() on PREEMPT_RT We currently use preempt_disable() (directly or via get_cpu_ptr()) to stabilize the pointer to kmem_cache_cpu. On PREEMPT_RT this would be incompatible with the list_lock spinlock. We can use migrate_disable() instead, but that increases overhead on !PREEMPT_RT as it's an unconditional function call. In order to get the best available mechanism on both PREEMPT_RT and !PREEMPT_RT, introduce private slub_get_cpu_ptr() and slub_put_cpu_ptr() wrappers and use them. Link: https://lkml.kernel.org/r/20210805152000.12817-35-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Christoph Lameter Cc: David Rientjes Cc: Jann Horn Cc: Jesper Dangaard Brouer Cc: Joonsoo Kim Cc: Mel Gorman Cc: Mike Galbraith Cc: Pekka Enberg Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/slub.c | 39 ++++++++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 2362efab53df4..cd78b7844273c 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -118,6 +118,26 @@ * the fast path and disables lockless freelists. */ +/* + * We could simply use migrate_disable()/enable() but as long as it's a + * function call even on !PREEMPT_RT, use inline preempt_disable() there. + */ +#ifndef CONFIG_PREEMPT_RT +#define slub_get_cpu_ptr(var) get_cpu_ptr(var) +#define slub_put_cpu_ptr(var) put_cpu_ptr(var) +#else +#define slub_get_cpu_ptr(var) \ +({ \ + migrate_disable(); \ + this_cpu_ptr(var); \ +}) +#define slub_put_cpu_ptr(var) \ +do { \ + (void)(var); \ + migrate_enable(); \ +} while (0) +#endif + #ifdef CONFIG_SLUB_DEBUG #ifdef CONFIG_SLUB_DEBUG_ON DEFINE_STATIC_KEY_TRUE(slub_debug_enabled); @@ -2828,7 +2848,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, if (unlikely(!pfmemalloc_match_unsafe(page, gfpflags))) goto deactivate_slab; - /* must check again c->page in case IRQ handler changed it */ + /* must check again c->page in case we got preempted and it changed */ local_irq_save(flags); if (unlikely(page != c->page)) { local_irq_restore(flags); @@ -2887,7 +2907,8 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, } if (unlikely(!slub_percpu_partial(c))) { local_irq_restore(flags); - goto new_objects; /* stolen by an IRQ handler */ + /* we were preempted and partial list got empty */ + goto new_objects; } page = c->page = slub_percpu_partial(c); @@ -2903,9 +2924,9 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, if (freelist) goto check_new_page; - put_cpu_ptr(s->cpu_slab); + slub_put_cpu_ptr(s->cpu_slab); page = new_slab(s, gfpflags, node); - c = get_cpu_ptr(s->cpu_slab); + c = slub_get_cpu_ptr(s->cpu_slab); if (unlikely(!page)) { slab_out_of_memory(s, gfpflags, node); @@ -2988,12 +3009,12 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, * cpu before disabling preemption. Need to reload cpu area * pointer. */ - c = get_cpu_ptr(s->cpu_slab); + c = slub_get_cpu_ptr(s->cpu_slab); #endif p = ___slab_alloc(s, gfpflags, node, addr, c); #ifdef CONFIG_PREEMPT_COUNT - put_cpu_ptr(s->cpu_slab); + slub_put_cpu_ptr(s->cpu_slab); #endif return p; } @@ -3522,7 +3543,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, * IRQs, which protects against PREEMPT and interrupts * handlers invoking normal fastpath. */ - c = get_cpu_ptr(s->cpu_slab); + c = slub_get_cpu_ptr(s->cpu_slab); local_irq_disable(); for (i = 0; i < size; i++) { @@ -3568,7 +3589,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, } c->tid = next_tid(c->tid); local_irq_enable(); - put_cpu_ptr(s->cpu_slab); + slub_put_cpu_ptr(s->cpu_slab); /* * memcg and kmem_cache debug support and memory initialization. @@ -3578,7 +3599,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, slab_want_init_on_alloc(flags, s)); return i; error: - put_cpu_ptr(s->cpu_slab); + slub_put_cpu_ptr(s->cpu_slab); slab_post_alloc_hook(s, objcg, flags, i, p, false); __kmem_cache_free_bulk(s, i, p); return 0; From f9ae4c7df502f03d6d68da10458aa0a6ed7713f4 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 24 Aug 2021 09:59:03 +1000 Subject: [PATCH 050/365] mm, slub: convert kmem_cpu_slab protection to local_lock Embed local_lock into struct kmem_cpu_slab and use the irq-safe versions of local_lock instead of plain local_irq_save/restore. On !PREEMPT_RT that's equivalent, with better lockdep visibility. On PREEMPT_RT that means better preemption. However, the cost on PREEMPT_RT is the loss of lockless fast paths which only work with cpu freelist. Those are designed to detect and recover from being preempted by other conflicting operations (both fast or slow path), but the slow path operations assume they cannot be preempted by a fast path operation, which is guaranteed naturally with disabled irqs. With local locks on PREEMPT_RT, the fast paths now also need to take the local lock to avoid races. In the allocation fastpath slab_alloc_node() we can just defer to the slowpath __slab_alloc() which also works with cpu freelist, but under the local lock. In the free fastpath do_slab_free() we have to add a new local lock protected version of freeing to the cpu freelist, as the existing slowpath only works with the page freelist. Also update the comment about locking scheme in SLUB to reflect changes done by this series. [efault@gmx.de: use local_lock() without irq in PREEMPT_RT scope, debugging of RT crashes resulting in put_cpu_partial() locking changes] Link: https://lkml.kernel.org/r/20210805152000.12817-36-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Christoph Lameter Cc: David Rientjes Cc: Jann Horn Cc: Jesper Dangaard Brouer Cc: Joonsoo Kim Cc: Mel Gorman Cc: Pekka Enberg Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/slub_def.h | 2 + mm/slub.c | 146 ++++++++++++++++++++++++++++++--------- 2 files changed, 115 insertions(+), 33 deletions(-) diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index dcde82a4434ca..b5bcac29b979c 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -10,6 +10,7 @@ #include #include #include +#include enum stat_item { ALLOC_FASTPATH, /* Allocation from cpu slab */ @@ -41,6 +42,7 @@ enum stat_item { NR_SLUB_STAT_ITEMS }; struct kmem_cache_cpu { + local_lock_t lock; /* Protects the fields below except stat */ void **freelist; /* Pointer to next available object */ unsigned long tid; /* Globally unique transaction id */ struct page *page; /* The slab from which we are allocating */ diff --git a/mm/slub.c b/mm/slub.c index cd78b7844273c..be57687062aa1 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -46,13 +46,21 @@ /* * Lock order: * 1. slab_mutex (Global Mutex) - * 2. node->list_lock - * 3. slab_lock(page) (Only on some arches and for debugging) + * 2. node->list_lock (Spinlock) + * 3. kmem_cache->cpu_slab->lock (Local lock) + * 4. slab_lock(page) (Only on some arches or for debugging) + * 5. object_map_lock (Only for debugging) * * slab_mutex * * The role of the slab_mutex is to protect the list of all the slabs * and to synchronize major metadata changes to slab cache structures. + * Also synchronizes memory hotplug callbacks. + * + * slab_lock + * + * The slab_lock is a wrapper around the page lock, thus it is a bit + * spinlock. * * The slab_lock is only used for debugging and on arches that do not * have the ability to do a cmpxchg_double. It only protects: @@ -61,6 +69,8 @@ * C. page->objects -> Number of objects in page * D. page->frozen -> frozen state * + * Frozen slabs + * * If a slab is frozen then it is exempt from list management. It is not * on any list except per cpu partial list. The processor that froze the * slab is the one who can perform list operations on the page. Other @@ -68,6 +78,8 @@ * froze the slab is the only one that can retrieve the objects from the * page's freelist. * + * list_lock + * * The list_lock protects the partial and full list on each node and * the partial slab counter. If taken then no new slabs may be added or * removed from the lists nor make the number of partial slabs be modified. @@ -79,10 +91,36 @@ * slabs, operations can continue without any centralized lock. F.e. * allocating a long series of objects that fill up slabs does not require * the list lock. - * Interrupts are disabled during allocation and deallocation in order to - * make the slab allocator safe to use in the context of an irq. In addition - * interrupts are disabled to ensure that the processor does not change - * while handling per_cpu slabs, due to kernel preemption. + * + * cpu_slab->lock local lock + * + * This locks protect slowpath manipulation of all kmem_cache_cpu fields + * except the stat counters. This is a percpu structure manipulated only by + * the local cpu, so the lock protects against being preempted or interrupted + * by an irq. Fast path operations rely on lockless operations instead. + * On PREEMPT_RT, the local lock does not actually disable irqs (and thus + * prevent the lockless operations), so fastpath operations also need to take + * the lock and are no longer lockless. + * + * lockless fastpaths + * + * The fast path allocation (slab_alloc_node()) and freeing (do_slab_free()) + * are fully lockless when satisfied from the percpu slab (and when + * cmpxchg_double is possible to use, otherwise slab_lock is taken). + * They also don't disable preemption or migration or irqs. They rely on + * the transaction id (tid) field to detect being preempted or moved to + * another cpu. + * + * irq, preemption, migration considerations + * + * Interrupts are disabled as part of list_lock or local_lock operations, or + * around the slab_lock operation, in order to make the slab allocator safe + * to use in the context of an irq. + * + * In addition, preemption (or migration on PREEMPT_RT) is disabled in the + * allocation slowpath, bulk allocation, and put_cpu_partial(), so that the + * local cpu doesn't change in the process and e.g. the kmem_cache_cpu pointer + * doesn't have to be revalidated in each section protected by the local lock. * * SLUB assigns one slab for allocation to each processor. * Allocations only occur from these slabs called cpu slabs. @@ -2231,9 +2269,13 @@ static inline void note_cmpxchg_failure(const char *n, static void init_kmem_cache_cpus(struct kmem_cache *s) { int cpu; + struct kmem_cache_cpu *c; - for_each_possible_cpu(cpu) - per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu); + for_each_possible_cpu(cpu) { + c = per_cpu_ptr(s->cpu_slab, cpu); + local_lock_init(&c->lock); + c->tid = init_tid(cpu); + } } /* @@ -2444,10 +2486,10 @@ static void unfreeze_partials(struct kmem_cache *s) struct page *partial_page; unsigned long flags; - local_irq_save(flags); + local_lock_irqsave(&s->cpu_slab->lock, flags); partial_page = this_cpu_read(s->cpu_slab->partial); this_cpu_write(s->cpu_slab->partial, NULL); - local_irq_restore(flags); + local_unlock_irqrestore(&s->cpu_slab->lock, flags); if (partial_page) __unfreeze_partials(s, partial_page); @@ -2480,7 +2522,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) int pages = 0; int pobjects = 0; - local_irq_save(flags); + local_lock_irqsave(&s->cpu_slab->lock, flags); oldpage = this_cpu_read(s->cpu_slab->partial); @@ -2508,7 +2550,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) this_cpu_write(s->cpu_slab->partial, page); - local_irq_restore(flags); + local_unlock_irqrestore(&s->cpu_slab->lock, flags); if (page_to_unfreeze) { __unfreeze_partials(s, page_to_unfreeze); @@ -2532,7 +2574,7 @@ static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c, struct page *page; if (lock) - local_irq_save(flags); + local_lock_irqsave(&s->cpu_slab->lock, flags); freelist = c->freelist; page = c->page; @@ -2542,7 +2584,7 @@ static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c, c->tid = next_tid(c->tid); if (lock) - local_irq_restore(flags); + local_unlock_irqrestore(&s->cpu_slab->lock, flags); if (page) deactivate_slab(s, page, freelist); @@ -2849,9 +2891,9 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, goto deactivate_slab; /* must check again c->page in case we got preempted and it changed */ - local_irq_save(flags); + local_lock_irqsave(&s->cpu_slab->lock, flags); if (unlikely(page != c->page)) { - local_irq_restore(flags); + local_unlock_irqrestore(&s->cpu_slab->lock, flags); goto reread_page; } freelist = c->freelist; @@ -2862,7 +2904,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, if (!freelist) { c->page = NULL; - local_irq_restore(flags); + local_unlock_irqrestore(&s->cpu_slab->lock, flags); stat(s, DEACTIVATE_BYPASS); goto new_slab; } @@ -2871,7 +2913,11 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, load_freelist: - lockdep_assert_irqs_disabled(); +#ifdef CONFIG_PREEMPT_RT + lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock.lock)); +#else + lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock)); +#endif /* * freelist is pointing to the list of objects to be used. @@ -2881,39 +2927,39 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, VM_BUG_ON(!c->page->frozen); c->freelist = get_freepointer(s, freelist); c->tid = next_tid(c->tid); - local_irq_restore(flags); + local_unlock_irqrestore(&s->cpu_slab->lock, flags); return freelist; deactivate_slab: - local_irq_save(flags); + local_lock_irqsave(&s->cpu_slab->lock, flags); if (page != c->page) { - local_irq_restore(flags); + local_unlock_irqrestore(&s->cpu_slab->lock, flags); goto reread_page; } freelist = c->freelist; c->page = NULL; c->freelist = NULL; - local_irq_restore(flags); + local_unlock_irqrestore(&s->cpu_slab->lock, flags); deactivate_slab(s, page, freelist); new_slab: if (slub_percpu_partial(c)) { - local_irq_save(flags); + local_lock_irqsave(&s->cpu_slab->lock, flags); if (unlikely(c->page)) { - local_irq_restore(flags); + local_unlock_irqrestore(&s->cpu_slab->lock, flags); goto reread_page; } if (unlikely(!slub_percpu_partial(c))) { - local_irq_restore(flags); + local_unlock_irqrestore(&s->cpu_slab->lock, flags); /* we were preempted and partial list got empty */ goto new_objects; } page = c->page = slub_percpu_partial(c); slub_set_percpu_partial(c, page); - local_irq_restore(flags); + local_unlock_irqrestore(&s->cpu_slab->lock, flags); stat(s, CPU_PARTIAL_ALLOC); goto redo; } @@ -2966,7 +3012,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, retry_load_page: - local_irq_save(flags); + local_lock_irqsave(&s->cpu_slab->lock, flags); if (unlikely(c->page)) { void *flush_freelist = c->freelist; struct page *flush_page = c->page; @@ -2975,7 +3021,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, c->freelist = NULL; c->tid = next_tid(c->tid); - local_irq_restore(flags); + local_unlock_irqrestore(&s->cpu_slab->lock, flags); deactivate_slab(s, flush_page, flush_freelist); @@ -3094,7 +3140,15 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, object = c->freelist; page = c->page; - if (unlikely(!object || !page || !node_match(page, node))) { + /* + * We cannot use the lockless fastpath on PREEMPT_RT because if a + * slowpath has taken the local_lock_irqsave(), it is not protected + * against a fast path operation in an irq handler. So we need to take + * the slow path which uses local_lock. It is still relatively fast if + * there is a suitable cpu freelist. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) || + unlikely(!object || !page || !node_match(page, node))) { object = __slab_alloc(s, gfpflags, node, addr, c); } else { void *next_object = get_freepointer_safe(s, object); @@ -3354,6 +3408,7 @@ static __always_inline void do_slab_free(struct kmem_cache *s, barrier(); if (likely(page == c->page)) { +#ifndef CONFIG_PREEMPT_RT void **freelist = READ_ONCE(c->freelist); set_freepointer(s, tail_obj, freelist); @@ -3366,6 +3421,31 @@ static __always_inline void do_slab_free(struct kmem_cache *s, note_cmpxchg_failure("slab_free", s, tid); goto redo; } +#else /* CONFIG_PREEMPT_RT */ + /* + * We cannot use the lockless fastpath on PREEMPT_RT because if + * a slowpath has taken the local_lock_irqsave(), it is not + * protected against a fast path operation in an irq handler. So + * we need to take the local_lock. We shouldn't simply defer to + * __slab_free() as that wouldn't use the cpu freelist at all. + */ + void **freelist; + + local_lock(&s->cpu_slab->lock); + c = this_cpu_ptr(s->cpu_slab); + if (unlikely(page != c->page)) { + local_unlock(&s->cpu_slab->lock); + goto redo; + } + tid = c->tid; + freelist = c->freelist; + + set_freepointer(s, tail_obj, freelist); + c->freelist = head; + c->tid = next_tid(tid); + + local_unlock(&s->cpu_slab->lock); +#endif stat(s, FREE_FASTPATH); } else __slab_free(s, page, head, tail_obj, cnt, addr); @@ -3544,7 +3624,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, * handlers invoking normal fastpath. */ c = slub_get_cpu_ptr(s->cpu_slab); - local_irq_disable(); + local_lock_irq(&s->cpu_slab->lock); for (i = 0; i < size; i++) { void *object = kfence_alloc(s, s->object_size, flags); @@ -3565,7 +3645,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, */ c->tid = next_tid(c->tid); - local_irq_enable(); + local_unlock_irq(&s->cpu_slab->lock); /* * Invoking slow path likely have side-effect @@ -3579,7 +3659,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, c = this_cpu_ptr(s->cpu_slab); maybe_wipe_obj_freeptr(s, p[i]); - local_irq_disable(); + local_lock_irq(&s->cpu_slab->lock); continue; /* goto for-loop */ } @@ -3588,7 +3668,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, maybe_wipe_obj_freeptr(s, p[i]); } c->tid = next_tid(c->tid); - local_irq_enable(); + local_unlock_irq(&s->cpu_slab->lock); slub_put_cpu_ptr(s->cpu_slab); /* From 23da11ca6231b87d5256cd9d58602f23c1b45d9f Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 24 Aug 2021 09:59:03 +1000 Subject: [PATCH 051/365] mm, slab: simplify lockdep_assert_held in lockdep_assert_held() Sebastian reports [1] that the special version of lockdep_assert_held() for a local lock with PREEMPT_RT is no longer necessary, and we can simplify. [1] https://lore.kernel.org/linux-mm/20210817153937.hxnuh7mqp6vuiyws@linutronix.de/ This is a fixup for mmotm patch mm-slub-convert-kmem_cpu_slab-protection-to-local_lock.patch Link: https://lkml.kernel.org/r/7e9ccf34-57d1-786b-2dfd-3b9ba78e1b32@suse.cz Signed-off-by: Vlastimil Babka Reported-by: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/slub.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index be57687062aa1..df1ac8aff86fe 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2913,11 +2913,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, load_freelist: -#ifdef CONFIG_PREEMPT_RT - lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock.lock)); -#else lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock)); -#endif /* * freelist is pointing to the list of objects to be used. From 7450c64446d3d00a7d7831aa3cb3df2df09090b5 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 24 Aug 2021 09:59:03 +1000 Subject: [PATCH 052/365] mm, slub: fix kmem_cache_cpu fields alignment for double cmpxchg Sven Eckelmann reports [1] that the addition of local_lock to kmem_cache_cpu breaks a config with 64BIT+LOCK_STAT: general protection fault, maybe for address 0xffff888007fcf1c8: 0000 [#1] NOPTI CPU: 0 PID: 0 Comm: swapper Not tainted 5.14.0-rc5+ #7 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014 RIP: 0010:kmem_cache_alloc+0x81/0x180 Code: 79 48 00 4c 8b 41 38 0f 84 89 00 00 00 4d 85 c0 0f 84 80 00 00 00 41 8b 44 24 28 49 8b 3c 24 48 8d 4a 01 49 8b 1c 00 4c 89 c0 <48> 0f c7 4f 38 0f 943 RSP: 0000:ffffffff81803c10 EFLAGS: 00000286 RAX: ffff88800244e7c0 RBX: ffff88800244e800 RCX: 0000000000000024 RDX: 0000000000000023 RSI: 0000000000000100 RDI: ffff888007fcf190 RBP: ffffffff81803c38 R08: ffff88800244e7c0 R09: 0000000000000dc0 R10: 0000000000004000 R11: 0000000000000000 R12: ffff8880024413c0 R13: ffffffff810d18f4 R14: 0000000000000dc0 R15: 0000000000000100 FS: 0000000000000000(0000) GS:ffffffff81840000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffff888002001000 CR3: 0000000001824000 CR4: 00000000000006b0 Call Trace: __get_vm_area_node.constprop.0.isra.0+0x74/0x150 __vmalloc_node_range+0x5a/0x2b0 ? kernel_clone+0x88/0x390 ? copy_process+0x1ac/0x17e0 copy_process+0x768/0x17e0 ? kernel_clone+0x88/0x390 kernel_clone+0x88/0x390 ? _vm_unmap_aliases.part.0+0xe9/0x110 ? change_page_attr_set_clr+0x10d/0x180 kernel_thread+0x43/0x50 ? rest_init+0x100/0x100 rest_init+0x1e/0x100 arch_call_rest_init+0x9/0xc start_kernel+0x481/0x493 x86_64_start_reservations+0x24/0x26 x86_64_start_kernel+0x80/0x84 secondary_startup_64_no_verify+0xc2/0xcb random: get_random_bytes called from oops_exit+0x34/0x60 with crng_init=0 ---[ end trace 2cac18ac38f640c1 ]--- RIP: 0010:kmem_cache_alloc+0x81/0x180 Code: 79 48 00 4c 8b 41 38 0f 84 89 00 00 00 4d 85 c0 0f 84 80 00 00 00 41 8b 44 24 28 49 8b 3c 24 48 8d 4a 01 49 8b 1c 00 4c 89 c0 <48> 0f c7 4f 38 0f 943 RSP: 0000:ffffffff81803c10 EFLAGS: 00000286 RAX: ffff88800244e7c0 RBX: ffff88800244e800 RCX: 0000000000000024 RDX: 0000000000000023 RSI: 0000000000000100 RDI: ffff888007fcf190 RBP: ffffffff81803c38 R08: ffff88800244e7c0 R09: 0000000000000dc0 R10: 0000000000004000 R11: 0000000000000000 R12: ffff8880024413c0 R13: ffffffff810d18f4 R14: 0000000000000dc0 R15: 0000000000000100 FS: 0000000000000000(0000) GS:ffffffff81840000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffff888002001000 CR3: 0000000001824000 CR4: 00000000000006b0 Kernel panic - not syncing: Attempted to kill the idle task! ---[ end Kernel panic - not syncing: Attempted to kill the idle task! ]--- Decoding the RIP points to this_cpu_cmpxchg_double() call in slab_alloc_node(). The problem is the particular size of local_lock_t with LOCK_STAT resulting in the following layout: struct kmem_cache_cpu { local_lock_t lock; /* 0 56 */ void * * freelist; /* 56 8 */ /* --- cacheline 1 boundary (64 bytes) --- */ long unsigned int tid; /* 64 8 */ struct page * page; /* 72 8 */ struct page * partial; /* 80 8 */ /* size: 88, cachelines: 2, members: 5 */ /* last cacheline: 24 bytes */ }; As pointed out by Sebastian Andrzej Siewior, this_cpu_cmpxchg_double() needs the freelist and tid fields to be aligned to sum of their sizes (16 bytes) but they are not in this configuration. This didn't happen with non-debug RT and !RT configs as well as lockdep. To fix this, move the lock field below partial field, so that it doesn't affect the layout. [1] https://lore.kernel.org/linux-mm/2666777.vCjUEy5FO1@sven-desktop/ This is a fixup for mmotm patch mm-slub-convert-kmem_cpu_slab-protection-to-local_lock.patch Link: https://lkml.kernel.org/r/e907c2b6-6df1-8038-8c6c-aa9c1fd11259@suse.cz Signed-off-by: Vlastimil Babka Reported-by: Sven Eckelmann Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/slub_def.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index b5bcac29b979c..85499f0586b06 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -41,14 +41,18 @@ enum stat_item { CPU_PARTIAL_DRAIN, /* Drain cpu partial to node partial */ NR_SLUB_STAT_ITEMS }; +/* + * When changing the layout, make sure freelist and tid are still compatible + * with this_cpu_cmpxchg_double() alignment requirements. + */ struct kmem_cache_cpu { - local_lock_t lock; /* Protects the fields below except stat */ void **freelist; /* Pointer to next available object */ unsigned long tid; /* Globally unique transaction id */ struct page *page; /* The slab from which we are allocating */ #ifdef CONFIG_SLUB_CPU_PARTIAL struct page *partial; /* Partially allocated frozen slabs */ #endif + local_lock_t lock; /* Protects the fields above */ #ifdef CONFIG_SLUB_STATS unsigned stat[NR_SLUB_STAT_ITEMS]; #endif From d06c4cbc8975b4821ec9865a75fde792eebf6327 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 24 Aug 2021 09:59:03 +1000 Subject: [PATCH 053/365] mm/debug_vm_pgtable: introduce struct pgtable_debug_args Patch series "mm/debug_vm_pgtable: Enhancements", v6. There are a couple of issues with current implementations and this series tries to resolve the issues: (a) All needed information are scattered in variables, passed to various test functions. The code is organized in pretty much relaxed fashion. (b) The page isn't allocated from buddy during page table entry modifying tests. The page can be invalid, conflicting to the implementations of set_xxx_at() on ARM64. The target page is accessed so that the iCache can be flushed when execution permission is given on ARM64. Besides, the target page can be unmapped and accessing to it causes kernel crash. "struct pgtable_debug_args" is introduced to address issue (a). For issue (b), the used page is allocated from buddy in page table entry modifying tests. The corresponding tets will be skipped if we fail to allocate the (huge) page. For other test cases, the original page around to kernel symbol (@start_kernel) is still used. The patches are organized as below. PATCH[2-10] could be combined to one patch, but it will make the review harder: PATCH[1] introduces "struct pgtable_debug_args" as place holder of all needed information. With it, the old and new implementation can coexist. PATCH[2-10] uses "struct pgtable_debug_args" in various test functions. PATCH[11] removes the unused code for old implementation. PATCH[12] fixes the issue of corrupted page flag for ARM64 This patch (of 6): In debug_vm_pgtable(), there are many local variables introduced to track the needed information and they are passed to the functions for various test cases. It'd better to introduce a struct as place holder for these information. With it, what the tests functions need is the struct. In this way, the code is simplified and easier to be maintained. Besides, set_xxx_at() could access the data on the corresponding pages in the page table modifying tests. So the accessed pages in the tests should have been allocated from buddy. Otherwise, we're accessing pages that aren't owned by us. This causes issues like page flag corruption or kernel crash on accessing unmapped page when CONFIG_DEBUG_PAGEALLOC is enabled. This introduces "struct pgtable_debug_args". The struct is initialized and destroyed, but the information in the struct isn't used yet. It will be used in subsequent patches. Link: https://lkml.kernel.org/r/20210809092631.1888748-1-gshan@redhat.com Link: https://lkml.kernel.org/r/20210809092631.1888748-2-gshan@redhat.com Signed-off-by: Gavin Shan Reviewed-by: Anshuman Khandual Tested-by: Christophe Leroy [powerpc 8xx] Tested-by: Gerald Schaefer [s390] Cc: Anshuman Khandual Cc: Aneesh Kumar K.V Cc: Qian Cai Cc: Catalin Marinas Cc: Will Deacon Cc: Vineet Gupta Cc: Chunyu Hu Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/debug_vm_pgtable.c | 270 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 269 insertions(+), 1 deletion(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 1c922691aa616..7b6bcf59e3766 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -58,6 +58,37 @@ #define RANDOM_ORVALUE (GENMASK(BITS_PER_LONG - 1, 0) & ~ARCH_SKIP_MASK) #define RANDOM_NZVALUE GENMASK(7, 0) +struct pgtable_debug_args { + struct mm_struct *mm; + struct vm_area_struct *vma; + + pgd_t *pgdp; + p4d_t *p4dp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + p4d_t *start_p4dp; + pud_t *start_pudp; + pmd_t *start_pmdp; + pgtable_t start_ptep; + + unsigned long vaddr; + pgprot_t page_prot; + pgprot_t page_prot_none; + + bool is_contiguous_page; + unsigned long pud_pfn; + unsigned long pmd_pfn; + unsigned long pte_pfn; + + unsigned long fixed_pgd_pfn; + unsigned long fixed_p4d_pfn; + unsigned long fixed_pud_pfn; + unsigned long fixed_pmd_pfn; + unsigned long fixed_pte_pfn; +}; + static void __init pte_basic_tests(unsigned long pfn, int idx) { pgprot_t prot = protection_map[idx]; @@ -955,8 +986,239 @@ static unsigned long __init get_random_vaddr(void) return random_vaddr; } +static void __init destroy_args(struct pgtable_debug_args *args) +{ + struct page *page = NULL; + + /* Free (huge) page */ + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && + IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) && + has_transparent_hugepage() && + args->pud_pfn != ULONG_MAX) { + if (args->is_contiguous_page) { + free_contig_range(args->pud_pfn, + (1 << (HPAGE_PUD_SHIFT - PAGE_SHIFT))); + } else { + page = pfn_to_page(args->pud_pfn); + __free_pages(page, HPAGE_PUD_SHIFT - PAGE_SHIFT); + } + + args->pud_pfn = ULONG_MAX; + args->pmd_pfn = ULONG_MAX; + args->pte_pfn = ULONG_MAX; + } + + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && + has_transparent_hugepage() && + args->pmd_pfn != ULONG_MAX) { + if (args->is_contiguous_page) { + free_contig_range(args->pmd_pfn, (1 << HPAGE_PMD_ORDER)); + } else { + page = pfn_to_page(args->pmd_pfn); + __free_pages(page, HPAGE_PMD_ORDER); + } + + args->pmd_pfn = ULONG_MAX; + args->pte_pfn = ULONG_MAX; + } + + if (args->pte_pfn != ULONG_MAX) { + page = pfn_to_page(args->pte_pfn); + __free_pages(page, 0); + + args->pte_pfn = ULONG_MAX; + } + + /* Free page table entries */ + if (args->start_ptep) { + pte_free(args->mm, args->start_ptep); + mm_dec_nr_ptes(args->mm); + } + + if (args->start_pmdp) { + pmd_free(args->mm, args->start_pmdp); + mm_dec_nr_pmds(args->mm); + } + + if (args->start_pudp) { + pud_free(args->mm, args->start_pudp); + mm_dec_nr_puds(args->mm); + } + + if (args->start_p4dp) + p4d_free(args->mm, args->start_p4dp); + + /* Free vma and mm struct */ + if (args->vma) + vm_area_free(args->vma); + + if (args->mm) + mmdrop(args->mm); +} + +static struct page * __init +debug_vm_pgtable_alloc_huge_page(struct pgtable_debug_args *args, int order) +{ + struct page *page = NULL; + +#ifdef CONFIG_CONTIG_ALLOC + if (order >= MAX_ORDER) { + page = alloc_contig_pages((1 << order), GFP_KERNEL, + first_online_node, NULL); + if (page) { + args->is_contiguous_page = true; + return page; + } + } +#endif + + if (order < MAX_ORDER) + page = alloc_pages(GFP_KERNEL, order); + + return page; +} + +static int __init init_args(struct pgtable_debug_args *args) +{ + struct page *page = NULL; + phys_addr_t phys; + int ret = 0; + + /* + * Initialize the debugging data. + * + * __P000 (or even __S000) will help create page table entries with + * PROT_NONE permission as required for pxx_protnone_tests(). + */ + memset(args, 0, sizeof(*args)); + args->vaddr = get_random_vaddr(); + args->page_prot = vm_get_page_prot(VMFLAGS); + args->page_prot_none = __P000; + args->is_contiguous_page = false; + args->pud_pfn = ULONG_MAX; + args->pmd_pfn = ULONG_MAX; + args->pte_pfn = ULONG_MAX; + args->fixed_pgd_pfn = ULONG_MAX; + args->fixed_p4d_pfn = ULONG_MAX; + args->fixed_pud_pfn = ULONG_MAX; + args->fixed_pmd_pfn = ULONG_MAX; + args->fixed_pte_pfn = ULONG_MAX; + + /* Allocate mm and vma */ + args->mm = mm_alloc(); + if (!args->mm) { + pr_err("Failed to allocate mm struct\n"); + ret = -ENOMEM; + goto error; + } + + args->vma = vm_area_alloc(args->mm); + if (!args->vma) { + pr_err("Failed to allocate vma\n"); + ret = -ENOMEM; + goto error; + } + + /* + * Allocate page table entries. They will be modified in the tests. + * Lets save the page table entries so that they can be released + * when the tests are completed. + */ + args->pgdp = pgd_offset(args->mm, args->vaddr); + args->p4dp = p4d_alloc(args->mm, args->pgdp, args->vaddr); + if (!args->p4dp) { + pr_err("Failed to allocate p4d entries\n"); + ret = -ENOMEM; + goto error; + } + args->start_p4dp = p4d_offset(args->pgdp, 0UL); + WARN_ON(!args->start_p4dp); + + args->pudp = pud_alloc(args->mm, args->p4dp, args->vaddr); + if (!args->pudp) { + pr_err("Failed to allocate pud entries\n"); + ret = -ENOMEM; + goto error; + } + args->start_pudp = pud_offset(args->p4dp, 0UL); + WARN_ON(!args->start_pudp); + + args->pmdp = pmd_alloc(args->mm, args->pudp, args->vaddr); + if (!args->pmdp) { + pr_err("Failed to allocate pmd entries\n"); + ret = -ENOMEM; + goto error; + } + args->start_pmdp = pmd_offset(args->pudp, 0UL); + WARN_ON(!args->start_pmdp); + + if (pte_alloc(args->mm, args->pmdp)) { + pr_err("Failed to allocate pte entries\n"); + ret = -ENOMEM; + goto error; + } + args->start_ptep = pmd_pgtable(READ_ONCE(*args->pmdp)); + WARN_ON(!args->start_ptep); + + /* + * PFN for mapping at PTE level is determined from a standard kernel + * text symbol. But pfns for higher page table levels are derived by + * masking lower bits of this real pfn. These derived pfns might not + * exist on the platform but that does not really matter as pfn_pxx() + * helpers will still create appropriate entries for the test. This + * helps avoid large memory block allocations to be used for mapping + * at higher page table levels in some of the tests. + */ + phys = __pa_symbol(&start_kernel); + args->fixed_pgd_pfn = __phys_to_pfn(phys & PGDIR_MASK); + args->fixed_p4d_pfn = __phys_to_pfn(phys & P4D_MASK); + args->fixed_pud_pfn = __phys_to_pfn(phys & PUD_MASK); + args->fixed_pmd_pfn = __phys_to_pfn(phys & PMD_MASK); + args->fixed_pte_pfn = __phys_to_pfn(phys & PAGE_MASK); + WARN_ON(!pfn_valid(args->fixed_pte_pfn)); + + /* + * Allocate (huge) pages because some of the tests need to access + * the data in the pages. The corresponding tests will be skipped + * if we fail to allocate (huge) pages. + */ + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && + IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) && + has_transparent_hugepage()) { + page = debug_vm_pgtable_alloc_huge_page(args, + HPAGE_PUD_SHIFT - PAGE_SHIFT); + if (page) { + args->pud_pfn = page_to_pfn(page); + args->pmd_pfn = args->pud_pfn; + args->pte_pfn = args->pud_pfn; + return 0; + } + } + + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && + has_transparent_hugepage()) { + page = debug_vm_pgtable_alloc_huge_page(args, HPAGE_PMD_ORDER); + if (page) { + args->pmd_pfn = page_to_pfn(page); + args->pte_pfn = args->pmd_pfn; + return 0; + } + } + + page = alloc_pages(GFP_KERNEL, 0); + if (page) + args->pte_pfn = page_to_pfn(page); + + return 0; + +error: + destroy_args(args); + return ret; +} + static int __init debug_vm_pgtable(void) { + struct pgtable_debug_args args; struct vm_area_struct *vma; struct mm_struct *mm; pgd_t *pgdp; @@ -970,9 +1232,13 @@ static int __init debug_vm_pgtable(void) unsigned long vaddr, pte_aligned, pmd_aligned; unsigned long pud_aligned, p4d_aligned, pgd_aligned; spinlock_t *ptl = NULL; - int idx; + int idx, ret; pr_info("Validating architecture page table helpers\n"); + ret = init_args(&args); + if (ret) + return ret; + prot = vm_get_page_prot(VMFLAGS); vaddr = get_random_vaddr(); mm = mm_alloc(); @@ -1127,6 +1393,8 @@ static int __init debug_vm_pgtable(void) mm_dec_nr_pmds(mm); mm_dec_nr_ptes(mm); mmdrop(mm); + + destroy_args(&args); return 0; } late_initcall(debug_vm_pgtable); From 20c7c2980777db60e170a08e83e98453e92abbb5 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 24 Aug 2021 09:59:03 +1000 Subject: [PATCH 054/365] mm/debug_vm_pgtable: use struct pgtable_debug_args in basic tests This uses struct pgtable_debug_args in the basic test functions. The unused variables @pgd_aligned and @p4d_aligned in debug_vm_pgtable() are dropped. Link: https://lkml.kernel.org/r/20210809092631.1888748-3-gshan@redhat.com Signed-off-by: Gavin Shan Reviewed-by: Anshuman Khandual Tested-by: Christophe Leroy [powerpc 8xx] Tested-by: Gerald Schaefer [s390] Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Chunyu Hu Cc: Qian Cai Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/debug_vm_pgtable.c | 50 +++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 7b6bcf59e3766..64b5a76e0f6d5 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -89,10 +89,10 @@ struct pgtable_debug_args { unsigned long fixed_pte_pfn; }; -static void __init pte_basic_tests(unsigned long pfn, int idx) +static void __init pte_basic_tests(struct pgtable_debug_args *args, int idx) { pgprot_t prot = protection_map[idx]; - pte_t pte = pfn_pte(pfn, prot); + pte_t pte = pfn_pte(args->fixed_pte_pfn, prot); unsigned long val = idx, *ptr = &val; pr_debug("Validating PTE basic (%pGv)\n", ptr); @@ -174,7 +174,7 @@ static void __init pte_savedwrite_tests(unsigned long pfn, pgprot_t prot) } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static void __init pmd_basic_tests(unsigned long pfn, int idx) +static void __init pmd_basic_tests(struct pgtable_debug_args *args, int idx) { pgprot_t prot = protection_map[idx]; unsigned long val = idx, *ptr = &val; @@ -184,7 +184,7 @@ static void __init pmd_basic_tests(unsigned long pfn, int idx) return; pr_debug("Validating PMD basic (%pGv)\n", ptr); - pmd = pfn_pmd(pfn, prot); + pmd = pfn_pmd(args->fixed_pmd_pfn, prot); /* * This test needs to be executed after the given page table entry @@ -296,7 +296,7 @@ static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot) } #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD -static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int idx) +static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx) { pgprot_t prot = protection_map[idx]; unsigned long val = idx, *ptr = &val; @@ -306,7 +306,7 @@ static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int return; pr_debug("Validating PUD basic (%pGv)\n", ptr); - pud = pfn_pud(pfn, prot); + pud = pfn_pud(args->fixed_pud_pfn, prot); /* * This test needs to be executed after the given page table entry @@ -327,7 +327,7 @@ static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int WARN_ON(pud_dirty(pud_wrprotect(pud_mkclean(pud)))); WARN_ON(!pud_dirty(pud_wrprotect(pud_mkdirty(pud)))); - if (mm_pmd_folded(mm)) + if (mm_pmd_folded(args->mm)) return; /* @@ -404,7 +404,7 @@ static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) WARN_ON(!pud_leaf(pud)); } #else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ -static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int idx) { } +static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx) { } static void __init pud_advanced_tests(struct mm_struct *mm, struct vm_area_struct *vma, pud_t *pudp, unsigned long pfn, unsigned long vaddr, @@ -414,8 +414,8 @@ static void __init pud_advanced_tests(struct mm_struct *mm, static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) { } #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ -static void __init pmd_basic_tests(unsigned long pfn, int idx) { } -static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int idx) { } +static void __init pmd_basic_tests(struct pgtable_debug_args *args, int idx) { } +static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx) { } static void __init pmd_advanced_tests(struct mm_struct *mm, struct vm_area_struct *vma, pmd_t *pmdp, unsigned long pfn, unsigned long vaddr, @@ -476,7 +476,7 @@ static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot) static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) { } #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ -static void __init p4d_basic_tests(unsigned long pfn, pgprot_t prot) +static void __init p4d_basic_tests(struct pgtable_debug_args *args) { p4d_t p4d; @@ -485,7 +485,7 @@ static void __init p4d_basic_tests(unsigned long pfn, pgprot_t prot) WARN_ON(!p4d_same(p4d, p4d)); } -static void __init pgd_basic_tests(unsigned long pfn, pgprot_t prot) +static void __init pgd_basic_tests(struct pgtable_debug_args *args) { pgd_t pgd; @@ -890,7 +890,7 @@ static void __init swap_migration_tests(void) } #ifdef CONFIG_HUGETLB_PAGE -static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot) +static void __init hugetlb_basic_tests(struct pgtable_debug_args *args) { struct page *page; pte_t pte; @@ -900,21 +900,21 @@ static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot) * Accessing the page associated with the pfn is safe here, * as it was previously derived from a real kernel symbol. */ - page = pfn_to_page(pfn); - pte = mk_huge_pte(page, prot); + page = pfn_to_page(args->fixed_pmd_pfn); + pte = mk_huge_pte(page, args->page_prot); WARN_ON(!huge_pte_dirty(huge_pte_mkdirty(pte))); WARN_ON(!huge_pte_write(huge_pte_mkwrite(huge_pte_wrprotect(pte)))); WARN_ON(huge_pte_write(huge_pte_wrprotect(huge_pte_mkwrite(pte)))); #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB - pte = pfn_pte(pfn, prot); + pte = pfn_pte(args->fixed_pmd_pfn, args->page_prot); WARN_ON(!pte_huge(pte_mkhuge(pte))); #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ } #else /* !CONFIG_HUGETLB_PAGE */ -static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot) { } +static void __init hugetlb_basic_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_HUGETLB_PAGE */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -1230,7 +1230,7 @@ static int __init debug_vm_pgtable(void) pgprot_t prot, protnone; phys_addr_t paddr; unsigned long vaddr, pte_aligned, pmd_aligned; - unsigned long pud_aligned, p4d_aligned, pgd_aligned; + unsigned long pud_aligned; spinlock_t *ptl = NULL; int idx, ret; @@ -1273,8 +1273,6 @@ static int __init debug_vm_pgtable(void) pte_aligned = (paddr & PAGE_MASK) >> PAGE_SHIFT; pmd_aligned = (paddr & PMD_MASK) >> PAGE_SHIFT; pud_aligned = (paddr & PUD_MASK) >> PAGE_SHIFT; - p4d_aligned = (paddr & P4D_MASK) >> PAGE_SHIFT; - pgd_aligned = (paddr & PGDIR_MASK) >> PAGE_SHIFT; WARN_ON(!pfn_valid(pte_aligned)); pgdp = pgd_offset(mm, vaddr); @@ -1308,9 +1306,9 @@ static int __init debug_vm_pgtable(void) * given page table entry. */ for (idx = 0; idx < ARRAY_SIZE(protection_map); idx++) { - pte_basic_tests(pte_aligned, idx); - pmd_basic_tests(pmd_aligned, idx); - pud_basic_tests(mm, pud_aligned, idx); + pte_basic_tests(&args, idx); + pmd_basic_tests(&args, idx); + pud_basic_tests(&args, idx); } /* @@ -1320,8 +1318,8 @@ static int __init debug_vm_pgtable(void) * the above iteration for now to save some test execution * time. */ - p4d_basic_tests(p4d_aligned, prot); - pgd_basic_tests(pgd_aligned, prot); + p4d_basic_tests(&args); + pgd_basic_tests(&args); pmd_leaf_tests(pmd_aligned, prot); pud_leaf_tests(pud_aligned, prot); @@ -1350,7 +1348,7 @@ static int __init debug_vm_pgtable(void) pmd_thp_tests(pmd_aligned, prot); pud_thp_tests(pud_aligned, prot); - hugetlb_basic_tests(pte_aligned, prot); + hugetlb_basic_tests(&args); /* * Page table modifying tests. They need to hold From dbdfd960e8e3bd938e98ce8d96cfa81b28218762 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 24 Aug 2021 09:59:03 +1000 Subject: [PATCH 055/365] mm/debug_vm_pgtable: use struct pgtable_debug_args in leaf and savewrite tests This uses struct pgtable_debug_args in the leaf and savewrite test functions. Link: https://lkml.kernel.org/r/20210809092631.1888748-4-gshan@redhat.com Signed-off-by: Gavin Shan Reviewed-by: Anshuman Khandual Tested-by: Christophe Leroy [powerpc 8xx] Tested-by: Gerald Schaefer [s390] Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Chunyu Hu Cc: Qian Cai Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/debug_vm_pgtable.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 64b5a76e0f6d5..759316143a211 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -161,9 +161,9 @@ static void __init pte_advanced_tests(struct mm_struct *mm, WARN_ON(pte_young(pte)); } -static void __init pte_savedwrite_tests(unsigned long pfn, pgprot_t prot) +static void __init pte_savedwrite_tests(struct pgtable_debug_args *args) { - pte_t pte = pfn_pte(pfn, prot); + pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot_none); if (!IS_ENABLED(CONFIG_NUMA_BALANCING)) return; @@ -262,7 +262,7 @@ static void __init pmd_advanced_tests(struct mm_struct *mm, pgtable = pgtable_trans_huge_withdraw(mm, pmdp); } -static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot) +static void __init pmd_leaf_tests(struct pgtable_debug_args *args) { pmd_t pmd; @@ -270,7 +270,7 @@ static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot) return; pr_debug("Validating PMD leaf\n"); - pmd = pfn_pmd(pfn, prot); + pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot); /* * PMD based THP is a leaf entry. @@ -279,7 +279,7 @@ static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot) WARN_ON(!pmd_leaf(pmd)); } -static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot) +static void __init pmd_savedwrite_tests(struct pgtable_debug_args *args) { pmd_t pmd; @@ -290,7 +290,7 @@ static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot) return; pr_debug("Validating PMD saved write\n"); - pmd = pfn_pmd(pfn, prot); + pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot_none); WARN_ON(!pmd_savedwrite(pmd_mk_savedwrite(pmd_clear_savedwrite(pmd)))); WARN_ON(pmd_savedwrite(pmd_clear_savedwrite(pmd_mk_savedwrite(pmd)))); } @@ -388,7 +388,7 @@ static void __init pud_advanced_tests(struct mm_struct *mm, pudp_huge_get_and_clear(mm, vaddr, pudp); } -static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) +static void __init pud_leaf_tests(struct pgtable_debug_args *args) { pud_t pud; @@ -396,7 +396,7 @@ static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) return; pr_debug("Validating PUD leaf\n"); - pud = pfn_pud(pfn, prot); + pud = pfn_pud(args->fixed_pud_pfn, args->page_prot); /* * PUD based THP is a leaf entry. */ @@ -411,7 +411,7 @@ static void __init pud_advanced_tests(struct mm_struct *mm, pgprot_t prot) { } -static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) { } +static void __init pud_leaf_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ static void __init pmd_basic_tests(struct pgtable_debug_args *args, int idx) { } @@ -428,9 +428,9 @@ static void __init pud_advanced_tests(struct mm_struct *mm, pgprot_t prot) { } -static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot) { } -static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) { } -static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot) { } +static void __init pmd_leaf_tests(struct pgtable_debug_args *args) { } +static void __init pud_leaf_tests(struct pgtable_debug_args *args) { } +static void __init pmd_savedwrite_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP @@ -1321,11 +1321,11 @@ static int __init debug_vm_pgtable(void) p4d_basic_tests(&args); pgd_basic_tests(&args); - pmd_leaf_tests(pmd_aligned, prot); - pud_leaf_tests(pud_aligned, prot); + pmd_leaf_tests(&args); + pud_leaf_tests(&args); - pte_savedwrite_tests(pte_aligned, protnone); - pmd_savedwrite_tests(pmd_aligned, protnone); + pte_savedwrite_tests(&args); + pmd_savedwrite_tests(&args); pte_special_tests(pte_aligned, prot); pte_protnone_tests(pte_aligned, protnone); From 1cc436cc04cbba8b3bb79da4b29214a973aaba17 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 24 Aug 2021 09:59:04 +1000 Subject: [PATCH 056/365] mm/debug_vm_pgtable: use struct pgtable_debug_args in protnone and devmap tests This uses struct pgtable_debug_args in protnone and devmap test functions. After that, the unused variable @protnone in debug_vm_pgtable() is dropped. Link: https://lkml.kernel.org/r/20210809092631.1888748-5-gshan@redhat.com Signed-off-by: Gavin Shan Reviewed-by: Anshuman Khandual Tested-by: Christophe Leroy [powerpc 8xx] Tested-by: Gerald Schaefer [s390] Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Chunyu Hu Cc: Qian Cai Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/debug_vm_pgtable.c | 58 +++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 32 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 759316143a211..8598aefeba4dc 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -662,9 +662,9 @@ static void __init pmd_populate_tests(struct mm_struct *mm, pmd_t *pmdp, WARN_ON(pmd_bad(pmd)); } -static void __init pte_special_tests(unsigned long pfn, pgprot_t prot) +static void __init pte_special_tests(struct pgtable_debug_args *args) { - pte_t pte = pfn_pte(pfn, prot); + pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot); if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) return; @@ -673,9 +673,9 @@ static void __init pte_special_tests(unsigned long pfn, pgprot_t prot) WARN_ON(!pte_special(pte_mkspecial(pte))); } -static void __init pte_protnone_tests(unsigned long pfn, pgprot_t prot) +static void __init pte_protnone_tests(struct pgtable_debug_args *args) { - pte_t pte = pfn_pte(pfn, prot); + pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot_none); if (!IS_ENABLED(CONFIG_NUMA_BALANCING)) return; @@ -686,7 +686,7 @@ static void __init pte_protnone_tests(unsigned long pfn, pgprot_t prot) } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static void __init pmd_protnone_tests(unsigned long pfn, pgprot_t prot) +static void __init pmd_protnone_tests(struct pgtable_debug_args *args) { pmd_t pmd; @@ -697,25 +697,25 @@ static void __init pmd_protnone_tests(unsigned long pfn, pgprot_t prot) return; pr_debug("Validating PMD protnone\n"); - pmd = pmd_mkhuge(pfn_pmd(pfn, prot)); + pmd = pmd_mkhuge(pfn_pmd(args->fixed_pmd_pfn, args->page_prot_none)); WARN_ON(!pmd_protnone(pmd)); WARN_ON(!pmd_present(pmd)); } #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ -static void __init pmd_protnone_tests(unsigned long pfn, pgprot_t prot) { } +static void __init pmd_protnone_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifdef CONFIG_ARCH_HAS_PTE_DEVMAP -static void __init pte_devmap_tests(unsigned long pfn, pgprot_t prot) +static void __init pte_devmap_tests(struct pgtable_debug_args *args) { - pte_t pte = pfn_pte(pfn, prot); + pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot); pr_debug("Validating PTE devmap\n"); WARN_ON(!pte_devmap(pte_mkdevmap(pte))); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot) +static void __init pmd_devmap_tests(struct pgtable_debug_args *args) { pmd_t pmd; @@ -723,12 +723,12 @@ static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot) return; pr_debug("Validating PMD devmap\n"); - pmd = pfn_pmd(pfn, prot); + pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot); WARN_ON(!pmd_devmap(pmd_mkdevmap(pmd))); } #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD -static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) +static void __init pud_devmap_tests(struct pgtable_debug_args *args) { pud_t pud; @@ -736,20 +736,20 @@ static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) return; pr_debug("Validating PUD devmap\n"); - pud = pfn_pud(pfn, prot); + pud = pfn_pud(args->fixed_pud_pfn, args->page_prot); WARN_ON(!pud_devmap(pud_mkdevmap(pud))); } #else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ -static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) { } +static void __init pud_devmap_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ #else /* CONFIG_TRANSPARENT_HUGEPAGE */ -static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot) { } -static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) { } +static void __init pmd_devmap_tests(struct pgtable_debug_args *args) { } +static void __init pud_devmap_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #else -static void __init pte_devmap_tests(unsigned long pfn, pgprot_t prot) { } -static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot) { } -static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) { } +static void __init pte_devmap_tests(struct pgtable_debug_args *args) { } +static void __init pmd_devmap_tests(struct pgtable_debug_args *args) { } +static void __init pud_devmap_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_ARCH_HAS_PTE_DEVMAP */ static void __init pte_soft_dirty_tests(unsigned long pfn, pgprot_t prot) @@ -1227,7 +1227,7 @@ static int __init debug_vm_pgtable(void) pmd_t *pmdp, *saved_pmdp, pmd; pte_t *ptep; pgtable_t saved_ptep; - pgprot_t prot, protnone; + pgprot_t prot; phys_addr_t paddr; unsigned long vaddr, pte_aligned, pmd_aligned; unsigned long pud_aligned; @@ -1247,12 +1247,6 @@ static int __init debug_vm_pgtable(void) return 1; } - /* - * __P000 (or even __S000) will help create page table entries with - * PROT_NONE permission as required for pxx_protnone_tests(). - */ - protnone = __P000; - vma = vm_area_alloc(mm); if (!vma) { pr_err("vma allocation failed\n"); @@ -1327,13 +1321,13 @@ static int __init debug_vm_pgtable(void) pte_savedwrite_tests(&args); pmd_savedwrite_tests(&args); - pte_special_tests(pte_aligned, prot); - pte_protnone_tests(pte_aligned, protnone); - pmd_protnone_tests(pmd_aligned, protnone); + pte_special_tests(&args); + pte_protnone_tests(&args); + pmd_protnone_tests(&args); - pte_devmap_tests(pte_aligned, prot); - pmd_devmap_tests(pmd_aligned, prot); - pud_devmap_tests(pud_aligned, prot); + pte_devmap_tests(&args); + pmd_devmap_tests(&args); + pud_devmap_tests(&args); pte_soft_dirty_tests(pte_aligned, prot); pmd_soft_dirty_tests(pmd_aligned, prot); From 373e2b570fcf1d4752393c58da2bd7d16ce182ef Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 24 Aug 2021 09:59:04 +1000 Subject: [PATCH 057/365] mm/debug_vm_pgtable: use struct pgtable_debug_args in soft_dirty and swap tests This uses struct pgtable_debug_args in the soft_dirty and swap test functions. Link: https://lkml.kernel.org/r/20210809092631.1888748-6-gshan@redhat.com Signed-off-by: Gavin Shan Reviewed-by: Anshuman Khandual Tested-by: Christophe Leroy [powerpc 8xx] Tested-by: Gerald Schaefer [s390] Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Chunyu Hu Cc: Qian Cai Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/debug_vm_pgtable.c | 48 +++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 8598aefeba4dc..849081a2099de 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -752,9 +752,9 @@ static void __init pmd_devmap_tests(struct pgtable_debug_args *args) { } static void __init pud_devmap_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_ARCH_HAS_PTE_DEVMAP */ -static void __init pte_soft_dirty_tests(unsigned long pfn, pgprot_t prot) +static void __init pte_soft_dirty_tests(struct pgtable_debug_args *args) { - pte_t pte = pfn_pte(pfn, prot); + pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot); if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)) return; @@ -764,9 +764,9 @@ static void __init pte_soft_dirty_tests(unsigned long pfn, pgprot_t prot) WARN_ON(pte_soft_dirty(pte_clear_soft_dirty(pte))); } -static void __init pte_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot) +static void __init pte_swap_soft_dirty_tests(struct pgtable_debug_args *args) { - pte_t pte = pfn_pte(pfn, prot); + pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot); if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)) return; @@ -777,7 +777,7 @@ static void __init pte_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot) } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static void __init pmd_soft_dirty_tests(unsigned long pfn, pgprot_t prot) +static void __init pmd_soft_dirty_tests(struct pgtable_debug_args *args) { pmd_t pmd; @@ -788,12 +788,12 @@ static void __init pmd_soft_dirty_tests(unsigned long pfn, pgprot_t prot) return; pr_debug("Validating PMD soft dirty\n"); - pmd = pfn_pmd(pfn, prot); + pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot); WARN_ON(!pmd_soft_dirty(pmd_mksoft_dirty(pmd))); WARN_ON(pmd_soft_dirty(pmd_clear_soft_dirty(pmd))); } -static void __init pmd_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot) +static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args) { pmd_t pmd; @@ -805,31 +805,29 @@ static void __init pmd_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot) return; pr_debug("Validating PMD swap soft dirty\n"); - pmd = pfn_pmd(pfn, prot); + pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot); WARN_ON(!pmd_swp_soft_dirty(pmd_swp_mksoft_dirty(pmd))); WARN_ON(pmd_swp_soft_dirty(pmd_swp_clear_soft_dirty(pmd))); } #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ -static void __init pmd_soft_dirty_tests(unsigned long pfn, pgprot_t prot) { } -static void __init pmd_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot) -{ -} +static void __init pmd_soft_dirty_tests(struct pgtable_debug_args *args) { } +static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -static void __init pte_swap_tests(unsigned long pfn, pgprot_t prot) +static void __init pte_swap_tests(struct pgtable_debug_args *args) { swp_entry_t swp; pte_t pte; pr_debug("Validating PTE swap\n"); - pte = pfn_pte(pfn, prot); + pte = pfn_pte(args->fixed_pte_pfn, args->page_prot); swp = __pte_to_swp_entry(pte); pte = __swp_entry_to_pte(swp); - WARN_ON(pfn != pte_pfn(pte)); + WARN_ON(args->fixed_pte_pfn != pte_pfn(pte)); } #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION -static void __init pmd_swap_tests(unsigned long pfn, pgprot_t prot) +static void __init pmd_swap_tests(struct pgtable_debug_args *args) { swp_entry_t swp; pmd_t pmd; @@ -838,13 +836,13 @@ static void __init pmd_swap_tests(unsigned long pfn, pgprot_t prot) return; pr_debug("Validating PMD swap\n"); - pmd = pfn_pmd(pfn, prot); + pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot); swp = __pmd_to_swp_entry(pmd); pmd = __swp_entry_to_pmd(swp); - WARN_ON(pfn != pmd_pfn(pmd)); + WARN_ON(args->fixed_pmd_pfn != pmd_pfn(pmd)); } #else /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */ -static void __init pmd_swap_tests(unsigned long pfn, pgprot_t prot) { } +static void __init pmd_swap_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ static void __init swap_migration_tests(void) @@ -1329,13 +1327,13 @@ static int __init debug_vm_pgtable(void) pmd_devmap_tests(&args); pud_devmap_tests(&args); - pte_soft_dirty_tests(pte_aligned, prot); - pmd_soft_dirty_tests(pmd_aligned, prot); - pte_swap_soft_dirty_tests(pte_aligned, prot); - pmd_swap_soft_dirty_tests(pmd_aligned, prot); + pte_soft_dirty_tests(&args); + pmd_soft_dirty_tests(&args); + pte_swap_soft_dirty_tests(&args); + pmd_swap_soft_dirty_tests(&args); - pte_swap_tests(pte_aligned, prot); - pmd_swap_tests(pmd_aligned, prot); + pte_swap_tests(&args); + pmd_swap_tests(&args); swap_migration_tests(); From 045a544235fb3b8284d7503247b739449ef5686f Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 24 Aug 2021 09:59:04 +1000 Subject: [PATCH 058/365] mm/debug_vm_pgtable: use struct pgtable_debug_args in migration and thp tests This uses struct pgtable_debug_args in the migration and thp test functions. It's notable that the pre-allocated page is used in swap_migration_tests() as set_pte_at() is used there. Link: https://lkml.kernel.org/r/20210809092631.1888748-7-gshan@redhat.com Signed-off-by: Gavin Shan Reviewed-by: Anshuman Khandual Tested-by: Christophe Leroy [powerpc 8xx] Tested-by: Gerald Schaefer [s390] Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Chunyu Hu Cc: Qian Cai Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/debug_vm_pgtable.c | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 849081a2099de..6df86555b1916 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -845,7 +845,7 @@ static void __init pmd_swap_tests(struct pgtable_debug_args *args) static void __init pmd_swap_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ -static void __init swap_migration_tests(void) +static void __init swap_migration_tests(struct pgtable_debug_args *args) { struct page *page; swp_entry_t swp; @@ -853,19 +853,18 @@ static void __init swap_migration_tests(void) if (!IS_ENABLED(CONFIG_MIGRATION)) return; - pr_debug("Validating swap migration\n"); /* * swap_migration_tests() requires a dedicated page as it needs to * be locked before creating a migration entry from it. Locking the * page that actually maps kernel text ('start_kernel') can be real - * problematic. Lets allocate a dedicated page explicitly for this - * purpose that will be freed subsequently. + * problematic. Lets use the allocated page explicitly for this + * purpose. */ - page = alloc_page(GFP_KERNEL); - if (!page) { - pr_err("page allocation failed\n"); + page = (args->pte_pfn != ULONG_MAX) ? pfn_to_page(args->pte_pfn) : NULL; + if (!page) return; - } + + pr_debug("Validating swap migration\n"); /* * make_migration_entry() expects given page to be @@ -884,7 +883,6 @@ static void __init swap_migration_tests(void) WARN_ON(!is_migration_entry(swp)); WARN_ON(is_writable_migration_entry(swp)); __ClearPageLocked(page); - __free_page(page); } #ifdef CONFIG_HUGETLB_PAGE @@ -916,7 +914,7 @@ static void __init hugetlb_basic_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_HUGETLB_PAGE */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static void __init pmd_thp_tests(unsigned long pfn, pgprot_t prot) +static void __init pmd_thp_tests(struct pgtable_debug_args *args) { pmd_t pmd; @@ -935,7 +933,7 @@ static void __init pmd_thp_tests(unsigned long pfn, pgprot_t prot) * needs to return true. pmd_present() should be true whenever * pmd_trans_huge() returns true. */ - pmd = pfn_pmd(pfn, prot); + pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot); WARN_ON(!pmd_trans_huge(pmd_mkhuge(pmd))); #ifndef __HAVE_ARCH_PMDP_INVALIDATE @@ -945,7 +943,7 @@ static void __init pmd_thp_tests(unsigned long pfn, pgprot_t prot) } #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD -static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot) +static void __init pud_thp_tests(struct pgtable_debug_args *args) { pud_t pud; @@ -953,7 +951,7 @@ static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot) return; pr_debug("Validating PUD based THP\n"); - pud = pfn_pud(pfn, prot); + pud = pfn_pud(args->fixed_pud_pfn, args->page_prot); WARN_ON(!pud_trans_huge(pud_mkhuge(pud))); /* @@ -965,11 +963,11 @@ static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot) */ } #else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ -static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot) { } +static void __init pud_thp_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ -static void __init pmd_thp_tests(unsigned long pfn, pgprot_t prot) { } -static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot) { } +static void __init pmd_thp_tests(struct pgtable_debug_args *args) { } +static void __init pud_thp_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static unsigned long __init get_random_vaddr(void) @@ -1335,10 +1333,10 @@ static int __init debug_vm_pgtable(void) pte_swap_tests(&args); pmd_swap_tests(&args); - swap_migration_tests(); + swap_migration_tests(&args); - pmd_thp_tests(pmd_aligned, prot); - pud_thp_tests(pud_aligned, prot); + pmd_thp_tests(&args); + pud_thp_tests(&args); hugetlb_basic_tests(&args); From 01fbd93fbd318a562bdab6b8679c8ab48d3d5547 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 24 Aug 2021 09:59:04 +1000 Subject: [PATCH 059/365] mm/debug_vm_pgtable: use struct pgtable_debug_args in PTE modifying tests This uses struct pgtable_debug_args in PTE modifying tests. The allocated page is used as set_pte_at() is used there. The tests are skipped if the allocated page doesn't exist. It's notable that args->ptep need to be mapped before the tests. The reason why we don't map args->ptep at the beginning is PTE entry is only mapped and accessible in atomic context when CONFIG_HIGHPTE is enabled. So we avoid to do that so that atomic context is only enabled if needed. Besides, the unused variable @pte_aligned and @ptep in debug_vm_pgtable() are dropped. Link: https://lkml.kernel.org/r/20210809092631.1888748-8-gshan@redhat.com Signed-off-by: Gavin Shan Reviewed-by: Anshuman Khandual Tested-by: Christophe Leroy [powerpc 8xx] Tested-by: Gerald Schaefer [s390] Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Chunyu Hu Cc: Qian Cai Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/debug_vm_pgtable.c | 67 +++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 35 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 6df86555b1916..652f26f5ecd6e 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -117,10 +117,7 @@ static void __init pte_basic_tests(struct pgtable_debug_args *args, int idx) WARN_ON(!pte_dirty(pte_wrprotect(pte_mkdirty(pte)))); } -static void __init pte_advanced_tests(struct mm_struct *mm, - struct vm_area_struct *vma, pte_t *ptep, - unsigned long pfn, unsigned long vaddr, - pgprot_t prot) +static void __init pte_advanced_tests(struct pgtable_debug_args *args) { pte_t pte; @@ -129,35 +126,37 @@ static void __init pte_advanced_tests(struct mm_struct *mm, * This requires set_pte_at to be not used to update an * existing pte entry. Clear pte before we do set_pte_at */ + if (args->pte_pfn == ULONG_MAX) + return; pr_debug("Validating PTE advanced\n"); - pte = pfn_pte(pfn, prot); - set_pte_at(mm, vaddr, ptep, pte); - ptep_set_wrprotect(mm, vaddr, ptep); - pte = ptep_get(ptep); + pte = pfn_pte(args->pte_pfn, args->page_prot); + set_pte_at(args->mm, args->vaddr, args->ptep, pte); + ptep_set_wrprotect(args->mm, args->vaddr, args->ptep); + pte = ptep_get(args->ptep); WARN_ON(pte_write(pte)); - ptep_get_and_clear(mm, vaddr, ptep); - pte = ptep_get(ptep); + ptep_get_and_clear(args->mm, args->vaddr, args->ptep); + pte = ptep_get(args->ptep); WARN_ON(!pte_none(pte)); - pte = pfn_pte(pfn, prot); + pte = pfn_pte(args->pte_pfn, args->page_prot); pte = pte_wrprotect(pte); pte = pte_mkclean(pte); - set_pte_at(mm, vaddr, ptep, pte); + set_pte_at(args->mm, args->vaddr, args->ptep, pte); pte = pte_mkwrite(pte); pte = pte_mkdirty(pte); - ptep_set_access_flags(vma, vaddr, ptep, pte, 1); - pte = ptep_get(ptep); + ptep_set_access_flags(args->vma, args->vaddr, args->ptep, pte, 1); + pte = ptep_get(args->ptep); WARN_ON(!(pte_write(pte) && pte_dirty(pte))); - ptep_get_and_clear_full(mm, vaddr, ptep, 1); - pte = ptep_get(ptep); + ptep_get_and_clear_full(args->mm, args->vaddr, args->ptep, 1); + pte = ptep_get(args->ptep); WARN_ON(!pte_none(pte)); - pte = pfn_pte(pfn, prot); + pte = pfn_pte(args->pte_pfn, args->page_prot); pte = pte_mkyoung(pte); - set_pte_at(mm, vaddr, ptep, pte); - ptep_test_and_clear_young(vma, vaddr, ptep); - pte = ptep_get(ptep); + set_pte_at(args->mm, args->vaddr, args->ptep, pte); + ptep_test_and_clear_young(args->vma, args->vaddr, args->ptep); + pte = ptep_get(args->ptep); WARN_ON(pte_young(pte)); } @@ -618,20 +617,21 @@ static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp, } #endif /* PAGETABLE_P4D_FOLDED */ -static void __init pte_clear_tests(struct mm_struct *mm, pte_t *ptep, - unsigned long pfn, unsigned long vaddr, - pgprot_t prot) +static void __init pte_clear_tests(struct pgtable_debug_args *args) { - pte_t pte = pfn_pte(pfn, prot); + pte_t pte = pfn_pte(args->pte_pfn, args->page_prot); + + if (args->pte_pfn == ULONG_MAX) + return; pr_debug("Validating PTE clear\n"); #ifndef CONFIG_RISCV pte = __pte(pte_val(pte) | RANDOM_ORVALUE); #endif - set_pte_at(mm, vaddr, ptep, pte); + set_pte_at(args->mm, args->vaddr, args->ptep, pte); barrier(); - pte_clear(mm, vaddr, ptep); - pte = ptep_get(ptep); + pte_clear(args->mm, args->vaddr, args->ptep); + pte = ptep_get(args->ptep); WARN_ON(!pte_none(pte)); } @@ -1221,11 +1221,10 @@ static int __init debug_vm_pgtable(void) p4d_t *p4dp, *saved_p4dp; pud_t *pudp, *saved_pudp; pmd_t *pmdp, *saved_pmdp, pmd; - pte_t *ptep; pgtable_t saved_ptep; pgprot_t prot; phys_addr_t paddr; - unsigned long vaddr, pte_aligned, pmd_aligned; + unsigned long vaddr, pmd_aligned; unsigned long pud_aligned; spinlock_t *ptl = NULL; int idx, ret; @@ -1260,10 +1259,8 @@ static int __init debug_vm_pgtable(void) */ paddr = __pa_symbol(&start_kernel); - pte_aligned = (paddr & PAGE_MASK) >> PAGE_SHIFT; pmd_aligned = (paddr & PMD_MASK) >> PAGE_SHIFT; pud_aligned = (paddr & PUD_MASK) >> PAGE_SHIFT; - WARN_ON(!pfn_valid(pte_aligned)); pgdp = pgd_offset(mm, vaddr); p4dp = p4d_alloc(mm, pgdp, vaddr); @@ -1345,10 +1342,10 @@ static int __init debug_vm_pgtable(void) * proper page table lock. */ - ptep = pte_offset_map_lock(mm, pmdp, vaddr, &ptl); - pte_clear_tests(mm, ptep, pte_aligned, vaddr, prot); - pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot); - pte_unmap_unlock(ptep, ptl); + args.ptep = pte_offset_map_lock(args.mm, args.pmdp, args.vaddr, &ptl); + pte_clear_tests(&args); + pte_advanced_tests(&args); + pte_unmap_unlock(args.ptep, ptl); ptl = pmd_lock(mm, pmdp); pmd_clear_tests(mm, pmdp); From a9981d79371c163a32426e1e94634a405c6bd2bf Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 24 Aug 2021 09:59:04 +1000 Subject: [PATCH 060/365] mm/debug_vm_pgtable: use struct pgtable_debug_args in PMD modifying tests This uses struct pgtable_debug_args in PMD modifying tests. The allocated huge page is used when set_pmd_at() is used. The corresponding tests are skipped if the huge page doesn't exist. Besides, the unused variable @pmd_aligned in debug_vm_pgtable() is dropped. Link: https://lkml.kernel.org/r/20210809092631.1888748-9-gshan@redhat.com Signed-off-by: Gavin Shan Reviewed-by: Anshuman Khandual Tested-by: Christophe Leroy [powerpc 8xx] Tested-by: Gerald Schaefer [s390] Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Chunyu Hu Cc: Qian Cai Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/debug_vm_pgtable.c | 98 ++++++++++++++++++++----------------------- 1 file changed, 46 insertions(+), 52 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 652f26f5ecd6e..abf778f729fdc 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -211,54 +211,55 @@ static void __init pmd_basic_tests(struct pgtable_debug_args *args, int idx) WARN_ON(!pmd_bad(pmd_mkhuge(pmd))); } -static void __init pmd_advanced_tests(struct mm_struct *mm, - struct vm_area_struct *vma, pmd_t *pmdp, - unsigned long pfn, unsigned long vaddr, - pgprot_t prot, pgtable_t pgtable) +static void __init pmd_advanced_tests(struct pgtable_debug_args *args) { pmd_t pmd; + unsigned long vaddr = args->vaddr; if (!has_transparent_hugepage()) return; + if (args->pmd_pfn == ULONG_MAX) + return; + pr_debug("Validating PMD advanced\n"); /* Align the address wrt HPAGE_PMD_SIZE */ vaddr &= HPAGE_PMD_MASK; - pgtable_trans_huge_deposit(mm, pmdp, pgtable); + pgtable_trans_huge_deposit(args->mm, args->pmdp, args->start_ptep); - pmd = pfn_pmd(pfn, prot); - set_pmd_at(mm, vaddr, pmdp, pmd); - pmdp_set_wrprotect(mm, vaddr, pmdp); - pmd = READ_ONCE(*pmdp); + pmd = pfn_pmd(args->pmd_pfn, args->page_prot); + set_pmd_at(args->mm, vaddr, args->pmdp, pmd); + pmdp_set_wrprotect(args->mm, vaddr, args->pmdp); + pmd = READ_ONCE(*args->pmdp); WARN_ON(pmd_write(pmd)); - pmdp_huge_get_and_clear(mm, vaddr, pmdp); - pmd = READ_ONCE(*pmdp); + pmdp_huge_get_and_clear(args->mm, vaddr, args->pmdp); + pmd = READ_ONCE(*args->pmdp); WARN_ON(!pmd_none(pmd)); - pmd = pfn_pmd(pfn, prot); + pmd = pfn_pmd(args->pmd_pfn, args->page_prot); pmd = pmd_wrprotect(pmd); pmd = pmd_mkclean(pmd); - set_pmd_at(mm, vaddr, pmdp, pmd); + set_pmd_at(args->mm, vaddr, args->pmdp, pmd); pmd = pmd_mkwrite(pmd); pmd = pmd_mkdirty(pmd); - pmdp_set_access_flags(vma, vaddr, pmdp, pmd, 1); - pmd = READ_ONCE(*pmdp); + pmdp_set_access_flags(args->vma, vaddr, args->pmdp, pmd, 1); + pmd = READ_ONCE(*args->pmdp); WARN_ON(!(pmd_write(pmd) && pmd_dirty(pmd))); - pmdp_huge_get_and_clear_full(vma, vaddr, pmdp, 1); - pmd = READ_ONCE(*pmdp); + pmdp_huge_get_and_clear_full(args->vma, vaddr, args->pmdp, 1); + pmd = READ_ONCE(*args->pmdp); WARN_ON(!pmd_none(pmd)); - pmd = pmd_mkhuge(pfn_pmd(pfn, prot)); + pmd = pmd_mkhuge(pfn_pmd(args->pmd_pfn, args->page_prot)); pmd = pmd_mkyoung(pmd); - set_pmd_at(mm, vaddr, pmdp, pmd); - pmdp_test_and_clear_young(vma, vaddr, pmdp); - pmd = READ_ONCE(*pmdp); + set_pmd_at(args->mm, vaddr, args->pmdp, pmd); + pmdp_test_and_clear_young(args->vma, vaddr, args->pmdp); + pmd = READ_ONCE(*args->pmdp); WARN_ON(pmd_young(pmd)); /* Clear the pte entries */ - pmdp_huge_get_and_clear(mm, vaddr, pmdp); - pgtable = pgtable_trans_huge_withdraw(mm, pmdp); + pmdp_huge_get_and_clear(args->mm, vaddr, args->pmdp); + pgtable_trans_huge_withdraw(args->mm, args->pmdp); } static void __init pmd_leaf_tests(struct pgtable_debug_args *args) @@ -415,12 +416,7 @@ static void __init pud_leaf_tests(struct pgtable_debug_args *args) { } #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ static void __init pmd_basic_tests(struct pgtable_debug_args *args, int idx) { } static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx) { } -static void __init pmd_advanced_tests(struct mm_struct *mm, - struct vm_area_struct *vma, pmd_t *pmdp, - unsigned long pfn, unsigned long vaddr, - pgprot_t prot, pgtable_t pgtable) -{ -} +static void __init pmd_advanced_tests(struct pgtable_debug_args *args) { } static void __init pud_advanced_tests(struct mm_struct *mm, struct vm_area_struct *vma, pud_t *pudp, unsigned long pfn, unsigned long vaddr, @@ -433,11 +429,11 @@ static void __init pmd_savedwrite_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP -static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot) +static void __init pmd_huge_tests(struct pgtable_debug_args *args) { pmd_t pmd; - if (!arch_vmap_pmd_supported(prot)) + if (!arch_vmap_pmd_supported(args->page_prot)) return; pr_debug("Validating PMD huge\n"); @@ -445,10 +441,10 @@ static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot) * X86 defined pmd_set_huge() verifies that the given * PMD is not a populated non-leaf entry. */ - WRITE_ONCE(*pmdp, __pmd(0)); - WARN_ON(!pmd_set_huge(pmdp, __pfn_to_phys(pfn), prot)); - WARN_ON(!pmd_clear_huge(pmdp)); - pmd = READ_ONCE(*pmdp); + WRITE_ONCE(*args->pmdp, __pmd(0)); + WARN_ON(!pmd_set_huge(args->pmdp, __pfn_to_phys(args->fixed_pmd_pfn), args->page_prot)); + WARN_ON(!pmd_clear_huge(args->pmdp)); + pmd = READ_ONCE(*args->pmdp); WARN_ON(!pmd_none(pmd)); } @@ -471,7 +467,7 @@ static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) WARN_ON(!pud_none(pud)); } #else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ -static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot) { } +static void __init pmd_huge_tests(struct pgtable_debug_args *args) { } static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) { } #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ @@ -635,20 +631,19 @@ static void __init pte_clear_tests(struct pgtable_debug_args *args) WARN_ON(!pte_none(pte)); } -static void __init pmd_clear_tests(struct mm_struct *mm, pmd_t *pmdp) +static void __init pmd_clear_tests(struct pgtable_debug_args *args) { - pmd_t pmd = READ_ONCE(*pmdp); + pmd_t pmd = READ_ONCE(*args->pmdp); pr_debug("Validating PMD clear\n"); pmd = __pmd(pmd_val(pmd) | RANDOM_ORVALUE); - WRITE_ONCE(*pmdp, pmd); - pmd_clear(pmdp); - pmd = READ_ONCE(*pmdp); + WRITE_ONCE(*args->pmdp, pmd); + pmd_clear(args->pmdp); + pmd = READ_ONCE(*args->pmdp); WARN_ON(!pmd_none(pmd)); } -static void __init pmd_populate_tests(struct mm_struct *mm, pmd_t *pmdp, - pgtable_t pgtable) +static void __init pmd_populate_tests(struct pgtable_debug_args *args) { pmd_t pmd; @@ -657,8 +652,8 @@ static void __init pmd_populate_tests(struct mm_struct *mm, pmd_t *pmdp, * This entry points to next level page table page. * Hence this must not qualify as pmd_bad(). */ - pmd_populate(mm, pmdp, pgtable); - pmd = READ_ONCE(*pmdp); + pmd_populate(args->mm, args->pmdp, args->start_ptep); + pmd = READ_ONCE(*args->pmdp); WARN_ON(pmd_bad(pmd)); } @@ -1224,7 +1219,7 @@ static int __init debug_vm_pgtable(void) pgtable_t saved_ptep; pgprot_t prot; phys_addr_t paddr; - unsigned long vaddr, pmd_aligned; + unsigned long vaddr; unsigned long pud_aligned; spinlock_t *ptl = NULL; int idx, ret; @@ -1259,7 +1254,6 @@ static int __init debug_vm_pgtable(void) */ paddr = __pa_symbol(&start_kernel); - pmd_aligned = (paddr & PMD_MASK) >> PAGE_SHIFT; pud_aligned = (paddr & PUD_MASK) >> PAGE_SHIFT; pgdp = pgd_offset(mm, vaddr); @@ -1347,11 +1341,11 @@ static int __init debug_vm_pgtable(void) pte_advanced_tests(&args); pte_unmap_unlock(args.ptep, ptl); - ptl = pmd_lock(mm, pmdp); - pmd_clear_tests(mm, pmdp); - pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot, saved_ptep); - pmd_huge_tests(pmdp, pmd_aligned, prot); - pmd_populate_tests(mm, pmdp, saved_ptep); + ptl = pmd_lock(args.mm, args.pmdp); + pmd_clear_tests(&args); + pmd_advanced_tests(&args); + pmd_huge_tests(&args); + pmd_populate_tests(&args); spin_unlock(ptl); ptl = pud_lock(mm, pudp); From fd102d17a5c334f26409c618ecf668611d67c214 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 24 Aug 2021 09:59:05 +1000 Subject: [PATCH 061/365] mm/debug_vm_pgtable: use struct pgtable_debug_args in PUD modifying tests This uses struct pgtable_debug_args in PUD modifying tests. The allocated huge page is used when set_pud_at() is used. The corresponding tests are skipped if the huge page doesn't exist. Besides, the following unused variables in debug_vm_pgtable() are dropped: @prot, @paddr, @pud_aligned. Link: https://lkml.kernel.org/r/20210809092631.1888748-10-gshan@redhat.com Signed-off-by: Gavin Shan Reviewed-by: Anshuman Khandual Tested-by: Christophe Leroy [powerpc 8xx] Tested-by: Gerald Schaefer [s390] Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Chunyu Hu Cc: Qian Cai Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/debug_vm_pgtable.c | 126 ++++++++++++++++-------------------------- 1 file changed, 48 insertions(+), 78 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index abf778f729fdc..4e453ae0f2912 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -337,55 +337,56 @@ static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx) WARN_ON(!pud_bad(pud_mkhuge(pud))); } -static void __init pud_advanced_tests(struct mm_struct *mm, - struct vm_area_struct *vma, pud_t *pudp, - unsigned long pfn, unsigned long vaddr, - pgprot_t prot) +static void __init pud_advanced_tests(struct pgtable_debug_args *args) { + unsigned long vaddr = args->vaddr; pud_t pud; if (!has_transparent_hugepage()) return; + if (args->pud_pfn == ULONG_MAX) + return; + pr_debug("Validating PUD advanced\n"); /* Align the address wrt HPAGE_PUD_SIZE */ vaddr &= HPAGE_PUD_MASK; - pud = pfn_pud(pfn, prot); - set_pud_at(mm, vaddr, pudp, pud); - pudp_set_wrprotect(mm, vaddr, pudp); - pud = READ_ONCE(*pudp); + pud = pfn_pud(args->pud_pfn, args->page_prot); + set_pud_at(args->mm, vaddr, args->pudp, pud); + pudp_set_wrprotect(args->mm, vaddr, args->pudp); + pud = READ_ONCE(*args->pudp); WARN_ON(pud_write(pud)); #ifndef __PAGETABLE_PMD_FOLDED - pudp_huge_get_and_clear(mm, vaddr, pudp); - pud = READ_ONCE(*pudp); + pudp_huge_get_and_clear(args->mm, vaddr, args->pudp); + pud = READ_ONCE(*args->pudp); WARN_ON(!pud_none(pud)); #endif /* __PAGETABLE_PMD_FOLDED */ - pud = pfn_pud(pfn, prot); + pud = pfn_pud(args->pud_pfn, args->page_prot); pud = pud_wrprotect(pud); pud = pud_mkclean(pud); - set_pud_at(mm, vaddr, pudp, pud); + set_pud_at(args->mm, vaddr, args->pudp, pud); pud = pud_mkwrite(pud); pud = pud_mkdirty(pud); - pudp_set_access_flags(vma, vaddr, pudp, pud, 1); - pud = READ_ONCE(*pudp); + pudp_set_access_flags(args->vma, vaddr, args->pudp, pud, 1); + pud = READ_ONCE(*args->pudp); WARN_ON(!(pud_write(pud) && pud_dirty(pud))); #ifndef __PAGETABLE_PMD_FOLDED - pudp_huge_get_and_clear_full(mm, vaddr, pudp, 1); - pud = READ_ONCE(*pudp); + pudp_huge_get_and_clear_full(args->mm, vaddr, args->pudp, 1); + pud = READ_ONCE(*args->pudp); WARN_ON(!pud_none(pud)); #endif /* __PAGETABLE_PMD_FOLDED */ - pud = pfn_pud(pfn, prot); + pud = pfn_pud(args->pud_pfn, args->page_prot); pud = pud_mkyoung(pud); - set_pud_at(mm, vaddr, pudp, pud); - pudp_test_and_clear_young(vma, vaddr, pudp); - pud = READ_ONCE(*pudp); + set_pud_at(args->mm, vaddr, args->pudp, pud); + pudp_test_and_clear_young(args->vma, vaddr, args->pudp); + pud = READ_ONCE(*args->pudp); WARN_ON(pud_young(pud)); - pudp_huge_get_and_clear(mm, vaddr, pudp); + pudp_huge_get_and_clear(args->mm, vaddr, args->pudp); } static void __init pud_leaf_tests(struct pgtable_debug_args *args) @@ -405,24 +406,14 @@ static void __init pud_leaf_tests(struct pgtable_debug_args *args) } #else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx) { } -static void __init pud_advanced_tests(struct mm_struct *mm, - struct vm_area_struct *vma, pud_t *pudp, - unsigned long pfn, unsigned long vaddr, - pgprot_t prot) -{ -} +static void __init pud_advanced_tests(struct pgtable_debug_args *args) { } static void __init pud_leaf_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ static void __init pmd_basic_tests(struct pgtable_debug_args *args, int idx) { } static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx) { } static void __init pmd_advanced_tests(struct pgtable_debug_args *args) { } -static void __init pud_advanced_tests(struct mm_struct *mm, - struct vm_area_struct *vma, pud_t *pudp, - unsigned long pfn, unsigned long vaddr, - pgprot_t prot) -{ -} +static void __init pud_advanced_tests(struct pgtable_debug_args *args) { } static void __init pmd_leaf_tests(struct pgtable_debug_args *args) { } static void __init pud_leaf_tests(struct pgtable_debug_args *args) { } static void __init pmd_savedwrite_tests(struct pgtable_debug_args *args) { } @@ -448,11 +439,11 @@ static void __init pmd_huge_tests(struct pgtable_debug_args *args) WARN_ON(!pmd_none(pmd)); } -static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) +static void __init pud_huge_tests(struct pgtable_debug_args *args) { pud_t pud; - if (!arch_vmap_pud_supported(prot)) + if (!arch_vmap_pud_supported(args->page_prot)) return; pr_debug("Validating PUD huge\n"); @@ -460,15 +451,15 @@ static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) * X86 defined pud_set_huge() verifies that the given * PUD is not a populated non-leaf entry. */ - WRITE_ONCE(*pudp, __pud(0)); - WARN_ON(!pud_set_huge(pudp, __pfn_to_phys(pfn), prot)); - WARN_ON(!pud_clear_huge(pudp)); - pud = READ_ONCE(*pudp); + WRITE_ONCE(*args->pudp, __pud(0)); + WARN_ON(!pud_set_huge(args->pudp, __pfn_to_phys(args->fixed_pud_pfn), args->page_prot)); + WARN_ON(!pud_clear_huge(args->pudp)); + pud = READ_ONCE(*args->pudp); WARN_ON(!pud_none(pud)); } #else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ static void __init pmd_huge_tests(struct pgtable_debug_args *args) { } -static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) { } +static void __init pud_huge_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ static void __init p4d_basic_tests(struct pgtable_debug_args *args) @@ -490,27 +481,26 @@ static void __init pgd_basic_tests(struct pgtable_debug_args *args) } #ifndef __PAGETABLE_PUD_FOLDED -static void __init pud_clear_tests(struct mm_struct *mm, pud_t *pudp) +static void __init pud_clear_tests(struct pgtable_debug_args *args) { - pud_t pud = READ_ONCE(*pudp); + pud_t pud = READ_ONCE(*args->pudp); - if (mm_pmd_folded(mm)) + if (mm_pmd_folded(args->mm)) return; pr_debug("Validating PUD clear\n"); pud = __pud(pud_val(pud) | RANDOM_ORVALUE); - WRITE_ONCE(*pudp, pud); - pud_clear(pudp); - pud = READ_ONCE(*pudp); + WRITE_ONCE(*args->pudp, pud); + pud_clear(args->pudp); + pud = READ_ONCE(*args->pudp); WARN_ON(!pud_none(pud)); } -static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp, - pmd_t *pmdp) +static void __init pud_populate_tests(struct pgtable_debug_args *args) { pud_t pud; - if (mm_pmd_folded(mm)) + if (mm_pmd_folded(args->mm)) return; pr_debug("Validating PUD populate\n"); @@ -518,16 +508,13 @@ static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp, * This entry points to next level page table page. * Hence this must not qualify as pud_bad(). */ - pud_populate(mm, pudp, pmdp); - pud = READ_ONCE(*pudp); + pud_populate(args->mm, args->pudp, args->start_pmdp); + pud = READ_ONCE(*args->pudp); WARN_ON(pud_bad(pud)); } #else /* !__PAGETABLE_PUD_FOLDED */ -static void __init pud_clear_tests(struct mm_struct *mm, pud_t *pudp) { } -static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp, - pmd_t *pmdp) -{ -} +static void __init pud_clear_tests(struct pgtable_debug_args *args) { } +static void __init pud_populate_tests(struct pgtable_debug_args *args) { } #endif /* PAGETABLE_PUD_FOLDED */ #ifndef __PAGETABLE_P4D_FOLDED @@ -1217,10 +1204,7 @@ static int __init debug_vm_pgtable(void) pud_t *pudp, *saved_pudp; pmd_t *pmdp, *saved_pmdp, pmd; pgtable_t saved_ptep; - pgprot_t prot; - phys_addr_t paddr; unsigned long vaddr; - unsigned long pud_aligned; spinlock_t *ptl = NULL; int idx, ret; @@ -1229,7 +1213,6 @@ static int __init debug_vm_pgtable(void) if (ret) return ret; - prot = vm_get_page_prot(VMFLAGS); vaddr = get_random_vaddr(); mm = mm_alloc(); if (!mm) { @@ -1243,19 +1226,6 @@ static int __init debug_vm_pgtable(void) return 1; } - /* - * PFN for mapping at PTE level is determined from a standard kernel - * text symbol. But pfns for higher page table levels are derived by - * masking lower bits of this real pfn. These derived pfns might not - * exist on the platform but that does not really matter as pfn_pxx() - * helpers will still create appropriate entries for the test. This - * helps avoid large memory block allocations to be used for mapping - * at higher page table levels. - */ - paddr = __pa_symbol(&start_kernel); - - pud_aligned = (paddr & PUD_MASK) >> PAGE_SHIFT; - pgdp = pgd_offset(mm, vaddr); p4dp = p4d_alloc(mm, pgdp, vaddr); pudp = pud_alloc(mm, p4dp, vaddr); @@ -1348,11 +1318,11 @@ static int __init debug_vm_pgtable(void) pmd_populate_tests(&args); spin_unlock(ptl); - ptl = pud_lock(mm, pudp); - pud_clear_tests(mm, pudp); - pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot); - pud_huge_tests(pudp, pud_aligned, prot); - pud_populate_tests(mm, pudp, saved_pmdp); + ptl = pud_lock(args.mm, args.pudp); + pud_clear_tests(&args); + pud_advanced_tests(&args); + pud_huge_tests(&args); + pud_populate_tests(&args); spin_unlock(ptl); spin_lock(&mm->page_table_lock); From 6a147c2f9967eba8a2566e7ecdad056517deb432 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 24 Aug 2021 09:59:05 +1000 Subject: [PATCH 062/365] mm/debug_vm_pgtable: use struct pgtable_debug_args in PGD and P4D modifying tests This uses struct pgtable_debug_args in PGD/P4D modifying tests. No allocated huge page is used in these tests. Besides, the unused variable @saved_p4dp and @saved_pudp are dropped. Link: https://lkml.kernel.org/r/20210809092631.1888748-11-gshan@redhat.com Signed-off-by: Gavin Shan Reviewed-by: Anshuman Khandual Tested-by: Christophe Leroy [powerpc 8xx] Tested-by: Gerald Schaefer [s390] Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Chunyu Hu Cc: Qian Cai Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/debug_vm_pgtable.c | 86 +++++++++++++++++++------------------------ 1 file changed, 38 insertions(+), 48 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 4e453ae0f2912..9e932ae9f1580 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -518,27 +518,26 @@ static void __init pud_populate_tests(struct pgtable_debug_args *args) { } #endif /* PAGETABLE_PUD_FOLDED */ #ifndef __PAGETABLE_P4D_FOLDED -static void __init p4d_clear_tests(struct mm_struct *mm, p4d_t *p4dp) +static void __init p4d_clear_tests(struct pgtable_debug_args *args) { - p4d_t p4d = READ_ONCE(*p4dp); + p4d_t p4d = READ_ONCE(*args->p4dp); - if (mm_pud_folded(mm)) + if (mm_pud_folded(args->mm)) return; pr_debug("Validating P4D clear\n"); p4d = __p4d(p4d_val(p4d) | RANDOM_ORVALUE); - WRITE_ONCE(*p4dp, p4d); - p4d_clear(p4dp); - p4d = READ_ONCE(*p4dp); + WRITE_ONCE(*args->p4dp, p4d); + p4d_clear(args->p4dp); + p4d = READ_ONCE(*args->p4dp); WARN_ON(!p4d_none(p4d)); } -static void __init p4d_populate_tests(struct mm_struct *mm, p4d_t *p4dp, - pud_t *pudp) +static void __init p4d_populate_tests(struct pgtable_debug_args *args) { p4d_t p4d; - if (mm_pud_folded(mm)) + if (mm_pud_folded(args->mm)) return; pr_debug("Validating P4D populate\n"); @@ -546,34 +545,33 @@ static void __init p4d_populate_tests(struct mm_struct *mm, p4d_t *p4dp, * This entry points to next level page table page. * Hence this must not qualify as p4d_bad(). */ - pud_clear(pudp); - p4d_clear(p4dp); - p4d_populate(mm, p4dp, pudp); - p4d = READ_ONCE(*p4dp); + pud_clear(args->pudp); + p4d_clear(args->p4dp); + p4d_populate(args->mm, args->p4dp, args->start_pudp); + p4d = READ_ONCE(*args->p4dp); WARN_ON(p4d_bad(p4d)); } -static void __init pgd_clear_tests(struct mm_struct *mm, pgd_t *pgdp) +static void __init pgd_clear_tests(struct pgtable_debug_args *args) { - pgd_t pgd = READ_ONCE(*pgdp); + pgd_t pgd = READ_ONCE(*(args->pgdp)); - if (mm_p4d_folded(mm)) + if (mm_p4d_folded(args->mm)) return; pr_debug("Validating PGD clear\n"); pgd = __pgd(pgd_val(pgd) | RANDOM_ORVALUE); - WRITE_ONCE(*pgdp, pgd); - pgd_clear(pgdp); - pgd = READ_ONCE(*pgdp); + WRITE_ONCE(*args->pgdp, pgd); + pgd_clear(args->pgdp); + pgd = READ_ONCE(*args->pgdp); WARN_ON(!pgd_none(pgd)); } -static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp, - p4d_t *p4dp) +static void __init pgd_populate_tests(struct pgtable_debug_args *args) { pgd_t pgd; - if (mm_p4d_folded(mm)) + if (mm_p4d_folded(args->mm)) return; pr_debug("Validating PGD populate\n"); @@ -581,23 +579,17 @@ static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp, * This entry points to next level page table page. * Hence this must not qualify as pgd_bad(). */ - p4d_clear(p4dp); - pgd_clear(pgdp); - pgd_populate(mm, pgdp, p4dp); - pgd = READ_ONCE(*pgdp); + p4d_clear(args->p4dp); + pgd_clear(args->pgdp); + pgd_populate(args->mm, args->pgdp, args->start_p4dp); + pgd = READ_ONCE(*args->pgdp); WARN_ON(pgd_bad(pgd)); } #else /* !__PAGETABLE_P4D_FOLDED */ -static void __init p4d_clear_tests(struct mm_struct *mm, p4d_t *p4dp) { } -static void __init pgd_clear_tests(struct mm_struct *mm, pgd_t *pgdp) { } -static void __init p4d_populate_tests(struct mm_struct *mm, p4d_t *p4dp, - pud_t *pudp) -{ -} -static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp, - p4d_t *p4dp) -{ -} +static void __init p4d_clear_tests(struct pgtable_debug_args *args) { } +static void __init pgd_clear_tests(struct pgtable_debug_args *args) { } +static void __init p4d_populate_tests(struct pgtable_debug_args *args) { } +static void __init pgd_populate_tests(struct pgtable_debug_args *args) { } #endif /* PAGETABLE_P4D_FOLDED */ static void __init pte_clear_tests(struct pgtable_debug_args *args) @@ -1200,8 +1192,8 @@ static int __init debug_vm_pgtable(void) struct vm_area_struct *vma; struct mm_struct *mm; pgd_t *pgdp; - p4d_t *p4dp, *saved_p4dp; - pud_t *pudp, *saved_pudp; + p4d_t *p4dp; + pud_t *pudp; pmd_t *pmdp, *saved_pmdp, pmd; pgtable_t saved_ptep; unsigned long vaddr; @@ -1245,8 +1237,6 @@ static int __init debug_vm_pgtable(void) * page table pages. */ pmd = READ_ONCE(*pmdp); - saved_p4dp = p4d_offset(pgdp, 0UL); - saved_pudp = pud_offset(p4dp, 0UL); saved_pmdp = pmd_offset(pudp, 0UL); saved_ptep = pmd_pgtable(pmd); @@ -1325,15 +1315,15 @@ static int __init debug_vm_pgtable(void) pud_populate_tests(&args); spin_unlock(ptl); - spin_lock(&mm->page_table_lock); - p4d_clear_tests(mm, p4dp); - pgd_clear_tests(mm, pgdp); - p4d_populate_tests(mm, p4dp, saved_pudp); - pgd_populate_tests(mm, pgdp, saved_p4dp); - spin_unlock(&mm->page_table_lock); + spin_lock(&(args.mm->page_table_lock)); + p4d_clear_tests(&args); + pgd_clear_tests(&args); + p4d_populate_tests(&args); + pgd_populate_tests(&args); + spin_unlock(&(args.mm->page_table_lock)); - p4d_free(mm, saved_p4dp); - pud_free(mm, saved_pudp); + p4d_free(mm, p4d_offset(pgdp, 0UL)); + pud_free(mm, pud_offset(p4dp, 0UL)); pmd_free(mm, saved_pmdp); pte_free(mm, saved_ptep); From cf2dc078981fe2c135e2123ef2c0db3757246f81 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 24 Aug 2021 09:59:05 +1000 Subject: [PATCH 063/365] mm/debug_vm_pgtable: remove unused code The variables used by old implementation isn't needed as we switched to "struct pgtable_debug_args". Lets remove them and related code in debug_vm_pgtable(). Link: https://lkml.kernel.org/r/20210809092631.1888748-12-gshan@redhat.com Signed-off-by: Gavin Shan Reviewed-by: Anshuman Khandual Tested-by: Christophe Leroy [powerpc 8xx] Tested-by: Gerald Schaefer [s390] Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Chunyu Hu Cc: Qian Cai Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/debug_vm_pgtable.c | 54 ------------------------------------------- 1 file changed, 54 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 9e932ae9f1580..4409e30adcf65 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -1189,14 +1189,6 @@ static int __init init_args(struct pgtable_debug_args *args) static int __init debug_vm_pgtable(void) { struct pgtable_debug_args args; - struct vm_area_struct *vma; - struct mm_struct *mm; - pgd_t *pgdp; - p4d_t *p4dp; - pud_t *pudp; - pmd_t *pmdp, *saved_pmdp, pmd; - pgtable_t saved_ptep; - unsigned long vaddr; spinlock_t *ptl = NULL; int idx, ret; @@ -1205,41 +1197,6 @@ static int __init debug_vm_pgtable(void) if (ret) return ret; - vaddr = get_random_vaddr(); - mm = mm_alloc(); - if (!mm) { - pr_err("mm_struct allocation failed\n"); - return 1; - } - - vma = vm_area_alloc(mm); - if (!vma) { - pr_err("vma allocation failed\n"); - return 1; - } - - pgdp = pgd_offset(mm, vaddr); - p4dp = p4d_alloc(mm, pgdp, vaddr); - pudp = pud_alloc(mm, p4dp, vaddr); - pmdp = pmd_alloc(mm, pudp, vaddr); - /* - * Allocate pgtable_t - */ - if (pte_alloc(mm, pmdp)) { - pr_err("pgtable allocation failed\n"); - return 1; - } - - /* - * Save all the page table page addresses as the page table - * entries will be used for testing with random or garbage - * values. These saved addresses will be used for freeing - * page table pages. - */ - pmd = READ_ONCE(*pmdp); - saved_pmdp = pmd_offset(pudp, 0UL); - saved_ptep = pmd_pgtable(pmd); - /* * Iterate over the protection_map[] to make sure that all * the basic page table transformation validations just hold @@ -1322,17 +1279,6 @@ static int __init debug_vm_pgtable(void) pgd_populate_tests(&args); spin_unlock(&(args.mm->page_table_lock)); - p4d_free(mm, p4d_offset(pgdp, 0UL)); - pud_free(mm, pud_offset(p4dp, 0UL)); - pmd_free(mm, saved_pmdp); - pte_free(mm, saved_ptep); - - vm_area_free(vma); - mm_dec_nr_puds(mm); - mm_dec_nr_pmds(mm); - mm_dec_nr_ptes(mm); - mmdrop(mm); - destroy_args(&args); return 0; } From 813e68a8c38b85b254a3580d3ffbb1eb1cfb36a2 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 24 Aug 2021 09:59:05 +1000 Subject: [PATCH 064/365] mm/debug_vm_pgtable: fix corrupted page flag In page table entry modifying tests, set_xxx_at() are used to populate the page table entries. On ARM64, PG_arch_1 (PG_dcache_clean) flag is set to the target page flag if execution permission is given. The logic exits since commit 4f04d8f00545 ("arm64: MMU definitions"). The page flag is kept when the page is free'd to buddy's free area list. However, it will trigger page checking failure when it's pulled from the buddy's free area list, as the following warning messages indicate. BUG: Bad page state in process memhog pfn:08000 page:0000000015c0a628 refcount:0 mapcount:0 \ mapping:0000000000000000 index:0x1 pfn:0x8000 flags: 0x7ffff8000000800(arch_1|node=0|zone=0|lastcpupid=0xfffff) raw: 07ffff8000000800 dead000000000100 dead000000000122 0000000000000000 raw: 0000000000000001 0000000000000000 00000000ffffffff 0000000000000000 page dumped because: PAGE_FLAGS_CHECK_AT_PREP flag(s) set This fixes the issue by clearing PG_arch_1 through flush_dcache_page() after set_xxx_at() is called. For architectures other than ARM64, the unexpected overhead of cache flushing is acceptable. Link: https://lkml.kernel.org/r/20210809092631.1888748-13-gshan@redhat.com Fixes: a5c3b9ffb0f4 ("mm/debug_vm_pgtable: add tests validating advanced arch page table helpers") Signed-off-by: Gavin Shan Reviewed-by: Anshuman Khandual Tested-by: Christophe Leroy [powerpc 8xx] Tested-by: Gerald Schaefer [s390] Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Chunyu Hu Cc: Qian Cai Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/debug_vm_pgtable.c | 55 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 51 insertions(+), 4 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 4409e30adcf65..1403639302e48 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -29,6 +29,8 @@ #include #include #include + +#include #include #include @@ -119,19 +121,28 @@ static void __init pte_basic_tests(struct pgtable_debug_args *args, int idx) static void __init pte_advanced_tests(struct pgtable_debug_args *args) { + struct page *page; pte_t pte; /* * Architectures optimize set_pte_at by avoiding TLB flush. * This requires set_pte_at to be not used to update an * existing pte entry. Clear pte before we do set_pte_at + * + * flush_dcache_page() is called after set_pte_at() to clear + * PG_arch_1 for the page on ARM64. The page flag isn't cleared + * when it's released and page allocation check will fail when + * the page is allocated again. For architectures other than ARM64, + * the unexpected overhead of cache flushing is acceptable. */ - if (args->pte_pfn == ULONG_MAX) + page = (args->pte_pfn != ULONG_MAX) ? pfn_to_page(args->pte_pfn) : NULL; + if (!page) return; pr_debug("Validating PTE advanced\n"); pte = pfn_pte(args->pte_pfn, args->page_prot); set_pte_at(args->mm, args->vaddr, args->ptep, pte); + flush_dcache_page(page); ptep_set_wrprotect(args->mm, args->vaddr, args->ptep); pte = ptep_get(args->ptep); WARN_ON(pte_write(pte)); @@ -143,6 +154,7 @@ static void __init pte_advanced_tests(struct pgtable_debug_args *args) pte = pte_wrprotect(pte); pte = pte_mkclean(pte); set_pte_at(args->mm, args->vaddr, args->ptep, pte); + flush_dcache_page(page); pte = pte_mkwrite(pte); pte = pte_mkdirty(pte); ptep_set_access_flags(args->vma, args->vaddr, args->ptep, pte, 1); @@ -155,6 +167,7 @@ static void __init pte_advanced_tests(struct pgtable_debug_args *args) pte = pfn_pte(args->pte_pfn, args->page_prot); pte = pte_mkyoung(pte); set_pte_at(args->mm, args->vaddr, args->ptep, pte); + flush_dcache_page(page); ptep_test_and_clear_young(args->vma, args->vaddr, args->ptep); pte = ptep_get(args->ptep); WARN_ON(pte_young(pte)); @@ -213,15 +226,24 @@ static void __init pmd_basic_tests(struct pgtable_debug_args *args, int idx) static void __init pmd_advanced_tests(struct pgtable_debug_args *args) { + struct page *page; pmd_t pmd; unsigned long vaddr = args->vaddr; if (!has_transparent_hugepage()) return; - if (args->pmd_pfn == ULONG_MAX) + page = (args->pmd_pfn != ULONG_MAX) ? pfn_to_page(args->pmd_pfn) : NULL; + if (!page) return; + /* + * flush_dcache_page() is called after set_pmd_at() to clear + * PG_arch_1 for the page on ARM64. The page flag isn't cleared + * when it's released and page allocation check will fail when + * the page is allocated again. For architectures other than ARM64, + * the unexpected overhead of cache flushing is acceptable. + */ pr_debug("Validating PMD advanced\n"); /* Align the address wrt HPAGE_PMD_SIZE */ vaddr &= HPAGE_PMD_MASK; @@ -230,6 +252,7 @@ static void __init pmd_advanced_tests(struct pgtable_debug_args *args) pmd = pfn_pmd(args->pmd_pfn, args->page_prot); set_pmd_at(args->mm, vaddr, args->pmdp, pmd); + flush_dcache_page(page); pmdp_set_wrprotect(args->mm, vaddr, args->pmdp); pmd = READ_ONCE(*args->pmdp); WARN_ON(pmd_write(pmd)); @@ -241,6 +264,7 @@ static void __init pmd_advanced_tests(struct pgtable_debug_args *args) pmd = pmd_wrprotect(pmd); pmd = pmd_mkclean(pmd); set_pmd_at(args->mm, vaddr, args->pmdp, pmd); + flush_dcache_page(page); pmd = pmd_mkwrite(pmd); pmd = pmd_mkdirty(pmd); pmdp_set_access_flags(args->vma, vaddr, args->pmdp, pmd, 1); @@ -253,6 +277,7 @@ static void __init pmd_advanced_tests(struct pgtable_debug_args *args) pmd = pmd_mkhuge(pfn_pmd(args->pmd_pfn, args->page_prot)); pmd = pmd_mkyoung(pmd); set_pmd_at(args->mm, vaddr, args->pmdp, pmd); + flush_dcache_page(page); pmdp_test_and_clear_young(args->vma, vaddr, args->pmdp); pmd = READ_ONCE(*args->pmdp); WARN_ON(pmd_young(pmd)); @@ -339,21 +364,31 @@ static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx) static void __init pud_advanced_tests(struct pgtable_debug_args *args) { + struct page *page; unsigned long vaddr = args->vaddr; pud_t pud; if (!has_transparent_hugepage()) return; - if (args->pud_pfn == ULONG_MAX) + page = (args->pud_pfn != ULONG_MAX) ? pfn_to_page(args->pud_pfn) : NULL; + if (!page) return; + /* + * flush_dcache_page() is called after set_pud_at() to clear + * PG_arch_1 for the page on ARM64. The page flag isn't cleared + * when it's released and page allocation check will fail when + * the page is allocated again. For architectures other than ARM64, + * the unexpected overhead of cache flushing is acceptable. + */ pr_debug("Validating PUD advanced\n"); /* Align the address wrt HPAGE_PUD_SIZE */ vaddr &= HPAGE_PUD_MASK; pud = pfn_pud(args->pud_pfn, args->page_prot); set_pud_at(args->mm, vaddr, args->pudp, pud); + flush_dcache_page(page); pudp_set_wrprotect(args->mm, vaddr, args->pudp); pud = READ_ONCE(*args->pudp); WARN_ON(pud_write(pud)); @@ -367,6 +402,7 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args) pud = pud_wrprotect(pud); pud = pud_mkclean(pud); set_pud_at(args->mm, vaddr, args->pudp, pud); + flush_dcache_page(page); pud = pud_mkwrite(pud); pud = pud_mkdirty(pud); pudp_set_access_flags(args->vma, vaddr, args->pudp, pud, 1); @@ -382,6 +418,7 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args) pud = pfn_pud(args->pud_pfn, args->page_prot); pud = pud_mkyoung(pud); set_pud_at(args->mm, vaddr, args->pudp, pud); + flush_dcache_page(page); pudp_test_and_clear_young(args->vma, vaddr, args->pudp); pud = READ_ONCE(*args->pudp); WARN_ON(pud_young(pud)); @@ -594,16 +631,26 @@ static void __init pgd_populate_tests(struct pgtable_debug_args *args) { } static void __init pte_clear_tests(struct pgtable_debug_args *args) { + struct page *page; pte_t pte = pfn_pte(args->pte_pfn, args->page_prot); - if (args->pte_pfn == ULONG_MAX) + page = (args->pte_pfn != ULONG_MAX) ? pfn_to_page(args->pte_pfn) : NULL; + if (!page) return; + /* + * flush_dcache_page() is called after set_pte_at() to clear + * PG_arch_1 for the page on ARM64. The page flag isn't cleared + * when it's released and page allocation check will fail when + * the page is allocated again. For architectures other than ARM64, + * the unexpected overhead of cache flushing is acceptable. + */ pr_debug("Validating PTE clear\n"); #ifndef CONFIG_RISCV pte = __pte(pte_val(pte) | RANDOM_ORVALUE); #endif set_pte_at(args->mm, args->vaddr, args->ptep, pte); + flush_dcache_page(page); barrier(); pte_clear(args->mm, args->vaddr, args->ptep); pte = ptep_get(args->ptep); From 5eaefecd845b89cef85e050b9c4591b2a0c48171 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 24 Aug 2021 09:59:05 +1000 Subject: [PATCH 065/365] mm: report a more useful address for reclaim acquisition A recent lockdep report included these lines: [ 96.177910] 3 locks held by containerd/770: [ 96.177934] #0: ffff88810815ea28 (&mm->mmap_lock#2){++++}-{3:3}, at: do_user_addr_fault+0x115/0x770 [ 96.177999] #1: ffffffff82915020 (rcu_read_lock){....}-{1:2}, at: get_swap_device+0x33/0x140 [ 96.178057] #2: ffffffff82955ba0 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30 While it was not useful to that bug report to know where the reclaim lock had been acquired, it might be useful under other circumstances. Allow the caller of __fs_reclaim_acquire to specify the instruction pointer to use. Link: https://lkml.kernel.org/r/20210719185709.1755149-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Omar Sandoval Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Boqun Feng Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/sched/mm.h | 8 ++++---- mm/page_alloc.c | 12 ++++++------ mm/vmscan.c | 8 ++++---- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index e24b1fe348e3b..8894825cc4db2 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -174,13 +174,13 @@ static inline gfp_t current_gfp_context(gfp_t flags) } #ifdef CONFIG_LOCKDEP -extern void __fs_reclaim_acquire(void); -extern void __fs_reclaim_release(void); +extern void __fs_reclaim_acquire(unsigned long ip); +extern void __fs_reclaim_release(unsigned long ip); extern void fs_reclaim_acquire(gfp_t gfp_mask); extern void fs_reclaim_release(gfp_t gfp_mask); #else -static inline void __fs_reclaim_acquire(void) { } -static inline void __fs_reclaim_release(void) { } +static inline void __fs_reclaim_acquire(unsigned long ip) { } +static inline void __fs_reclaim_release(unsigned long ip) { } static inline void fs_reclaim_acquire(gfp_t gfp_mask) { } static inline void fs_reclaim_release(gfp_t gfp_mask) { } #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eeb3a9cb36bb4..51c17bf7b127f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4549,14 +4549,14 @@ static bool __need_reclaim(gfp_t gfp_mask) return true; } -void __fs_reclaim_acquire(void) +void __fs_reclaim_acquire(unsigned long ip) { - lock_map_acquire(&__fs_reclaim_map); + lock_acquire_exclusive(&__fs_reclaim_map, 0, 0, NULL, ip); } -void __fs_reclaim_release(void) +void __fs_reclaim_release(unsigned long ip) { - lock_map_release(&__fs_reclaim_map); + lock_release(&__fs_reclaim_map, ip); } void fs_reclaim_acquire(gfp_t gfp_mask) @@ -4565,7 +4565,7 @@ void fs_reclaim_acquire(gfp_t gfp_mask) if (__need_reclaim(gfp_mask)) { if (gfp_mask & __GFP_FS) - __fs_reclaim_acquire(); + __fs_reclaim_acquire(_RET_IP_); #ifdef CONFIG_MMU_NOTIFIER lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); @@ -4582,7 +4582,7 @@ void fs_reclaim_release(gfp_t gfp_mask) if (__need_reclaim(gfp_mask)) { if (gfp_mask & __GFP_FS) - __fs_reclaim_release(); + __fs_reclaim_release(_RET_IP_); } } EXPORT_SYMBOL_GPL(fs_reclaim_release); diff --git a/mm/vmscan.c b/mm/vmscan.c index eeae2f6bc5320..17c4b3fdd7dde 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3812,7 +3812,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) set_task_reclaim_state(current, &sc.reclaim_state); psi_memstall_enter(&pflags); - __fs_reclaim_acquire(); + __fs_reclaim_acquire(_THIS_IP_); count_vm_event(PAGEOUTRUN); @@ -3938,9 +3938,9 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) wake_up_all(&pgdat->pfmemalloc_wait); /* Check if kswapd should be suspending */ - __fs_reclaim_release(); + __fs_reclaim_release(_THIS_IP_); ret = try_to_freeze(); - __fs_reclaim_acquire(); + __fs_reclaim_acquire(_THIS_IP_); if (ret || kthread_should_stop()) break; @@ -3992,7 +3992,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) } snapshot_refaults(NULL, pgdat); - __fs_reclaim_release(); + __fs_reclaim_release(_THIS_IP_); psi_memstall_leave(&pflags); set_task_reclaim_state(current, NULL); From ddef359674445dc2229717475f459e6b705fa4ad Mon Sep 17 00:00:00 2001 From: liuhailong Date: Tue, 24 Aug 2021 09:59:06 +1000 Subject: [PATCH 066/365] mm: add kernel_misc_reclaimable in show_free_areas Print NR_KERNEL_MISC_RECLAIMABLE stat from show_free_areas() so users can check whether the shrinker is working correctly and to show the current memory usage. Link: https://lkml.kernel.org/r/20210813104725.4562-1-liuhailong@oppo.com Signed-off-by: liuhailong Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_alloc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 51c17bf7b127f..60a867a549bdc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5903,6 +5903,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) " unevictable:%lu dirty:%lu writeback:%lu\n" " slab_reclaimable:%lu slab_unreclaimable:%lu\n" " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" + " kernel_misc_reclaimable:%lu\n" " free:%lu free_pcp:%lu free_cma:%lu\n", global_node_page_state(NR_ACTIVE_ANON), global_node_page_state(NR_INACTIVE_ANON), @@ -5919,6 +5920,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) global_node_page_state(NR_SHMEM), global_node_page_state(NR_PAGETABLE), global_zone_page_state(NR_BOUNCE), + global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE), global_zone_page_state(NR_FREE_PAGES), free_pcp, global_zone_page_state(NR_FREE_CMA_PAGES)); From 21acac9b10d3646c64b9b064459e251f72237ce1 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 24 Aug 2021 09:59:06 +1000 Subject: [PATCH 067/365] mm: mark idle page tracking as BROKEN In discussion with other MM developers around how idle page tracking should be fixed for transparent huge pages, several expressed the opinion that it should be removed as it is inefficient at accomplishing the job that it is supposed to, and we have better mechanisms (eg uffd) for accomplishing the same goals these days. Mark the feature as BROKEN for now and we can remove it entirely in a few months if nobody complains. It is not enabled by Android, ChromeOS, Debian, Fedora or SUSE. Red Hat enabled it with RHEL-8.1 and UEK followed suit, but I have been unable to find why RHEL enabled it. Link: https://lkml.kernel.org/r/20210612000714.775825-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Yu Zhao Acked-by: Kirill A. Shutemov SeongJae Park Cc: Heiko Carstens Cc: Rafael Aquini Cc: Vlastimil Babka Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/Kconfig b/mm/Kconfig index 40a9bfcd5062e..5dc28e9205e00 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -744,7 +744,7 @@ config DEFERRED_STRUCT_PAGE_INIT config IDLE_PAGE_TRACKING bool "Enable idle page tracking" - depends on SYSFS && MMU + depends on SYSFS && MMU && BROKEN select PAGE_EXTENSION if !64BIT help This feature allows to estimate the amount of user pages that have From 1a26c8aaf2793e669eafaaa5776f14f104b20066 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 24 Aug 2021 09:59:06 +1000 Subject: [PATCH 068/365] writeback: track number of inodes under writeback Patch series "writeback: Fix bandwidth estimates", v4. Fix estimate of writeback throughput when device is not fully busy doing writeback. Michael Stapelberg has reported that such workload (e.g. generated by linking) tends to push estimated throughput down to 0 and as a result writeback on the device is practically stalled. The first three patches fix the reported issue, the remaining two patches are unrelated cleanups of problems I've noticed when reading the code. This patch (of 4): Track number of inodes under writeback for each bdi_writeback structure. We will use this to decide whether wb does any IO and so we can estimate its writeback throughput. In principle we could use number of pages under writeback (WB_WRITEBACK counter) for this however normal percpu counter reads are too inaccurate for our purposes and summing the counter is too expensive. Link: https://lkml.kernel.org/r/20210713104519.16394-1-jack@suse.cz Link: https://lkml.kernel.org/r/20210713104716.22868-1-jack@suse.cz Signed-off-by: Jan Kara Cc: Wu Fengguang Cc: Michael Stapelberg Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/fs-writeback.c | 5 +++++ include/linux/backing-dev-defs.h | 1 + mm/backing-dev.c | 1 + mm/page-writeback.c | 22 ++++++++++++++++++++-- 4 files changed, 27 insertions(+), 2 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 4c33705489825..7439ecd44ac9e 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -406,6 +406,11 @@ static bool inode_do_switch_wbs(struct inode *inode, inc_wb_stat(new_wb, WB_WRITEBACK); } + if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) { + atomic_dec(&old_wb->writeback_inodes); + atomic_inc(&new_wb->writeback_inodes); + } + wb_get(new_wb); /* diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 1d7edad9914fc..06fb8e13f6bc5 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -116,6 +116,7 @@ struct bdi_writeback { struct list_head b_dirty_time; /* time stamps are dirty */ spinlock_t list_lock; /* protects the b_* lists */ + atomic_t writeback_inodes; /* number of inodes under writeback */ struct percpu_counter stat[NR_WB_STAT_ITEMS]; unsigned long congested; /* WB_[a]sync_congested flags */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index f5561ea7d90ad..b4c707ddedb1b 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -293,6 +293,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, INIT_LIST_HEAD(&wb->b_dirty_time); spin_lock_init(&wb->list_lock); + atomic_set(&wb->writeback_inodes, 0); wb->bw_time_stamp = jiffies; wb->balanced_dirty_ratelimit = INIT_BW; wb->dirty_ratelimit = INIT_BW; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 9f63548f247c3..e1aa1c9d8e362 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2731,6 +2731,16 @@ int clear_page_dirty_for_io(struct page *page) } EXPORT_SYMBOL(clear_page_dirty_for_io); +static void wb_inode_writeback_start(struct bdi_writeback *wb) +{ + atomic_inc(&wb->writeback_inodes); +} + +static void wb_inode_writeback_end(struct bdi_writeback *wb) +{ + atomic_dec(&wb->writeback_inodes); +} + int test_clear_page_writeback(struct page *page) { struct address_space *mapping = page_mapping(page); @@ -2752,6 +2762,9 @@ int test_clear_page_writeback(struct page *page) dec_wb_stat(wb, WB_WRITEBACK); __wb_writeout_inc(wb); + if (!mapping_tagged(mapping, + PAGECACHE_TAG_WRITEBACK)) + wb_inode_writeback_end(wb); } } @@ -2794,8 +2807,13 @@ int __test_set_page_writeback(struct page *page, bool keep_write) PAGECACHE_TAG_WRITEBACK); xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK); - if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) - inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK); + if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) { + struct bdi_writeback *wb = inode_to_wb(inode); + + inc_wb_stat(wb, WB_WRITEBACK); + if (!on_wblist) + wb_inode_writeback_start(wb); + } /* * We can come through here when swapping anonymous From 4bd3905f5e13aa4fee56ea3a9adc2704c71a5865 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 24 Aug 2021 09:59:06 +1000 Subject: [PATCH 069/365] writeback: reliably update bandwidth estimation Currently we trigger writeback bandwidth estimation from balance_dirty_pages() and from wb_writeback(). However neither of these need to trigger when the system is relatively idle and writeback is triggered e.g. from fsync(2). Make sure writeback estimates happen reliably by triggering them from do_writepages(). Link: https://lkml.kernel.org/r/20210713104716.22868-2-jack@suse.cz Signed-off-by: Jan Kara Cc: Michael Stapelberg Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/fs-writeback.c | 3 --- include/linux/backing-dev.h | 19 ++++++++++++++++++ include/linux/writeback.h | 1 - mm/page-writeback.c | 39 +++++++++++++++++++++++++------------ 4 files changed, 46 insertions(+), 16 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 7439ecd44ac9e..867984e778c3d 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -2004,7 +2004,6 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, static long wb_writeback(struct bdi_writeback *wb, struct wb_writeback_work *work) { - unsigned long wb_start = jiffies; long nr_pages = work->nr_pages; unsigned long dirtied_before = jiffies; struct inode *inode; @@ -2058,8 +2057,6 @@ static long wb_writeback(struct bdi_writeback *wb, progress = __writeback_inodes_wb(wb, work); trace_writeback_written(wb, work); - wb_update_bandwidth(wb, wb_start); - /* * Did we write something? Try for more * diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 44df4fcef65c1..a5d7d625dcc6f 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -288,6 +288,17 @@ static inline struct bdi_writeback *inode_to_wb(const struct inode *inode) return inode->i_wb; } +static inline struct bdi_writeback *inode_to_wb_wbc( + struct inode *inode, + struct writeback_control *wbc) +{ + /* + * If wbc does not have inode attached, it means cgroup writeback was + * disabled when wbc started. Just use the default wb in that case. + */ + return wbc->wb ? wbc->wb : &inode_to_bdi(inode)->wb; +} + /** * unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction * @inode: target inode @@ -366,6 +377,14 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode) return &inode_to_bdi(inode)->wb; } +static inline struct bdi_writeback *inode_to_wb_wbc( + struct inode *inode, + struct writeback_control *wbc) +{ + return inode_to_wb(inode); +} + + static inline struct bdi_writeback * unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie) { diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 667e86cfbdcfe..2480322c06a76 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -379,7 +379,6 @@ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); -void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time); void balance_dirty_pages_ratelimited(struct address_space *mapping); bool wb_over_bg_thresh(struct bdi_writeback *wb); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index e1aa1c9d8e362..e4a381b8944b6 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1332,7 +1332,6 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc, struct dirty_throttle_control *mdtc, - unsigned long start_time, bool update_ratelimit) { struct bdi_writeback *wb = gdtc->wb; @@ -1352,13 +1351,6 @@ static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc, dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]); written = percpu_counter_read(&wb->stat[WB_WRITTEN]); - /* - * Skip quiet periods when disk bandwidth is under-utilized. - * (at least 1s idle time between two flusher runs) - */ - if (elapsed > HZ && time_before(wb->bw_time_stamp, start_time)) - goto snapshot; - if (update_ratelimit) { domain_update_bandwidth(gdtc, now); wb_update_dirty_ratelimit(gdtc, dirtied, elapsed); @@ -1374,17 +1366,36 @@ static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc, } wb_update_write_bandwidth(wb, elapsed, written); -snapshot: wb->dirtied_stamp = dirtied; wb->written_stamp = written; wb->bw_time_stamp = now; } -void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time) +static void wb_update_bandwidth(struct bdi_writeback *wb) { struct dirty_throttle_control gdtc = { GDTC_INIT(wb) }; - __wb_update_bandwidth(&gdtc, NULL, start_time, false); + spin_lock(&wb->list_lock); + __wb_update_bandwidth(&gdtc, NULL, false); + spin_unlock(&wb->list_lock); +} + +/* Interval after which we consider wb idle and don't estimate bandwidth */ +#define WB_BANDWIDTH_IDLE_JIF (HZ) + +static void wb_bandwidth_estimate_start(struct bdi_writeback *wb) +{ + unsigned long now = jiffies; + unsigned long elapsed = now - READ_ONCE(wb->bw_time_stamp); + + if (elapsed > WB_BANDWIDTH_IDLE_JIF && + !atomic_read(&wb->writeback_inodes)) { + spin_lock(&wb->list_lock); + wb->dirtied_stamp = wb_stat(wb, WB_DIRTIED); + wb->written_stamp = wb_stat(wb, WB_WRITTEN); + wb->bw_time_stamp = now; + spin_unlock(&wb->list_lock); + } } /* @@ -1713,7 +1724,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb, if (time_is_before_jiffies(wb->bw_time_stamp + BANDWIDTH_INTERVAL)) { spin_lock(&wb->list_lock); - __wb_update_bandwidth(gdtc, mdtc, start_time, true); + __wb_update_bandwidth(gdtc, mdtc, true); spin_unlock(&wb->list_lock); } @@ -2347,9 +2358,12 @@ EXPORT_SYMBOL(generic_writepages); int do_writepages(struct address_space *mapping, struct writeback_control *wbc) { int ret; + struct bdi_writeback *wb; if (wbc->nr_to_write <= 0) return 0; + wb = inode_to_wb_wbc(mapping->host, wbc); + wb_bandwidth_estimate_start(wb); while (1) { if (mapping->a_ops->writepages) ret = mapping->a_ops->writepages(mapping, wbc); @@ -2360,6 +2374,7 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc) cond_resched(); congestion_wait(BLK_RW_ASYNC, HZ/50); } + wb_update_bandwidth(wb); return ret; } From f2f3d3bd9aca7b8ad5718ec4d20f52a067b8a947 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 24 Aug 2021 09:59:06 +1000 Subject: [PATCH 070/365] writeback: fix bandwidth estimate for spiky workload Michael Stapelberg has reported that for workload with short big spikes of writes (GCC linker seem to trigger this frequently) the write throughput is heavily underestimated and tends to steadily sink until it reaches zero. This has rather bad impact on writeback throttling (causing stalls). The problem is that writeback throughput estimate gets updated at most once per 200 ms. One update happens early after we submit pages for writeback (at that point writeout of only small fraction of pages is completed and thus observed throughput is tiny). Next update happens only during the next write spike (updates happen only from inode writeback and dirty throttling code) and if that is more than 1s after previous spike, we decide system was idle and just ignore whatever was written until this moment. Fix the problem by making sure writeback throughput estimate is also updated shortly after writeback completes to get reasonable estimate of throughput for spiky workloads. Link: https://lore.kernel.org/lkml/20210617095309.3542373-1-stapelberg+linux@google.com Link: https://lkml.kernel.org/r/20210713104716.22868-3-jack@suse.cz Signed-off-by: Jan Kara Reported-by: Michael Stapelberg Tested-by: Michael Stapelberg Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/backing-dev-defs.h | 1 + include/linux/writeback.h | 1 + mm/backing-dev.c | 10 +++++++++ mm/page-writeback.c | 35 +++++++++++++++++--------------- 4 files changed, 31 insertions(+), 16 deletions(-) diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 06fb8e13f6bc5..33207004cfded 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -143,6 +143,7 @@ struct bdi_writeback { spinlock_t work_lock; /* protects work_list & dwork scheduling */ struct list_head work_list; struct delayed_work dwork; /* work item used for writeback */ + struct delayed_work bw_dwork; /* work item used for bandwidth estimate */ unsigned long dirty_sleep; /* last wait */ diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 2480322c06a76..cbaef099645ec 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -379,6 +379,7 @@ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); +void wb_update_bandwidth(struct bdi_writeback *wb); void balance_dirty_pages_ratelimited(struct address_space *mapping); bool wb_over_bg_thresh(struct bdi_writeback *wb); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index b4c707ddedb1b..6122c78ce9146 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -271,6 +271,14 @@ void wb_wakeup_delayed(struct bdi_writeback *wb) spin_unlock_bh(&wb->work_lock); } +static void wb_update_bandwidth_workfn(struct work_struct *work) +{ + struct bdi_writeback *wb = container_of(to_delayed_work(work), + struct bdi_writeback, bw_dwork); + + wb_update_bandwidth(wb); +} + /* * Initial write bandwidth: 100 MB/s */ @@ -303,6 +311,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, spin_lock_init(&wb->work_lock); INIT_LIST_HEAD(&wb->work_list); INIT_DELAYED_WORK(&wb->dwork, wb_workfn); + INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn); wb->dirty_sleep = jiffies; err = fprop_local_init_percpu(&wb->completions, gfp); @@ -351,6 +360,7 @@ static void wb_shutdown(struct bdi_writeback *wb) mod_delayed_work(bdi_wq, &wb->dwork, 0); flush_delayed_work(&wb->dwork); WARN_ON(!list_empty(&wb->work_list)); + flush_delayed_work(&wb->bw_dwork); } static void wb_exit(struct bdi_writeback *wb) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index e4a381b8944b6..eb55c8882db01 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1340,14 +1340,7 @@ static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc, unsigned long dirtied; unsigned long written; - lockdep_assert_held(&wb->list_lock); - - /* - * rate-limit, only update once every 200ms. - */ - if (elapsed < BANDWIDTH_INTERVAL) - return; - + spin_lock(&wb->list_lock); dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]); written = percpu_counter_read(&wb->stat[WB_WRITTEN]); @@ -1369,15 +1362,14 @@ static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc, wb->dirtied_stamp = dirtied; wb->written_stamp = written; wb->bw_time_stamp = now; + spin_unlock(&wb->list_lock); } -static void wb_update_bandwidth(struct bdi_writeback *wb) +void wb_update_bandwidth(struct bdi_writeback *wb) { struct dirty_throttle_control gdtc = { GDTC_INIT(wb) }; - spin_lock(&wb->list_lock); __wb_update_bandwidth(&gdtc, NULL, false); - spin_unlock(&wb->list_lock); } /* Interval after which we consider wb idle and don't estimate bandwidth */ @@ -1722,11 +1714,8 @@ static void balance_dirty_pages(struct bdi_writeback *wb, wb->dirty_exceeded = 1; if (time_is_before_jiffies(wb->bw_time_stamp + - BANDWIDTH_INTERVAL)) { - spin_lock(&wb->list_lock); + BANDWIDTH_INTERVAL)) __wb_update_bandwidth(gdtc, mdtc, true); - spin_unlock(&wb->list_lock); - } /* throttle according to the chosen dtc */ dirty_ratelimit = wb->dirty_ratelimit; @@ -2374,7 +2363,13 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc) cond_resched(); congestion_wait(BLK_RW_ASYNC, HZ/50); } - wb_update_bandwidth(wb); + /* + * Usually few pages are written by now from those we've just submitted + * but if there's constant writeback being submitted, this makes sure + * writeback bandwidth is updated once in a while. + */ + if (time_is_before_jiffies(wb->bw_time_stamp + BANDWIDTH_INTERVAL)) + wb_update_bandwidth(wb); return ret; } @@ -2754,6 +2749,14 @@ static void wb_inode_writeback_start(struct bdi_writeback *wb) static void wb_inode_writeback_end(struct bdi_writeback *wb) { atomic_dec(&wb->writeback_inodes); + /* + * Make sure estimate of writeback throughput gets updated after + * writeback completed. We delay the update by BANDWIDTH_INTERVAL + * (which is the interval other bandwidth updates use for batching) so + * that if multiple inodes end writeback at a similar time, they get + * batched into one bandwidth update. + */ + queue_delayed_work(bdi_wq, &wb->bw_dwork, BANDWIDTH_INTERVAL); } int test_clear_page_writeback(struct page *page) From 71c78c869a25efbb515a19a545a29ef15bc2f8df Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 24 Aug 2021 09:59:07 +1000 Subject: [PATCH 071/365] writeback: avoid division by 0 in wb_update_dirty_ratelimit() Fixup patch "writeback: Fix bandwidth estimate for spiky workload" which introduced possibility of __wb_update_bandwidth() getting called at a moment when 'elapsed' evaluates to 0. Cc: Jan Kara Cc: Michael Stapelberg Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page-writeback.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index eb55c8882db01..156f5888c09da 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1336,11 +1336,19 @@ static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc, { struct bdi_writeback *wb = gdtc->wb; unsigned long now = jiffies; - unsigned long elapsed = now - wb->bw_time_stamp; + unsigned long elapsed; unsigned long dirtied; unsigned long written; spin_lock(&wb->list_lock); + + /* + * Lockless checks for elapsed time are racy and delayed update after + * IO completion doesn't do it at all (to make sure written pages are + * accounted reasonably quickly). Make sure elapsed >= 1 to avoid + * division errors. + */ + elapsed = max(now - wb->bw_time_stamp, 1UL); dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]); written = percpu_counter_read(&wb->stat[WB_WRITTEN]); From dca30501d55714ed99a4defab9b507f8f5a8f160 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 24 Aug 2021 09:59:07 +1000 Subject: [PATCH 072/365] writeback: rename domain_update_bandwidth() Rename domain_update_bandwidth() to domain_update_dirty_limit(). The original name is a misnomer. The function has nothing to do with a bandwidth, it updates dirty limits. Link: https://lkml.kernel.org/r/20210713104716.22868-4-jack@suse.cz Signed-off-by: Jan Kara Cc: Michael Stapelberg Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page-writeback.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 156f5888c09da..7d8ad4260147a 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1147,8 +1147,8 @@ static void update_dirty_limit(struct dirty_throttle_control *dtc) dom->dirty_limit = limit; } -static void domain_update_bandwidth(struct dirty_throttle_control *dtc, - unsigned long now) +static void domain_update_dirty_limit(struct dirty_throttle_control *dtc, + unsigned long now) { struct wb_domain *dom = dtc_dom(dtc); @@ -1353,7 +1353,7 @@ static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc, written = percpu_counter_read(&wb->stat[WB_WRITTEN]); if (update_ratelimit) { - domain_update_bandwidth(gdtc, now); + domain_update_dirty_limit(gdtc, now); wb_update_dirty_ratelimit(gdtc, dirtied, elapsed); /* @@ -1361,7 +1361,7 @@ static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc, * compiler has no way to figure that out. Help it. */ if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) { - domain_update_bandwidth(mdtc, now); + domain_update_dirty_limit(mdtc, now); wb_update_dirty_ratelimit(mdtc, dirtied, elapsed); } } From 40264e1e5ed4074cb3cacede4404259948de7738 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 24 Aug 2021 09:59:07 +1000 Subject: [PATCH 073/365] writeback: use READ_ONCE for unlocked reads of writeback stats We do some unlocked reads of writeback statistics like avg_write_bandwidth, dirty_ratelimit, or bw_time_stamp. Generally we are fine with getting somewhat out-of-date values but actually getting different values in various parts of the functions because the compiler decided to reload value from original memory location could confuse calculations. Use READ_ONCE for these unlocked accesses and WRITE_ONCE for the updates to be on the safe side. Link: https://lkml.kernel.org/r/20210713104716.22868-5-jack@suse.cz Signed-off-by: Jan Kara Cc: Michael Stapelberg Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page-writeback.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7d8ad4260147a..c3b00c6f30ce4 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -183,7 +183,7 @@ static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb) static void wb_min_max_ratio(struct bdi_writeback *wb, unsigned long *minp, unsigned long *maxp) { - unsigned long this_bw = wb->avg_write_bandwidth; + unsigned long this_bw = READ_ONCE(wb->avg_write_bandwidth); unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth); unsigned long long min = wb->bdi->min_ratio; unsigned long long max = wb->bdi->max_ratio; @@ -892,7 +892,7 @@ static long long pos_ratio_polynom(unsigned long setpoint, static void wb_position_ratio(struct dirty_throttle_control *dtc) { struct bdi_writeback *wb = dtc->wb; - unsigned long write_bw = wb->avg_write_bandwidth; + unsigned long write_bw = READ_ONCE(wb->avg_write_bandwidth); unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh); unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh); unsigned long wb_thresh = dtc->wb_thresh; @@ -1115,7 +1115,7 @@ static void wb_update_write_bandwidth(struct bdi_writeback *wb, &wb->bdi->tot_write_bandwidth) <= 0); } wb->write_bandwidth = bw; - wb->avg_write_bandwidth = avg; + WRITE_ONCE(wb->avg_write_bandwidth, avg); } static void update_dirty_limit(struct dirty_throttle_control *dtc) @@ -1324,7 +1324,7 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, else dirty_ratelimit -= step; - wb->dirty_ratelimit = max(dirty_ratelimit, 1UL); + WRITE_ONCE(wb->dirty_ratelimit, max(dirty_ratelimit, 1UL)); wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit; trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit); @@ -1369,7 +1369,7 @@ static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc, wb->dirtied_stamp = dirtied; wb->written_stamp = written; - wb->bw_time_stamp = now; + WRITE_ONCE(wb->bw_time_stamp, now); spin_unlock(&wb->list_lock); } @@ -1393,7 +1393,7 @@ static void wb_bandwidth_estimate_start(struct bdi_writeback *wb) spin_lock(&wb->list_lock); wb->dirtied_stamp = wb_stat(wb, WB_DIRTIED); wb->written_stamp = wb_stat(wb, WB_WRITTEN); - wb->bw_time_stamp = now; + WRITE_ONCE(wb->bw_time_stamp, now); spin_unlock(&wb->list_lock); } } @@ -1418,7 +1418,7 @@ static unsigned long dirty_poll_interval(unsigned long dirty, static unsigned long wb_max_pause(struct bdi_writeback *wb, unsigned long wb_dirty) { - unsigned long bw = wb->avg_write_bandwidth; + unsigned long bw = READ_ONCE(wb->avg_write_bandwidth); unsigned long t; /* @@ -1440,8 +1440,8 @@ static long wb_min_pause(struct bdi_writeback *wb, unsigned long dirty_ratelimit, int *nr_dirtied_pause) { - long hi = ilog2(wb->avg_write_bandwidth); - long lo = ilog2(wb->dirty_ratelimit); + long hi = ilog2(READ_ONCE(wb->avg_write_bandwidth)); + long lo = ilog2(READ_ONCE(wb->dirty_ratelimit)); long t; /* target pause */ long pause; /* estimated next pause */ int pages; /* target nr_dirtied_pause */ @@ -1721,12 +1721,12 @@ static void balance_dirty_pages(struct bdi_writeback *wb, if (dirty_exceeded && !wb->dirty_exceeded) wb->dirty_exceeded = 1; - if (time_is_before_jiffies(wb->bw_time_stamp + + if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) + BANDWIDTH_INTERVAL)) __wb_update_bandwidth(gdtc, mdtc, true); /* throttle according to the chosen dtc */ - dirty_ratelimit = wb->dirty_ratelimit; + dirty_ratelimit = READ_ONCE(wb->dirty_ratelimit); task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >> RATELIMIT_CALC_SHIFT; max_pause = wb_max_pause(wb, sdtc->wb_dirty); @@ -2376,7 +2376,8 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc) * but if there's constant writeback being submitted, this makes sure * writeback bandwidth is updated once in a while. */ - if (time_is_before_jiffies(wb->bw_time_stamp + BANDWIDTH_INTERVAL)) + if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) + + BANDWIDTH_INTERVAL)) wb_update_bandwidth(wb); return ret; } From b54b9a0d3771e2d9ad3c5aae33957a4a3ef480d8 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 24 Aug 2021 09:59:07 +1000 Subject: [PATCH 074/365] mm: remove irqsave/restore locking from contexts with irqs enabled The page cache deletion paths all have interrupts enabled, so no need to use irqsafe/irqrestore locking variants. They used to have irqs disabled by the memcg lock added in commit c4843a7593a9 ("memcg: add per cgroup dirty page accounting"), but that has since been replaced by memcg taking the page lock instead, commit 0a31bc97c80c ("mm: memcontrol: rewrite uncharge AP"). Link: https://lkml.kernel.org/r/20210614211904.14420-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/filemap.c | 15 ++++++--------- mm/truncate.c | 8 +++----- mm/vmscan.c | 9 ++++----- 3 files changed, 13 insertions(+), 19 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 34de0b14aaa90..3032fcdf326dd 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -258,12 +258,11 @@ static void page_cache_free_page(struct address_space *mapping, void delete_from_page_cache(struct page *page) { struct address_space *mapping = page_mapping(page); - unsigned long flags; BUG_ON(!PageLocked(page)); - xa_lock_irqsave(&mapping->i_pages, flags); + xa_lock_irq(&mapping->i_pages); __delete_from_page_cache(page, NULL); - xa_unlock_irqrestore(&mapping->i_pages, flags); + xa_unlock_irq(&mapping->i_pages); page_cache_free_page(mapping, page); } @@ -335,19 +334,18 @@ void delete_from_page_cache_batch(struct address_space *mapping, struct pagevec *pvec) { int i; - unsigned long flags; if (!pagevec_count(pvec)) return; - xa_lock_irqsave(&mapping->i_pages, flags); + xa_lock_irq(&mapping->i_pages); for (i = 0; i < pagevec_count(pvec); i++) { trace_mm_filemap_delete_from_page_cache(pvec->pages[i]); unaccount_page_cache_page(mapping, pvec->pages[i]); } page_cache_delete_batch(mapping, pvec); - xa_unlock_irqrestore(&mapping->i_pages, flags); + xa_unlock_irq(&mapping->i_pages); for (i = 0; i < pagevec_count(pvec); i++) page_cache_free_page(mapping, pvec->pages[i]); @@ -821,7 +819,6 @@ void replace_page_cache_page(struct page *old, struct page *new) void (*freepage)(struct page *) = mapping->a_ops->freepage; pgoff_t offset = old->index; XA_STATE(xas, &mapping->i_pages, offset); - unsigned long flags; VM_BUG_ON_PAGE(!PageLocked(old), old); VM_BUG_ON_PAGE(!PageLocked(new), new); @@ -833,7 +830,7 @@ void replace_page_cache_page(struct page *old, struct page *new) mem_cgroup_migrate(old, new); - xas_lock_irqsave(&xas, flags); + xas_lock_irq(&xas); xas_store(&xas, new); old->mapping = NULL; @@ -846,7 +843,7 @@ void replace_page_cache_page(struct page *old, struct page *new) __dec_lruvec_page_state(old, NR_SHMEM); if (PageSwapBacked(new)) __inc_lruvec_page_state(new, NR_SHMEM); - xas_unlock_irqrestore(&xas, flags); + xas_unlock_irq(&xas); if (freepage) freepage(old); put_page(old); diff --git a/mm/truncate.c b/mm/truncate.c index 234ddd879caa1..2adff8f800bbc 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -560,21 +560,19 @@ void invalidate_mapping_pagevec(struct address_space *mapping, static int invalidate_complete_page2(struct address_space *mapping, struct page *page) { - unsigned long flags; - if (page->mapping != mapping) return 0; if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return 0; - xa_lock_irqsave(&mapping->i_pages, flags); + xa_lock_irq(&mapping->i_pages); if (PageDirty(page)) goto failed; BUG_ON(page_has_private(page)); __delete_from_page_cache(page, NULL); - xa_unlock_irqrestore(&mapping->i_pages, flags); + xa_unlock_irq(&mapping->i_pages); if (mapping->a_ops->freepage) mapping->a_ops->freepage(page); @@ -582,7 +580,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) put_page(page); /* pagecache ref */ return 1; failed: - xa_unlock_irqrestore(&mapping->i_pages, flags); + xa_unlock_irq(&mapping->i_pages); return 0; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 17c4b3fdd7dde..268ad6570751b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1052,14 +1052,13 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) static int __remove_mapping(struct address_space *mapping, struct page *page, bool reclaimed, struct mem_cgroup *target_memcg) { - unsigned long flags; int refcount; void *shadow = NULL; BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); - xa_lock_irqsave(&mapping->i_pages, flags); + xa_lock_irq(&mapping->i_pages); /* * The non racy check for a busy page. * @@ -1100,7 +1099,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, if (reclaimed && !mapping_exiting(mapping)) shadow = workingset_eviction(page, target_memcg); __delete_from_swap_cache(page, swap, shadow); - xa_unlock_irqrestore(&mapping->i_pages, flags); + xa_unlock_irq(&mapping->i_pages); put_swap_page(page, swap); } else { void (*freepage)(struct page *); @@ -1126,7 +1125,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, !mapping_exiting(mapping) && !dax_mapping(mapping)) shadow = workingset_eviction(page, target_memcg); __delete_from_page_cache(page, shadow); - xa_unlock_irqrestore(&mapping->i_pages, flags); + xa_unlock_irq(&mapping->i_pages); if (freepage != NULL) freepage(page); @@ -1135,7 +1134,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, return 1; cannot_free: - xa_unlock_irqrestore(&mapping->i_pages, flags); + xa_unlock_irq(&mapping->i_pages); return 0; } From 2e0fb5261bcfc236de8d828512d7d4c04fa89779 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 24 Aug 2021 09:59:07 +1000 Subject: [PATCH 075/365] fs: drop_caches: fix skipping over shadow cache inodes When drop_caches truncates the page cache in an inode it also includes any shadow entries for evicted pages. However, there is a preliminary check on whether the inode has pages: if it has *only* shadow entries, it will skip running truncation on the inode and leave it behind. Fix the check to mapping_empty(), such that it runs truncation on any inode that has cache entries at all. Link: https://lkml.kernel.org/r/20210614211904.14420-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reported-by: Roman Gushchin Acked-by: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/drop_caches.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/drop_caches.c b/fs/drop_caches.c index f00fcc4a4f721..e619c31b6bd92 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -3,6 +3,7 @@ * Implement the manual drop-all-pagecache function */ +#include #include #include #include @@ -27,7 +28,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) * we need to reschedule to avoid softlockups. */ if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || - (inode->i_mapping->nrpages == 0 && !need_resched())) { + (mapping_empty(inode->i_mapping) && !need_resched())) { spin_unlock(&inode->i_lock); continue; } From 006057f1bcfd89ac9655e8547427c5c3fd887f20 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 24 Aug 2021 09:59:08 +1000 Subject: [PATCH 076/365] fs: inode: count invalidated shadow pages in pginodesteal pginodesteal is supposed to capture the impact that inode reclaim has on the page cache state. Currently, it doesn't consider shadow pages that get dropped this way, even though this can have a significant impact on paging behavior, memory pressure calculations etc. To improve visibility into these effects, make sure shadow pages get counted when they get dropped through inode reclaim. This changes the return value semantics of invalidate_mapping_pages() semantics slightly, but the only two users are the inode shrinker itsel and a usb driver that logs it for debugging purposes. Link: https://lkml.kernel.org/r/20210614211904.14420-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/inode.c | 2 +- mm/truncate.c | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index c93500d84264d..8830a727b0af0 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -768,7 +768,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item, return LRU_ROTATE; } - if (inode_has_buffers(inode) || inode->i_data.nrpages) { + if (inode_has_buffers(inode) || !mapping_empty(&inode->i_data)) { __iget(inode); spin_unlock(&inode->i_lock); spin_unlock(lru_lock); diff --git a/mm/truncate.c b/mm/truncate.c index 2adff8f800bbc..787b35f2cdc16 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -483,8 +483,9 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping, index = indices[i]; if (xa_is_value(page)) { - invalidate_exceptional_entry(mapping, index, - page); + count += invalidate_exceptional_entry(mapping, + index, + page); continue; } index += thp_nr_pages(page) - 1; @@ -512,19 +513,18 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping, } /** - * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode - * @mapping: the address_space which holds the pages to invalidate + * invalidate_mapping_pages - Invalidate all clean, unlocked cache of one inode + * @mapping: the address_space which holds the cache to invalidate * @start: the offset 'from' which to invalidate * @end: the offset 'to' which to invalidate (inclusive) * - * This function only removes the unlocked pages, if you want to - * remove all the pages of one inode, you must call truncate_inode_pages. + * This function removes pages that are clean, unmapped and unlocked, + * as well as shadow entries. It will not block on IO activity. * - * invalidate_mapping_pages() will not block on IO activity. It will not - * invalidate pages which are dirty, locked, under writeback or mapped into - * pagetables. + * If you want to remove all the pages of one inode, regardless of + * their use and writeback state, use truncate_inode_pages(). * - * Return: the number of the pages that were invalidated + * Return: the number of the cache entries that were invalidated */ unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end) From 56d657271ff6380d58be0100ef1e40a5e176ff8c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 24 Aug 2021 09:59:08 +1000 Subject: [PATCH 077/365] vfs: keep inodes with page cache off the inode shrinker LRU Historically (pre-2.5), the inode shrinker used to reclaim only empty inodes and skip over those that still contained page cache. This caused problems on highmem hosts: struct inode could put fill lowmem zones before the cache was getting reclaimed in the highmem zones. To address this, the inode shrinker started to strip page cache to facilitate reclaiming lowmem. However, this comes with its own set of problems: the shrinkers may drop actively used page cache just because the inodes are not currently open or dirty - think working with a large git tree. It further doesn't respect cgroup memory protection settings and can cause priority inversions between containers. Nowadays, the page cache also holds non-resident info for evicted cache pages in order to detect refaults. We've come to rely heavily on this data inside reclaim for protecting the cache workingset and driving swap behavior. We also use it to quantify and report workload health through psi. The latter in turn is used for fleet health monitoring, as well as driving automated memory sizing of workloads and containers, proactive reclaim and memory offloading schemes. The consequences of dropping page cache prematurely is that we're seeing subtle and not-so-subtle failures in all of the above-mentioned scenarios, with the workload generally entering unexpected thrashing states while losing the ability to reliably detect it. To fix this on non-highmem systems at least, going back to rotating inodes on the LRU isn't feasible. We've tried (commit a76cf1a474d7 ("mm: don't reclaim inodes with many attached pages")) and failed (commit 69056ee6a8a3 ("Revert "mm: don't reclaim inodes with many attached pages"")). The issue is mostly that shrinker pools attract pressure based on their size, and when objects get skipped the shrinkers remember this as deferred reclaim work. This accumulates excessive pressure on the remaining inodes, and we can quickly eat into heavily used ones, or dirty ones that require IO to reclaim, when there potentially is plenty of cold, clean cache around still. Instead, this patch keeps populated inodes off the inode LRU in the first place - just like an open file or dirty state would. An otherwise clean and unused inode then gets queued when the last cache entry disappears. This solves the problem without reintroducing the reclaim issues, and generally is a bit more scalable than having to wade through potentially hundreds of thousands of busy inodes. Locking is a bit tricky because the locks protecting the inode state (i_lock) and the inode LRU (lru_list.lock) don't nest inside the irq-safe page cache lock (i_pages.xa_lock). Page cache deletions are serialized through i_lock, taken before the i_pages lock, to make sure depopulated inodes are queued reliably. Additions may race with deletions, but we'll check again in the shrinker. If additions race with the shrinker itself, we're protected by the i_lock: if find_inode() or iput() win, the shrinker will bail on the elevated i_count or I_REFERENCED; if the shrinker wins and goes ahead with the inode, it will set I_FREEING and inhibit further igets(), which will cause the other side to create a new instance of the inode instead. Link: https://lkml.kernel.org/r/20210614211904.14420-4-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Cc: Roman Gushchin Cc: Tejun Heo Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/inode.c | 46 +++++++++++++++++++++---------------- fs/internal.h | 1 - include/linux/fs.h | 1 + include/linux/pagemap.h | 50 +++++++++++++++++++++++++++++++++++++++++ mm/filemap.c | 8 +++++++ mm/truncate.c | 19 ++++++++++++++-- mm/vmscan.c | 7 ++++++ mm/workingset.c | 10 +++++++++ 8 files changed, 120 insertions(+), 22 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index 8830a727b0af0..6b74701c19547 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -424,11 +424,20 @@ void ihold(struct inode *inode) } EXPORT_SYMBOL(ihold); -static void inode_lru_list_add(struct inode *inode) +static void __inode_add_lru(struct inode *inode, bool rotate) { + if (inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE)) + return; + if (atomic_read(&inode->i_count)) + return; + if (!(inode->i_sb->s_flags & SB_ACTIVE)) + return; + if (!mapping_shrinkable(&inode->i_data)) + return; + if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru)) this_cpu_inc(nr_unused); - else + else if (rotate) inode->i_state |= I_REFERENCED; } @@ -439,16 +448,11 @@ static void inode_lru_list_add(struct inode *inode) */ void inode_add_lru(struct inode *inode) { - if (!(inode->i_state & (I_DIRTY_ALL | I_SYNC | - I_FREEING | I_WILL_FREE)) && - !atomic_read(&inode->i_count) && inode->i_sb->s_flags & SB_ACTIVE) - inode_lru_list_add(inode); + __inode_add_lru(inode, false); } - static void inode_lru_list_del(struct inode *inode) { - if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru)) this_cpu_dec(nr_unused); } @@ -724,10 +728,6 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) /* * Isolate the inode from the LRU in preparation for freeing it. * - * Any inodes which are pinned purely because of attached pagecache have their - * pagecache removed. If the inode has metadata buffers attached to - * mapping->private_list then try to remove them. - * * If the inode has the I_REFERENCED flag set, then it means that it has been * used recently - the flag is set in iput_final(). When we encounter such an * inode, clear the flag and move it to the back of the LRU so it gets another @@ -743,31 +743,39 @@ static enum lru_status inode_lru_isolate(struct list_head *item, struct inode *inode = container_of(item, struct inode, i_lru); /* - * we are inverting the lru lock/inode->i_lock here, so use a trylock. - * If we fail to get the lock, just skip it. + * We are inverting the lru lock/inode->i_lock here, so use a + * trylock. If we fail to get the lock, just skip it. */ if (!spin_trylock(&inode->i_lock)) return LRU_SKIP; /* - * Referenced or dirty inodes are still in use. Give them another pass - * through the LRU as we canot reclaim them now. + * Inodes can get referenced, redirtied, or repopulated while + * they're already on the LRU, and this can make them + * unreclaimable for a while. Remove them lazily here; iput, + * sync, or the last page cache deletion will requeue them. */ if (atomic_read(&inode->i_count) || - (inode->i_state & ~I_REFERENCED)) { + (inode->i_state & ~I_REFERENCED) || + !mapping_shrinkable(&inode->i_data)) { list_lru_isolate(lru, &inode->i_lru); spin_unlock(&inode->i_lock); this_cpu_dec(nr_unused); return LRU_REMOVED; } - /* recently referenced inodes get one more pass */ + /* Recently referenced inodes get one more pass */ if (inode->i_state & I_REFERENCED) { inode->i_state &= ~I_REFERENCED; spin_unlock(&inode->i_lock); return LRU_ROTATE; } + /* + * On highmem systems, mapping_shrinkable() permits dropping + * page cache in order to free up struct inodes: lowmem might + * be under pressure before the cache inside the highmem zone. + */ if (inode_has_buffers(inode) || !mapping_empty(&inode->i_data)) { __iget(inode); spin_unlock(&inode->i_lock); @@ -1634,7 +1642,7 @@ static void iput_final(struct inode *inode) if (!drop && !(inode->i_state & I_DONTCACHE) && (sb->s_flags & SB_ACTIVE)) { - inode_add_lru(inode); + __inode_add_lru(inode, true); spin_unlock(&inode->i_lock); return; } diff --git a/fs/internal.h b/fs/internal.h index 82e8eb32ff3dd..693a88430da65 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -145,7 +145,6 @@ extern int vfs_open(const struct path *, struct file *); * inode.c */ extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); -extern void inode_add_lru(struct inode *inode); extern int dentry_needs_remove_privs(struct dentry *dentry); /* diff --git a/include/linux/fs.h b/include/linux/fs.h index 640574294216c..8057699362df8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3219,6 +3219,7 @@ static inline void remove_inode_hash(struct inode *inode) } extern void inode_sb_list_add(struct inode *inode); +extern void inode_add_lru(struct inode *inode); extern int sb_set_blocksize(struct super_block *, int); extern int sb_min_blocksize(struct super_block *, int); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index ed02aa5222634..b071babc66299 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -23,6 +23,56 @@ static inline bool mapping_empty(struct address_space *mapping) return xa_empty(&mapping->i_pages); } +/* + * mapping_shrinkable - test if page cache state allows inode reclaim + * @mapping: the page cache mapping + * + * This checks the mapping's cache state for the pupose of inode + * reclaim and LRU management. + * + * The caller is expected to hold the i_lock, but is not required to + * hold the i_pages lock, which usually protects cache state. That's + * because the i_lock and the list_lru lock that protect the inode and + * its LRU state don't nest inside the irq-safe i_pages lock. + * + * Cache deletions are performed under the i_lock, which ensures that + * when an inode goes empty, it will reliably get queued on the LRU. + * + * Cache additions do not acquire the i_lock and may race with this + * check, in which case we'll report the inode as shrinkable when it + * has cache pages. This is okay: the shrinker also checks the + * refcount and the referenced bit, which will be elevated or set in + * the process of adding new cache pages to an inode. + */ +static inline bool mapping_shrinkable(struct address_space *mapping) +{ + void *head; + + /* + * On highmem systems, there could be lowmem pressure from the + * inodes before there is highmem pressure from the page + * cache. Make inodes shrinkable regardless of cache state. + */ + if (IS_ENABLED(CONFIG_HIGHMEM)) + return true; + + /* Cache completely empty? Shrink away. */ + head = rcu_access_pointer(mapping->i_pages.xa_head); + if (!head) + return true; + + /* + * The xarray stores single offset-0 entries directly in the + * head pointer, which allows non-resident page cache entries + * to escape the shadow shrinker's list of xarray nodes. The + * inode shrinker needs to pick them up under memory pressure. + */ + if (!xa_is_node(head) && xa_is_value(head)) + return true; + + return false; +} + /* * Bits in mapping->flags. */ diff --git a/mm/filemap.c b/mm/filemap.c index 3032fcdf326dd..c2fd42d1e81be 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -260,9 +260,13 @@ void delete_from_page_cache(struct page *page) struct address_space *mapping = page_mapping(page); BUG_ON(!PageLocked(page)); + spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); __delete_from_page_cache(page, NULL); xa_unlock_irq(&mapping->i_pages); + if (mapping_shrinkable(mapping)) + inode_add_lru(mapping->host); + spin_unlock(&mapping->host->i_lock); page_cache_free_page(mapping, page); } @@ -338,6 +342,7 @@ void delete_from_page_cache_batch(struct address_space *mapping, if (!pagevec_count(pvec)) return; + spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); for (i = 0; i < pagevec_count(pvec); i++) { trace_mm_filemap_delete_from_page_cache(pvec->pages[i]); @@ -346,6 +351,9 @@ void delete_from_page_cache_batch(struct address_space *mapping, } page_cache_delete_batch(mapping, pvec); xa_unlock_irq(&mapping->i_pages); + if (mapping_shrinkable(mapping)) + inode_add_lru(mapping->host); + spin_unlock(&mapping->host->i_lock); for (i = 0; i < pagevec_count(pvec); i++) page_cache_free_page(mapping, pvec->pages[i]); diff --git a/mm/truncate.c b/mm/truncate.c index 787b35f2cdc16..9cc0a638c6447 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -45,9 +45,13 @@ static inline void __clear_shadow_entry(struct address_space *mapping, static void clear_shadow_entry(struct address_space *mapping, pgoff_t index, void *entry) { + spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); __clear_shadow_entry(mapping, index, entry); xa_unlock_irq(&mapping->i_pages); + if (mapping_shrinkable(mapping)) + inode_add_lru(mapping->host); + spin_unlock(&mapping->host->i_lock); } /* @@ -73,8 +77,10 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping, return; dax = dax_mapping(mapping); - if (!dax) + if (!dax) { + spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); + } for (i = j; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; @@ -93,8 +99,12 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping, __clear_shadow_entry(mapping, index, page); } - if (!dax) + if (!dax) { xa_unlock_irq(&mapping->i_pages); + if (mapping_shrinkable(mapping)) + inode_add_lru(mapping->host); + spin_unlock(&mapping->host->i_lock); + } pvec->nr = j; } @@ -566,6 +576,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return 0; + spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); if (PageDirty(page)) goto failed; @@ -573,6 +584,9 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) BUG_ON(page_has_private(page)); __delete_from_page_cache(page, NULL); xa_unlock_irq(&mapping->i_pages); + if (mapping_shrinkable(mapping)) + inode_add_lru(mapping->host); + spin_unlock(&mapping->host->i_lock); if (mapping->a_ops->freepage) mapping->a_ops->freepage(page); @@ -581,6 +595,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) return 1; failed: xa_unlock_irq(&mapping->i_pages); + spin_unlock(&mapping->host->i_lock); return 0; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 268ad6570751b..888e284418c07 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1058,6 +1058,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); + if (!PageSwapCache(page)) + spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); /* * The non racy check for a busy page. @@ -1126,6 +1128,9 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, shadow = workingset_eviction(page, target_memcg); __delete_from_page_cache(page, shadow); xa_unlock_irq(&mapping->i_pages); + if (mapping_shrinkable(mapping)) + inode_add_lru(mapping->host); + spin_unlock(&mapping->host->i_lock); if (freepage != NULL) freepage(page); @@ -1135,6 +1140,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, cannot_free: xa_unlock_irq(&mapping->i_pages); + if (!PageSwapCache(page)) + spin_unlock(&mapping->host->i_lock); return 0; } diff --git a/mm/workingset.c b/mm/workingset.c index 5ba3e42446fa6..cf8bb34fb321b 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -542,6 +542,13 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, goto out; } + if (!spin_trylock(&mapping->host->i_lock)) { + xa_unlock(&mapping->i_pages); + spin_unlock_irq(lru_lock); + ret = LRU_RETRY; + goto out; + } + list_lru_isolate(lru, item); __dec_lruvec_kmem_state(node, WORKINGSET_NODES); @@ -561,6 +568,9 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, out_invalid: xa_unlock_irq(&mapping->i_pages); + if (mapping_shrinkable(mapping)) + inode_add_lru(mapping->host); + spin_unlock(&mapping->host->i_lock); ret = LRU_REMOVED_RETRY; out: cond_resched(); From e9eb0348e0f1031b9e3f77d2be8a6ef1c6b3520e Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Tue, 24 Aug 2021 09:59:08 +1000 Subject: [PATCH 078/365] writeback: memcg: simplify cgroup_writeback_by_id Currently cgroup_writeback_by_id calls mem_cgroup_wb_stats() to get dirty pages for a memcg. However mem_cgroup_wb_stats() does a lot more than just get the number of dirty pages. Just directly get the number of dirty pages instead of calling mem_cgroup_wb_stats(). Also cgroup_writeback_by_id() is only called for best-effort dirty flushing, so remove the unused 'nr' parameter and no need to explicitly flush memcg stats. Link: https://lkml.kernel.org/r/20210722182627.2267368-1-shakeelb@google.com Signed-off-by: Shakeel Butt Reviewed-by: Jan Kara Cc: Tejun Heo Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/fs-writeback.c | 20 +++++++++----------- include/linux/memcontrol.h | 15 +++++++++++++++ include/linux/writeback.h | 2 +- mm/memcontrol.c | 13 +------------ 4 files changed, 26 insertions(+), 24 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 867984e778c3d..35894a2dba756 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1039,20 +1039,20 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, * cgroup_writeback_by_id - initiate cgroup writeback from bdi and memcg IDs * @bdi_id: target bdi id * @memcg_id: target memcg css id - * @nr: number of pages to write, 0 for best-effort dirty flushing * @reason: reason why some writeback work initiated * @done: target wb_completion * * Initiate flush of the bdi_writeback identified by @bdi_id and @memcg_id * with the specified parameters. */ -int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr, +int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, enum wb_reason reason, struct wb_completion *done) { struct backing_dev_info *bdi; struct cgroup_subsys_state *memcg_css; struct bdi_writeback *wb; struct wb_writeback_work *work; + unsigned long dirty; int ret; /* lookup bdi and memcg */ @@ -1081,24 +1081,22 @@ int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr, } /* - * If @nr is zero, the caller is attempting to write out most of + * The caller is attempting to write out most of * the currently dirty pages. Let's take the current dirty page * count and inflate it by 25% which should be large enough to * flush out most dirty pages while avoiding getting livelocked by * concurrent dirtiers. + * + * BTW the memcg stats are flushed periodically and this is best-effort + * estimation, so some potential error is ok. */ - if (!nr) { - unsigned long filepages, headroom, dirty, writeback; - - mem_cgroup_wb_stats(wb, &filepages, &headroom, &dirty, - &writeback); - nr = dirty * 10 / 8; - } + dirty = memcg_page_state(mem_cgroup_from_css(memcg_css), NR_FILE_DIRTY); + dirty = dirty * 10 / 8; /* issue the writeback work */ work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN); if (work) { - work->nr_pages = nr; + work->nr_pages = dirty; work->sync_mode = WB_SYNC_NONE; work->range_cyclic = 1; work->reason = reason; diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 24797929d8a1f..3403ec77528ac 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -955,6 +955,16 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg, local_irq_restore(flags); } +static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) +{ + long x = READ_ONCE(memcg->vmstats.state[idx]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; +} + static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) { @@ -1391,6 +1401,11 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg, { } +static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) +{ + return 0; +} + static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) { diff --git a/include/linux/writeback.h b/include/linux/writeback.h index cbaef099645ec..aeda2c0c9986a 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -218,7 +218,7 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc, void wbc_detach_inode(struct writeback_control *wbc); void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, size_t bytes); -int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr_pages, +int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, enum wb_reason reason, struct wb_completion *done); void cgroup_writeback_umount(void); bool cleanup_offline_cgwb(struct bdi_writeback *wb); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 702a81dfe72dc..1047f0271ff8a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -645,17 +645,6 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id()); } -/* idx can be of type enum memcg_stat_item or node_stat_item. */ -static unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) -{ - long x = READ_ONCE(memcg->vmstats.state[idx]); -#ifdef CONFIG_SMP - if (x < 0) - x = 0; -#endif - return x; -} - /* idx can be of type enum memcg_stat_item or node_stat_item. */ static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx) { @@ -4668,7 +4657,7 @@ void mem_cgroup_flush_foreign(struct bdi_writeback *wb) atomic_read(&frn->done.cnt) == 1) { frn->at = 0; trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id); - cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0, + cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, WB_REASON_FOREIGN_FLUSH, &frn->done); } From 4192487b16c54f147bb1c232942400317e27d410 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 24 Aug 2021 09:59:08 +1000 Subject: [PATCH 079/365] mm: gup: remove set but unused local variable major Patch series "Cleanups and fixup for gup". This series contains cleanups to remove unneeded variable, useless BUG_ON and use helper to improve readability. Also we fix a potential pgmap refcnt leak. More details can be found in the respective changelogs. This patch (of 5): Since commit a2beb5f1efed ("mm: clean up the last pieces of page fault accountings"), the local variable major is unused. Remove it. Link: https://lkml.kernel.org/r/20210807093620.21347-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20210807093620.21347-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: John Hubbard Reviewed-by: David Hildenbrand Reviewed-by: Claudio Imbrenda Cc: Jan Kara Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/gup.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index b94717977d178..2532ec89f61b3 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1276,7 +1276,7 @@ int fixup_user_fault(struct mm_struct *mm, bool *unlocked) { struct vm_area_struct *vma; - vm_fault_t ret, major = 0; + vm_fault_t ret; address = untagged_addr(address); @@ -1296,7 +1296,6 @@ int fixup_user_fault(struct mm_struct *mm, return -EINTR; ret = handle_mm_fault(vma, address, fault_flags, NULL); - major |= ret & VM_FAULT_MAJOR; if (ret & VM_FAULT_ERROR) { int err = vm_fault_to_errno(ret, 0); From 63a40a64f8150b9943117aac7299f8921bdbcef6 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 24 Aug 2021 09:59:08 +1000 Subject: [PATCH 080/365] mm: gup: remove unneed local variable orig_refs Remove unneed local variable orig_refs since refs is unchanged now. Link: https://lkml.kernel.org/r/20210807093620.21347-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: John Hubbard Reviewed-by: Claudio Imbrenda Reviewed-by: David Hildenbrand Cc: Jan Kara Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/gup.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 2532ec89f61b3..82074e819a8d4 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -117,8 +117,6 @@ __maybe_unused struct page *try_grab_compound_head(struct page *page, if (flags & FOLL_GET) return try_get_compound_head(page, refs); else if (flags & FOLL_PIN) { - int orig_refs = refs; - /* * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a * right zone, so fail and let the caller fall back to the slow @@ -150,7 +148,7 @@ __maybe_unused struct page *try_grab_compound_head(struct page *page, page_ref_add(page, refs * (GUP_PIN_COUNTING_BIAS - 1)); mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED, - orig_refs); + refs); return page; } From f879764f3de6b4b933c72b8d003952e7bd248774 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 24 Aug 2021 09:59:09 +1000 Subject: [PATCH 081/365] mm: gup: remove useless BUG_ON in __get_user_pages() Indeed, this BUG_ON couldn't catch anything useful. We are sure ret == 0 here because we would already bail out if ret != 0 and ret is untouched till here. Link: https://lkml.kernel.org/r/20210807093620.21347-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: John Hubbard Reviewed-by: David Hildenbrand Reviewed-by: Claudio Imbrenda Cc: Jan Kara Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/gup.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/gup.c b/mm/gup.c index 82074e819a8d4..9f5b9a93ae21f 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1149,7 +1149,6 @@ static long __get_user_pages(struct mm_struct *mm, * We must stop here. */ BUG_ON(gup_flags & FOLL_NOWAIT); - BUG_ON(ret != 0); goto out; } continue; From 7912ac5e0be88bbd9b4dc997588e04b42a890642 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 24 Aug 2021 09:59:09 +1000 Subject: [PATCH 082/365] mm: gup: fix potential pgmap refcnt leak in __gup_device_huge() When failed to try_grab_page, put_dev_pagemap() is missed. So pgmap refcnt will leak in this case. Also we remove the check for pgmap against NULL as it's also checked inside the put_dev_pagemap(). Link: https://lkml.kernel.org/r/20210807093620.21347-5-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Fixes: 3faa52c03f44 ("mm/gup: track FOLL_PIN pages") Reviewed-by: John Hubbard Reviewed-by: Claudio Imbrenda Cc: Jan Kara Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/gup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 9f5b9a93ae21f..94f886562688c 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2253,14 +2253,14 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr, pages[*nr] = page; if (unlikely(!try_grab_page(page, flags))) { undo_dev_pagemap(nr, nr_start, flags, pages); + put_dev_pagemap(pgmap); return 0; } (*nr)++; pfn++; } while (addr += PAGE_SIZE, addr != end); - if (pgmap) - put_dev_pagemap(pgmap); + put_dev_pagemap(pgmap); return 1; } From 7c989419b81e353467dc489bb8185a075874862b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 24 Aug 2021 09:59:09 +1000 Subject: [PATCH 083/365] mm-gup-fix-potential-pgmap-refcnt-leak-in-__gup_device_huge-fix simplify, cleanup Cc: Miaohe Lin Cc: Claudio Imbrenda Cc: Kirill A. Shutemov Cc: Jan Kara Cc: John Hubbard Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/gup.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 94f886562688c..41fd86ec362c0 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2247,14 +2247,13 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr, pgmap = get_dev_pagemap(pfn, pgmap); if (unlikely(!pgmap)) { undo_dev_pagemap(nr, nr_start, flags, pages); - return 0; + break; } SetPageReferenced(page); pages[*nr] = page; if (unlikely(!try_grab_page(page, flags))) { undo_dev_pagemap(nr, nr_start, flags, pages); - put_dev_pagemap(pgmap); - return 0; + break; } (*nr)++; pfn++; From 77dc947e0e0fc039abd6a19c36dcce75003e5b37 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 24 Aug 2021 09:59:09 +1000 Subject: [PATCH 084/365] mm-gup-fix-potential-pgmap-refcnt-leak-in-__gup_device_huge-fix-fix fix return value Reviewed-by: John Hubbard Cc: Claudio Imbrenda Cc: Jan Kara Cc: Kirill A. Shutemov Cc: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/gup.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/gup.c b/mm/gup.c index 41fd86ec362c0..a10c48ef613c9 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2240,6 +2240,7 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr, { int nr_start = *nr; struct dev_pagemap *pgmap = NULL; + int ret = 1; do { struct page *page = pfn_to_page(pfn); @@ -2247,12 +2248,14 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr, pgmap = get_dev_pagemap(pfn, pgmap); if (unlikely(!pgmap)) { undo_dev_pagemap(nr, nr_start, flags, pages); + ret = 0; break; } SetPageReferenced(page); pages[*nr] = page; if (unlikely(!try_grab_page(page, flags))) { undo_dev_pagemap(nr, nr_start, flags, pages); + ret = 0; break; } (*nr)++; @@ -2260,7 +2263,7 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr, } while (addr += PAGE_SIZE, addr != end); put_dev_pagemap(pgmap); - return 1; + return ret; } static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, From 092792c3d8f24253b5507656378f57b7da7fbd2b Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 24 Aug 2021 09:59:09 +1000 Subject: [PATCH 085/365] mm: gup: use helper PAGE_ALIGNED in populate_vma_page_range() Use helper PAGE_ALIGNED to check if address is aligned to PAGE_SIZE. Minor readability improvement. Link: https://lkml.kernel.org/r/20210807093620.21347-6-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: John Hubbard Reviewed-by: David Hildenbrand Reviewed-by: Claudio Imbrenda Cc: Jan Kara Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/gup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index a10c48ef613c9..1973bfe59b2ed 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1471,8 +1471,8 @@ long populate_vma_page_range(struct vm_area_struct *vma, unsigned long nr_pages = (end - start) / PAGE_SIZE; int gup_flags; - VM_BUG_ON(start & ~PAGE_MASK); - VM_BUG_ON(end & ~PAGE_MASK); + VM_BUG_ON(!PAGE_ALIGNED(start)); + VM_BUG_ON(!PAGE_ALIGNED(end)); VM_BUG_ON_VMA(start < vma->vm_start, vma); VM_BUG_ON_VMA(end > vma->vm_end, vma); mmap_assert_locked(mm); From 4898d4388f16bfba42f2c07eebcb04a777c10da5 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Tue, 24 Aug 2021 09:59:10 +1000 Subject: [PATCH 086/365] mm/gup: documentation corrections for gup/pup Patch series "A few gup refactorings and documentation updates", v3. While reviewing some of the other things going on around gup.c, I noticed that the documentation was wrong for a few of the routines that I wrote. And then I noticed that there was some significant code duplication too. So this fixes those issues. This is not entirely risk-free, but after looking closely at this, I think it's actually a useful improvement, getting rid of the code duplication here. This patch (of 3): The documentation for try_grab_compound_head() and try_grab_page() has fallen a little out of date. Update and clarify a few points. Also make it kerneldoc-correct, by adding @args documentation. Link: https://lkml.kernel.org/r/20210813044133.1536842-1-jhubbard@nvidia.com Link: https://lkml.kernel.org/r/20210813044133.1536842-2-jhubbard@nvidia.com Signed-off-by: John Hubbard Cc: Matthew Wilcox Cc: Christoph Hellwig Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Christian Borntraeger Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/gup.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 1973bfe59b2ed..26ce6bb52044d 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -92,10 +92,17 @@ static inline struct page *try_get_compound_head(struct page *page, int refs) return head; } -/* +/** * try_grab_compound_head() - attempt to elevate a page's refcount, by a * flags-dependent amount. * + * Even though the name includes "compound_head", this function is still + * appropriate for callers that have a non-compound @page to get. + * + * @page: pointer to page to be grabbed + * @refs: the value to (effectively) add to the page's refcount + * @flags: gup flags: these are the FOLL_* flag values. + * * "grab" names in this file mean, "look at flags to decide whether to use * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount. * @@ -103,8 +110,14 @@ static inline struct page *try_get_compound_head(struct page *page, int refs) * same time. (That's true throughout the get_user_pages*() and * pin_user_pages*() APIs.) Cases: * - * FOLL_GET: page's refcount will be incremented by 1. - * FOLL_PIN: page's refcount will be incremented by GUP_PIN_COUNTING_BIAS. + * FOLL_GET: page's refcount will be incremented by @refs. + * + * FOLL_PIN on compound pages that are > two pages long: page's refcount will + * be incremented by @refs, and page[2].hpage_pinned_refcount will be + * incremented by @refs * GUP_PIN_COUNTING_BIAS. + * + * FOLL_PIN on normal pages, or compound pages that are two pages long: + * page's refcount will be incremented by @refs * GUP_PIN_COUNTING_BIAS. * * Return: head page (with refcount appropriately incremented) for success, or * NULL upon failure. If neither FOLL_GET nor FOLL_PIN was set, that's @@ -141,6 +154,8 @@ __maybe_unused struct page *try_grab_compound_head(struct page *page, * * However, be sure to *also* increment the normal page refcount * field at least once, so that the page really is pinned. + * That's why the refcount from the earlier + * try_get_compound_head() is left intact. */ if (hpage_pincount_available(page)) hpage_pincount_add(page, refs); @@ -184,10 +199,8 @@ static void put_compound_head(struct page *page, int refs, unsigned int flags) * @flags: gup flags: these are the FOLL_* flag values. * * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same - * time. Cases: - * - * FOLL_GET: page's refcount will be incremented by 1. - * FOLL_PIN: page's refcount will be incremented by GUP_PIN_COUNTING_BIAS. + * time. Cases: please see the try_grab_compound_head() documentation, with + * "refs=1". * * Return: true for success, or if no action was required (if neither FOLL_PIN * nor FOLL_GET was set, nothing is done). False for failure: FOLL_GET or From 5da0b9c43c4e8f53e6e281a313ade16dffeec6b4 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Tue, 24 Aug 2021 09:59:10 +1000 Subject: [PATCH 087/365] mm/gup: small refactoring: simplify try_grab_page() try_grab_page() does the same thing as try_grab_compound_head(..., refs=1, ...), just with a different API. So there is a lot of code duplication there. Change try_grab_page() to call try_grab_compound_head(), while keeping the API contract identical for callers. Also, now that try_grab_compound_head() always has a caller, remove the __maybe_unused annotation. Link: https://lkml.kernel.org/r/20210813044133.1536842-3-jhubbard@nvidia.com Signed-off-by: John Hubbard Reviewed-by: Christoph Hellwig Cc: Matthew Wilcox Cc: Christian Borntraeger Cc: Heiko Carstens Cc: Vasily Gorbik Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/mm.h | 4 ++-- mm/gup.c | 35 +++++------------------------------ 2 files changed, 7 insertions(+), 32 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 7ca22e6e694a8..af4845e81b843 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1214,8 +1214,8 @@ static inline void get_page(struct page *page) } bool __must_check try_grab_page(struct page *page, unsigned int flags); -__maybe_unused struct page *try_grab_compound_head(struct page *page, int refs, - unsigned int flags); +struct page *try_grab_compound_head(struct page *page, int refs, + unsigned int flags); static inline __must_check bool try_get_page(struct page *page) diff --git a/mm/gup.c b/mm/gup.c index 26ce6bb52044d..d60419ed9262f 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -124,8 +124,8 @@ static inline struct page *try_get_compound_head(struct page *page, int refs) * considered failure, and furthermore, a likely bug in the caller, so a warning * is also emitted. */ -__maybe_unused struct page *try_grab_compound_head(struct page *page, - int refs, unsigned int flags) +struct page *try_grab_compound_head(struct page *page, + int refs, unsigned int flags) { if (flags & FOLL_GET) return try_get_compound_head(page, refs); @@ -208,35 +208,10 @@ static void put_compound_head(struct page *page, int refs, unsigned int flags) */ bool __must_check try_grab_page(struct page *page, unsigned int flags) { - WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN)); + if (!(flags & (FOLL_GET | FOLL_PIN))) + return true; - if (flags & FOLL_GET) - return try_get_page(page); - else if (flags & FOLL_PIN) { - int refs = 1; - - page = compound_head(page); - - if (WARN_ON_ONCE(page_ref_count(page) <= 0)) - return false; - - if (hpage_pincount_available(page)) - hpage_pincount_add(page, 1); - else - refs = GUP_PIN_COUNTING_BIAS; - - /* - * Similar to try_grab_compound_head(): even if using the - * hpage_pincount_add/_sub() routines, be sure to - * *also* increment the normal page refcount field at least - * once, so that the page really is pinned. - */ - page_ref_add(page, refs); - - mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED, 1); - } - - return true; + return try_grab_compound_head(page, 1, flags); } /** From 524bda3f3ac82009d8ab7ae8d0f7b480712094ad Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Tue, 24 Aug 2021 09:59:10 +1000 Subject: [PATCH 088/365] mm/gup: remove try_get_page(), call try_get_compound_head() directly try_get_page() is very similar to try_get_compound_head(), and in fact try_get_page() has fallen a little behind in terms of maintenance: try_get_compound_head() handles speculative page references more thoroughly. There are only two try_get_page() callsites, so just call try_get_compound_head() directly from those, and remove try_get_page() entirely. Also, seeing as how this changes try_get_compound_head() into a non-static function, provide some kerneldoc documentation for it. Link: https://lkml.kernel.org/r/20210813044133.1536842-4-jhubbard@nvidia.com Signed-off-by: John Hubbard Reviewed-by: Christoph Hellwig Cc: Matthew Wilcox Cc: Christian Borntraeger Cc: Heiko Carstens Cc: Vasily Gorbik Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/s390/mm/fault.c | 2 +- fs/pipe.c | 2 +- include/linux/mm.h | 10 +--------- mm/gup.c | 21 +++++++++++++++++---- 4 files changed, 20 insertions(+), 15 deletions(-) diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index e33c43b38afe0..81d7607499879 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -817,7 +817,7 @@ void do_secure_storage_access(struct pt_regs *regs) break; case KERNEL_FAULT: page = phys_to_page(addr); - if (unlikely(!try_get_page(page))) + if (unlikely(!try_get_compound_head(page, 1))) break; rc = arch_make_page_accessible(page); put_page(page); diff --git a/fs/pipe.c b/fs/pipe.c index 678dee2a8228d..687d1c2aabd6c 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -191,7 +191,7 @@ EXPORT_SYMBOL(generic_pipe_buf_try_steal); */ bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { - return try_get_page(buf->page); + return try_get_compound_head(buf->page, 1); } EXPORT_SYMBOL(generic_pipe_buf_get); diff --git a/include/linux/mm.h b/include/linux/mm.h index af4845e81b843..d3439dd4f4bab 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1217,15 +1217,7 @@ bool __must_check try_grab_page(struct page *page, unsigned int flags); struct page *try_grab_compound_head(struct page *page, int refs, unsigned int flags); - -static inline __must_check bool try_get_page(struct page *page) -{ - page = compound_head(page); - if (WARN_ON_ONCE(page_ref_count(page) <= 0)) - return false; - page_ref_inc(page); - return true; -} +struct page *try_get_compound_head(struct page *page, int refs); static inline void put_page(struct page *page) { diff --git a/mm/gup.c b/mm/gup.c index d60419ed9262f..76ac930f1dd11 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -62,11 +62,24 @@ static void put_page_refs(struct page *page, int refs) put_page(page); } -/* - * Return the compound head page with ref appropriately incremented, - * or NULL if that failed. +/** + * try_get_compound_head() - return the compound head page with refcount + * appropriately incremented, or NULL if that failed. + * + * This handles potential refcount overflow correctly. It also works correclty + * for various lockless get_user_pages()-related callers, due to the use of + * page_cache_add_speculative(). + * + * Even though the name includes "compound_head", this function is still + * appropriate for callers that have a non-compound @page to get. + * + * @page: pointer to page to be gotten + * @refs: the value to add to the page's refcount + * + * Return: head page (with refcount appropriately incremented) for success, or + * NULL upon failure. */ -static inline struct page *try_get_compound_head(struct page *page, int refs) +struct page *try_get_compound_head(struct page *page, int refs) { struct page *head = compound_head(page); From de5aa46c1d3dbce676ed95e71279279010bb7940 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 24 Aug 2021 09:59:10 +1000 Subject: [PATCH 089/365] fs, mm: fix race in unlinking swapfile We had a recurring situation in which admin procedures setting up swapfiles would race with test preparation clearing away swapfiles; and just occasionally that got stuck on a swapfile "(deleted)" which could never be swapped off. That is not supposed to be possible. 2.6.28 commit f9454548e17c ("don't unlink an active swapfile") admitted that it was leaving a race window open: now close it. may_delete() makes the IS_SWAPFILE check (amongst many others) before inode_lock has been taken on target: now repeat just that simple check in vfs_unlink() and vfs_rename(), after taking inode_lock. Which goes most of the way to fixing the race, but swapon() must also check after it acquires inode_lock, that the file just opened has not already been unlinked. Link: https://lkml.kernel.org/r/e17b91ad-a578-9a15-5e3-4989e0f999b5@google.com Fixes: f9454548e17c ("don't unlink an active swapfile") Signed-off-by: Hugh Dickins Reviewed-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/namei.c | 8 +++++++- mm/swapfile.c | 6 ++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/fs/namei.c b/fs/namei.c index bf6d8a738c599..ff866c07f4d2b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4024,7 +4024,9 @@ int vfs_unlink(struct user_namespace *mnt_userns, struct inode *dir, return -EPERM; inode_lock(target); - if (is_local_mountpoint(dentry)) + if (IS_SWAPFILE(target)) + error = -EPERM; + else if (is_local_mountpoint(dentry)) error = -EBUSY; else { error = security_inode_unlink(dir, dentry); @@ -4526,6 +4528,10 @@ int vfs_rename(struct renamedata *rd) else if (target) inode_lock(target); + error = -EPERM; + if (IS_SWAPFILE(source) || (target && IS_SWAPFILE(target))) + goto out; + error = -EBUSY; if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry)) goto out; diff --git a/mm/swapfile.c b/mm/swapfile.c index 1e07d1c776f2a..7527afd95284e 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3130,6 +3130,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) struct filename *name; struct file *swap_file = NULL; struct address_space *mapping; + struct dentry *dentry; int prio; int error; union swap_header *swap_header; @@ -3173,6 +3174,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) p->swap_file = swap_file; mapping = swap_file->f_mapping; + dentry = swap_file->f_path.dentry; inode = mapping->host; error = claim_swapfile(p, inode); @@ -3180,6 +3182,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) goto bad_swap; inode_lock(inode); + if (d_unlinked(dentry) || cant_mount(dentry)) { + error = -ENOENT; + goto bad_swap_unlock_inode; + } if (IS_SWAPFILE(inode)) { error = -EBUSY; goto bad_swap_unlock_inode; From e28294a90b2a602807d19f385fd7c1a1deac6933 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Tue, 24 Aug 2021 09:59:10 +1000 Subject: [PATCH 090/365] mm: delete unused get_kernel_page() get_kernel_page() was added in 2012 by [1]. It was used for a while for NFS, but then in 2014, a refactoring [2] removed all callers, and it has apparently not been used since. Remove get_kernel_page() because it has no callers. [1] commit 18022c5d8627 ("mm: add get_kernel_page[s] for pinning of kernel addresses for I/O") [2] commit 91f79c43d1b5 ("new helper: iov_iter_get_pages_alloc()") Link: https://lkml.kernel.org/r/20210729221847.1165665-1-jhubbard@nvidia.com Signed-off-by: John Hubbard Reviewed-by: Christoph Hellwig Reviewed-by: David Hildenbrand Cc: Mel Gorman Cc: Rik van Riel Cc: David S. Miller Cc: Eric B Munson Cc: Eric Paris Cc: James Morris Cc: Mike Christie Cc: Neil Brown Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Trond Myklebust Cc: Xiaotian Feng Cc: Mark Salter Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/mm.h | 1 - mm/swap.c | 22 ---------------------- 2 files changed, 23 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index d3439dd4f4bab..35bbac32b6f69 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1839,7 +1839,6 @@ int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, struct kvec; int get_kernel_pages(const struct kvec *iov, int nr_pages, int write, struct page **pages); -int get_kernel_page(unsigned long start, int write, struct page **pages); struct page *get_dump_page(unsigned long addr); extern int try_to_release_page(struct page * page, gfp_t gfp_mask); diff --git a/mm/swap.c b/mm/swap.c index 19600430e536b..897200d27dd07 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -179,28 +179,6 @@ int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write, } EXPORT_SYMBOL_GPL(get_kernel_pages); -/* - * get_kernel_page() - pin a kernel page in memory - * @start: starting kernel address - * @write: pinning for read/write, currently ignored - * @pages: array that receives pointer to the page pinned. - * Must be at least nr_segs long. - * - * Returns 1 if page is pinned. If the page was not pinned, returns - * -errno. The page returned must be released with a put_page() call - * when it is finished with. - */ -int get_kernel_page(unsigned long start, int write, struct page **pages) -{ - const struct kvec kiov = { - .iov_base = (void *)start, - .iov_len = PAGE_SIZE - }; - - return get_kernel_pages(&kiov, 1, write, pages); -} -EXPORT_SYMBOL_GPL(get_kernel_page); - static void pagevec_lru_move_fn(struct pagevec *pvec, void (*move_fn)(struct page *page, struct lruvec *lruvec)) { From 055daea20f227766a680c27094e1885178820ae7 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 24 Aug 2021 09:59:11 +1000 Subject: [PATCH 091/365] shmem: use raw_spinlock_t for ->stat_lock Each CPU has SHMEM_INO_BATCH inodes available in `->ino_batch' which is per-CPU. Access here is serialized by disabling preemption. If the pool is empty, it gets reloaded from `->next_ino'. Access here is serialized by ->stat_lock which is a spinlock_t and can not be acquired with disabled preemption. One way around it would make per-CPU ino_batch struct containing the inode number a local_lock_t. Another solution is to promote ->stat_lock to a raw_spinlock_t. The critical sections are short. The mpol_put() must be moved outside of the critical section to avoid invoking the destructor with disabled preemption. Link: https://lkml.kernel.org/r/20210806142916.jdwkb5bx62q5fwfo@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/shmem_fs.h | 2 +- mm/shmem.c | 31 +++++++++++++++++-------------- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 8e775ce517bb3..0a8499fb9c3cb 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -31,7 +31,7 @@ struct shmem_sb_info { struct percpu_counter used_blocks; /* How many are allocated */ unsigned long max_inodes; /* How many inodes are allowed */ unsigned long free_inodes; /* How many are left for allocation */ - spinlock_t stat_lock; /* Serialize shmem_sb_info changes */ + raw_spinlock_t stat_lock; /* Serialize shmem_sb_info changes */ umode_t mode; /* Mount mode for root directory */ unsigned char huge; /* Whether to try for hugepages */ kuid_t uid; /* Mount uid for root directory */ diff --git a/mm/shmem.c b/mm/shmem.c index dacda7463d549..7eec13e39ac3b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -278,10 +278,10 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop) ino_t ino; if (!(sb->s_flags & SB_KERNMOUNT)) { - spin_lock(&sbinfo->stat_lock); + raw_spin_lock(&sbinfo->stat_lock); if (sbinfo->max_inodes) { if (!sbinfo->free_inodes) { - spin_unlock(&sbinfo->stat_lock); + raw_spin_unlock(&sbinfo->stat_lock); return -ENOSPC; } sbinfo->free_inodes--; @@ -304,7 +304,7 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop) } *inop = ino; } - spin_unlock(&sbinfo->stat_lock); + raw_spin_unlock(&sbinfo->stat_lock); } else if (inop) { /* * __shmem_file_setup, one of our callers, is lock-free: it @@ -319,13 +319,14 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop) * to worry about things like glibc compatibility. */ ino_t *next_ino; + next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu()); ino = *next_ino; if (unlikely(ino % SHMEM_INO_BATCH == 0)) { - spin_lock(&sbinfo->stat_lock); + raw_spin_lock(&sbinfo->stat_lock); ino = sbinfo->next_ino; sbinfo->next_ino += SHMEM_INO_BATCH; - spin_unlock(&sbinfo->stat_lock); + raw_spin_unlock(&sbinfo->stat_lock); if (unlikely(is_zero_ino(ino))) ino++; } @@ -341,9 +342,9 @@ static void shmem_free_inode(struct super_block *sb) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); if (sbinfo->max_inodes) { - spin_lock(&sbinfo->stat_lock); + raw_spin_lock(&sbinfo->stat_lock); sbinfo->free_inodes++; - spin_unlock(&sbinfo->stat_lock); + raw_spin_unlock(&sbinfo->stat_lock); } } @@ -1453,10 +1454,10 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) { struct mempolicy *mpol = NULL; if (sbinfo->mpol) { - spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */ + raw_spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */ mpol = sbinfo->mpol; mpol_get(mpol); - spin_unlock(&sbinfo->stat_lock); + raw_spin_unlock(&sbinfo->stat_lock); } return mpol; } @@ -3488,9 +3489,10 @@ static int shmem_reconfigure(struct fs_context *fc) struct shmem_options *ctx = fc->fs_private; struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb); unsigned long inodes; + struct mempolicy *mpol = NULL; const char *err; - spin_lock(&sbinfo->stat_lock); + raw_spin_lock(&sbinfo->stat_lock); inodes = sbinfo->max_inodes - sbinfo->free_inodes; if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) { if (!sbinfo->max_blocks) { @@ -3535,14 +3537,15 @@ static int shmem_reconfigure(struct fs_context *fc) * Preserve previous mempolicy unless mpol remount option was specified. */ if (ctx->mpol) { - mpol_put(sbinfo->mpol); + mpol = sbinfo->mpol; sbinfo->mpol = ctx->mpol; /* transfers initial ref */ ctx->mpol = NULL; } - spin_unlock(&sbinfo->stat_lock); + raw_spin_unlock(&sbinfo->stat_lock); + mpol_put(mpol); return 0; out: - spin_unlock(&sbinfo->stat_lock); + raw_spin_unlock(&sbinfo->stat_lock); return invalfc(fc, "%s", err); } @@ -3659,7 +3662,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) sbinfo->mpol = ctx->mpol; ctx->mpol = NULL; - spin_lock_init(&sbinfo->stat_lock); + raw_spin_lock_init(&sbinfo->stat_lock); if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL)) goto failed; spin_lock_init(&sbinfo->shrinklist_lock); From 416528dd9b4fca366e7bf215ef500bedb2a11399 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 24 Aug 2021 09:59:11 +1000 Subject: [PATCH 092/365] shmem: remove unneeded variable ret Patch series "Cleanups for shmem". This series contains cleanups to remove unneeded variable, header file, function forward declaration and so on. More details can be found in the respective changelogs. This patch (of 4): The local variable ret is always equal to -ENOMEM and never touched. So remove it and return -ENOMEM directly to simplify the code. Link: https://lkml.kernel.org/r/20210812120350.49801-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20210812120350.49801-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/shmem.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 7eec13e39ac3b..e28d2593ec271 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3616,7 +3616,6 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) struct shmem_options *ctx = fc->fs_private; struct inode *inode; struct shmem_sb_info *sbinfo; - int err = -ENOMEM; /* Round up to L1_CACHE_BYTES to resist false sharing */ sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info), @@ -3694,7 +3693,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) failed: shmem_put_super(sb); - return err; + return -ENOMEM; } static int shmem_get_tree(struct fs_context *fc) From 49e40c4ed00878313f5f5f28460e75cbfbb5f0fc Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 24 Aug 2021 09:59:11 +1000 Subject: [PATCH 093/365] shmem: remove unneeded header file mfill_atomic_install_pte() is introduced to install pte and update mmu cache since commit bf6ebd97aba0 ("userfaultfd/shmem: modify shmem_mfill_atomic_pte to use install_pte()"). So we should remove tlbflush.h as update_mmu_cache() is not called here now. Link: https://lkml.kernel.org/r/20210812120350.49801-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/shmem.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index e28d2593ec271..c69adf9a2db33 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -39,8 +39,6 @@ #include #include -#include /* for arch/microblaze update_mmu_cache() */ - static struct vfsmount *shm_mnt; #ifdef CONFIG_SHMEM From c5da862beb8f694b55c4cd9ae617c9407ccf6224 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 24 Aug 2021 09:59:11 +1000 Subject: [PATCH 094/365] shmem: remove unneeded function forward declaration The forward declaration for shmem_should_replace_page() and shmem_replace_page() is unnecessary. Remove them. Link: https://lkml.kernel.org/r/20210812120350.49801-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/shmem.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index c69adf9a2db33..c4087f70c86cb 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -135,9 +135,6 @@ static unsigned long shmem_default_max_inodes(void) } #endif -static bool shmem_should_replace_page(struct page *page, gfp_t gfp); -static int shmem_replace_page(struct page **pagep, gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index); static int shmem_swapin_page(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp, gfp_t gfp, struct vm_area_struct *vma, From a7244343784bcb823f0165e629b33a7f4b92ccea Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 24 Aug 2021 09:59:11 +1000 Subject: [PATCH 095/365] shmem: include header file to declare swap_info It's bad to extern swap_info[] in .c. Include corresponding header file instead. Link: https://lkml.kernel.org/r/20210812120350.49801-5-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/shmem.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index c4087f70c86cb..f841c7adb8b2d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -38,6 +38,7 @@ #include #include #include +#include static struct vfsmount *shm_mnt; @@ -1152,8 +1153,6 @@ static void shmem_evict_inode(struct inode *inode) clear_inode(inode); } -extern struct swap_info_struct *swap_info[]; - static int shmem_find_swap_entries(struct address_space *mapping, pgoff_t start, unsigned int nr_entries, struct page **entries, pgoff_t *indices, From 21c544e04c01f6a9f3b424b8a6e886d5e069fc99 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 24 Aug 2021 09:59:12 +1000 Subject: [PATCH 096/365] huge tmpfs: fix fallocate(vanilla) advance over huge pages Patch series "huge tmpfs: shmem_is_huge() fixes and cleanups". A series of huge tmpfs fixes and cleanups. This patch (of 9): shmem_fallocate() goes to a lot of trouble to leave its newly allocated pages !Uptodate, partly to identify and undo them on failure, partly to leave the overhead of clearing them until later. But the huge page case did not skip to the end of the extent, walked through the tail pages one by one, and appeared to work just fine: but in doing so, cleared and Uptodated the huge page, so there was no way to undo it on failure. And by setting Uptodate too soon, it messed up both its nr_falloced and nr_unswapped counts, so that the intended "time to give up" heuristic did not work at all. Now advance immediately to the end of the huge extent, with a comment on why this is more than just an optimization. But although this speeds up huge tmpfs fallocation, it does leave the clearing until first use, and some users may have come to appreciate slow fallocate but fast first use: if they complain, then we can consider adding a pass to clear at the end. Link: https://lkml.kernel.org/r/da632211-8e3e-6b1-aee-ab24734429a0@google.com Link: https://lkml.kernel.org/r/16201bd2-70e-37e2-e89b-5f929430da@google.com Fixes: 800d8c63b2e9 ("shmem: add huge pages support") Signed-off-by: Hugh Dickins Reviewed-by: Yang Shi Cc: Shakeel Butt Cc: "Kirill A. Shutemov" Cc: Miaohe Lin Cc: Mike Kravetz Cc: Michal Hocko Cc: Rik van Riel Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/shmem.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index f841c7adb8b2d..9ef579f6cab33 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2719,7 +2719,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, inode->i_private = &shmem_falloc; spin_unlock(&inode->i_lock); - for (index = start; index < end; index++) { + for (index = start; index < end; ) { struct page *page; /* @@ -2742,13 +2742,26 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, goto undone; } + index++; + /* + * Here is a more important optimization than it appears: + * a second SGP_FALLOC on the same huge page will clear it, + * making it PageUptodate and un-undoable if we fail later. + */ + if (PageTransCompound(page)) { + index = round_up(index, HPAGE_PMD_NR); + /* Beware 32-bit wraparound */ + if (!index) + index--; + } + /* * Inform shmem_writepage() how far we have reached. * No need for lock or barrier: we have the page lock. */ - shmem_falloc.next++; if (!PageUptodate(page)) - shmem_falloc.nr_falloced++; + shmem_falloc.nr_falloced += index - shmem_falloc.next; + shmem_falloc.next = index; /* * If !PageUptodate, leave it that way so that freeable pages From 59a90bcd2bb42bec77be0766809f760da95b0d5c Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 24 Aug 2021 09:59:12 +1000 Subject: [PATCH 097/365] huge tmpfs: fix split_huge_page() after FALLOC_FL_KEEP_SIZE A successful shmem_fallocate() guarantees that the extent has been reserved, even beyond i_size when the FALLOC_FL_KEEP_SIZE flag was used. But that guarantee is broken by shmem_unused_huge_shrink()'s attempts to split huge pages and free their excess beyond i_size; and by other uses of split_huge_page() near i_size. It's sad to add a shmem inode field just for this, but I did not find a better way to keep the guarantee. A flag to say KEEP_SIZE has been used would be cheaper, but I'm averse to unclearable flags. The fallocend field is not perfect either (many disjoint ranges might be fallocated), but good enough; and gains another use later on. Link: https://lkml.kernel.org/r/ca9a146-3a59-6cd3-7f28-e9a044bb1052@google.com Fixes: 779750d20b93 ("shmem: split huge pages beyond i_size under memory pressure") Signed-off-by: Hugh Dickins Reviewed-by: Yang Shi Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Kravetz Cc: Rik van Riel Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/shmem_fs.h | 13 +++++++++++++ mm/huge_memory.c | 6 ++++-- mm/shmem.c | 15 ++++++++++++++- 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 0a8499fb9c3cb..bfc5899d18e00 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -18,6 +18,7 @@ struct shmem_inode_info { unsigned long flags; unsigned long alloced; /* data pages alloced to file */ unsigned long swapped; /* subtotal assigned to swap */ + pgoff_t fallocend; /* highest fallocate endindex */ struct list_head shrinklist; /* shrinkable hpage inodes */ struct list_head swaplist; /* chain of maybes on swap */ struct shared_policy policy; /* NUMA memory alloc policy */ @@ -119,6 +120,18 @@ static inline bool shmem_file(struct file *file) return shmem_mapping(file->f_mapping); } +/* + * If fallocate(FALLOC_FL_KEEP_SIZE) has been used, there may be pages + * beyond i_size's notion of EOF, which fallocate has committed to reserving: + * which split_huge_page() must therefore not delete. This use of a single + * "fallocend" per inode errs on the side of not deleting a reservation when + * in doubt: there are plenty of cases when it preserves unreserved pages. + */ +static inline pgoff_t shmem_fallocend(struct inode *inode, pgoff_t eof) +{ + return max(eof, SHMEM_I(inode)->fallocend); +} + extern bool shmem_charge(struct inode *inode, long pages); extern void shmem_uncharge(struct inode *inode, long pages); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index afff3ac870673..890fb73ac89bc 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2454,11 +2454,11 @@ static void __split_huge_page(struct page *page, struct list_head *list, for (i = nr - 1; i >= 1; i--) { __split_huge_page_tail(head, i, lruvec, list); - /* Some pages can be beyond i_size: drop them from page cache */ + /* Some pages can be beyond EOF: drop them from page cache */ if (head[i].index >= end) { ClearPageDirty(head + i); __delete_from_page_cache(head + i, NULL); - if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head)) + if (shmem_mapping(head->mapping)) shmem_uncharge(head->mapping->host, 1); put_page(head + i); } else if (!PageAnon(page)) { @@ -2686,6 +2686,8 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) * head page lock is good enough to serialize the trimming. */ end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); + if (shmem_mapping(mapping)) + end = shmem_fallocend(mapping->host, end); } /* diff --git a/mm/shmem.c b/mm/shmem.c index 9ef579f6cab33..e391325dfc214 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -902,6 +902,9 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (lend == -1) end = -1; /* unsigned, so actually very big */ + if (info->fallocend > start && info->fallocend <= end && !unfalloc) + info->fallocend = start; + pagevec_init(&pvec); index = start; while (index < end && find_lock_entries(mapping, index, end - 1, @@ -2650,7 +2653,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_falloc shmem_falloc; - pgoff_t start, index, end; + pgoff_t start, index, end, undo_fallocend; int error; if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) @@ -2719,6 +2722,15 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, inode->i_private = &shmem_falloc; spin_unlock(&inode->i_lock); + /* + * info->fallocend is only relevant when huge pages might be + * involved: to prevent split_huge_page() freeing fallocated + * pages when FALLOC_FL_KEEP_SIZE committed beyond i_size. + */ + undo_fallocend = info->fallocend; + if (info->fallocend < end) + info->fallocend = end; + for (index = start; index < end; ) { struct page *page; @@ -2733,6 +2745,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, else error = shmem_getpage(inode, index, &page, SGP_FALLOC); if (error) { + info->fallocend = undo_fallocend; /* Remove the !PageUptodate pages we added */ if (index > start) { shmem_undo_range(inode, From e71bcf37883024dafa4b535c9b79d14626387cb3 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 24 Aug 2021 09:59:12 +1000 Subject: [PATCH 098/365] huge tmpfs: remove shrinklist addition from shmem_setattr() There's a block of code in shmem_setattr() to add the inode to shmem_unused_huge_shrink()'s shrinklist when lowering i_size: it dates from before 5.7 changed truncation to do split_huge_page() for itself, and should have been removed at that time. I am over-stating that: split_huge_page() can fail (notably if there's an extra reference to the page at that time), so there might be value in retrying. But there were already retries as truncation worked through the tails, and this addition risks repeating unsuccessful retries indefinitely: I'd rather remove it now, and work on reducing the chance of split_huge_page() failures separately, if we need to. Link: https://lkml.kernel.org/r/b73b3492-8822-18f9-83e2-938528cdde94@google.com Fixes: 71725ed10c40 ("mm: huge tmpfs: try to split_huge_page() when punching hole") Signed-off-by: Hugh Dickins Reviewed-by: Yang Shi Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Kravetz Cc: Rik van Riel Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/shmem.c | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index e391325dfc214..9403048f4ee10 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1058,7 +1058,6 @@ static int shmem_setattr(struct user_namespace *mnt_userns, { struct inode *inode = d_inode(dentry); struct shmem_inode_info *info = SHMEM_I(inode); - struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); int error; error = setattr_prepare(&init_user_ns, dentry, attr); @@ -1094,24 +1093,6 @@ static int shmem_setattr(struct user_namespace *mnt_userns, if (oldsize > holebegin) unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); - - /* - * Part of the huge page can be beyond i_size: subject - * to shrink under memory pressure. - */ - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { - spin_lock(&sbinfo->shrinklist_lock); - /* - * _careful to defend against unlocked access to - * ->shrink_list in shmem_unused_huge_shrink() - */ - if (list_empty_careful(&info->shrinklist)) { - list_add_tail(&info->shrinklist, - &sbinfo->shrinklist); - sbinfo->shrinklist_len++; - } - spin_unlock(&sbinfo->shrinklist_lock); - } } } From 8fd6fc8dd290542fc6726d06cbd0607ceacf309a Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 24 Aug 2021 09:59:12 +1000 Subject: [PATCH 099/365] huge tmpfs: revert shmem's use of transhuge_vma_enabled() 5.14 commit e6be37b2e7bd ("mm/huge_memory.c: add missing read-only THP checking in transparent_hugepage_enabled()") added transhuge_vma_enabled() as a wrapper for two very different checks (one check is whether the app has marked its address range not to use THPs, the other check is whether the app is running in a hierarchy that has been marked never to use THPs). shmem_huge_enabled() prefers to show those two checks explicitly, as before. Link: https://lkml.kernel.org/r/45e5338-18d-c6f9-c17e-34f510bc1728@google.com Signed-off-by: Hugh Dickins Reviewed-by: Yang Shi Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Kravetz Cc: Rik van Riel Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/shmem.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index 9403048f4ee10..43cb1a99f3ce1 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3987,7 +3987,8 @@ bool shmem_huge_enabled(struct vm_area_struct *vma) loff_t i_size; pgoff_t off; - if (!transhuge_vma_enabled(vma, vma->vm_flags)) + if ((vma->vm_flags & VM_NOHUGEPAGE) || + test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) return false; if (shmem_huge == SHMEM_HUGE_FORCE) return true; From 231c38b26e0f1a600c57892729eb8f92da094b26 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 24 Aug 2021 09:59:12 +1000 Subject: [PATCH 100/365] huge tmpfs: move shmem_huge_enabled() upwards shmem_huge_enabled() is about to be enhanced into shmem_is_huge(), so that it can be used more widely throughout: before making functional changes, shift it to its final position (to avoid forward declaration). Link: https://lkml.kernel.org/r/16fec7b7-5c84-415a-8586-69d8bf6a6685@google.com Signed-off-by: Hugh Dickins Reviewed-by: Yang Shi Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Kravetz Cc: Rik van Riel Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/shmem.c | 72 ++++++++++++++++++++++++++---------------------------- 1 file changed, 35 insertions(+), 37 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 43cb1a99f3ce1..2df6a5370cd7e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -473,6 +473,41 @@ static bool shmem_confirm_swap(struct address_space *mapping, static int shmem_huge __read_mostly; +bool shmem_huge_enabled(struct vm_area_struct *vma) +{ + struct inode *inode = file_inode(vma->vm_file); + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + loff_t i_size; + pgoff_t off; + + if ((vma->vm_flags & VM_NOHUGEPAGE) || + test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) + return false; + if (shmem_huge == SHMEM_HUGE_FORCE) + return true; + if (shmem_huge == SHMEM_HUGE_DENY) + return false; + switch (sbinfo->huge) { + case SHMEM_HUGE_NEVER: + return false; + case SHMEM_HUGE_ALWAYS: + return true; + case SHMEM_HUGE_WITHIN_SIZE: + off = round_up(vma->vm_pgoff, HPAGE_PMD_NR); + i_size = round_up(i_size_read(inode), PAGE_SIZE); + if (i_size >= HPAGE_PMD_SIZE && + i_size >> PAGE_SHIFT >= off) + return true; + fallthrough; + case SHMEM_HUGE_ADVISE: + /* TODO: implement fadvise() hints */ + return (vma->vm_flags & VM_HUGEPAGE); + default: + VM_BUG_ON(1); + return false; + } +} + #if defined(CONFIG_SYSFS) static int shmem_parse_huge(const char *str) { @@ -3979,43 +4014,6 @@ struct kobj_attribute shmem_enabled_attr = __ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store); #endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -bool shmem_huge_enabled(struct vm_area_struct *vma) -{ - struct inode *inode = file_inode(vma->vm_file); - struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); - loff_t i_size; - pgoff_t off; - - if ((vma->vm_flags & VM_NOHUGEPAGE) || - test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) - return false; - if (shmem_huge == SHMEM_HUGE_FORCE) - return true; - if (shmem_huge == SHMEM_HUGE_DENY) - return false; - switch (sbinfo->huge) { - case SHMEM_HUGE_NEVER: - return false; - case SHMEM_HUGE_ALWAYS: - return true; - case SHMEM_HUGE_WITHIN_SIZE: - off = round_up(vma->vm_pgoff, HPAGE_PMD_NR); - i_size = round_up(i_size_read(inode), PAGE_SIZE); - if (i_size >= HPAGE_PMD_SIZE && - i_size >> PAGE_SHIFT >= off) - return true; - fallthrough; - case SHMEM_HUGE_ADVISE: - /* TODO: implement fadvise() hints */ - return (vma->vm_flags & VM_HUGEPAGE); - default: - VM_BUG_ON(1); - return false; - } -} -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - #else /* !CONFIG_SHMEM */ /* From aa744ad9eef4c641ff214abef3e15d27d1bde7f3 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 24 Aug 2021 09:59:12 +1000 Subject: [PATCH 101/365] huge tmpfs: SGP_NOALLOC to stop collapse_file() on race khugepaged's collapse_file() currently uses SGP_NOHUGE to tell shmem_getpage() not to try allocating a huge page, in the very unlikely event that a racing hole-punch removes the swapped or fallocated page as soon as i_pages lock is dropped. We want to consolidate shmem's huge decisions, removing SGP_HUGE and SGP_NOHUGE; but cannot quite persuade ourselves that it's okay to regress the protection in this case - Yang Shi points out that the huge page would remain indefinitely, charged to root instead of the intended memcg. collapse_file() should not even allocate a small page in this case: why proceed if someone is punching a hole? SGP_READ is almost the right flag here, except that it optimizes away from a fallocated page, with NULL to tell caller to fill with zeroes (like a hole); whereas collapse_file()'s sequence relies on using a cache page. Add SGP_NOALLOC just for this. There are too many consecutive "if (page"s there in shmem_getpage_gfp(): group it better; and fix the outdated "bring it back from swap" comment. Link: https://lkml.kernel.org/r/1355343b-acf-4653-ef79-6aee40214ac5@google.com Signed-off-by: Hugh Dickins Reviewed-by: Yang Shi Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Kravetz Cc: Rik van Riel Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/shmem_fs.h | 1 + mm/khugepaged.c | 2 +- mm/shmem.c | 29 +++++++++++++++++------------ 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index bfc5899d18e00..a3f4502ec8a9d 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -94,6 +94,7 @@ extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, /* Flag allocation requirements to shmem_getpage */ enum sgp_type { SGP_READ, /* don't exceed i_size, don't allocate page */ + SGP_NOALLOC, /* similar, but fail on hole or use fallocated page */ SGP_CACHE, /* don't exceed i_size, may allocate page */ SGP_NOHUGE, /* like SGP_CACHE, but no huge pages */ SGP_HUGE, /* like SGP_CACHE, huge pages preferred */ diff --git a/mm/khugepaged.c b/mm/khugepaged.c index b0412be08fa2c..045cc579f724e 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1721,7 +1721,7 @@ static void collapse_file(struct mm_struct *mm, xas_unlock_irq(&xas); /* swap in or instantiate fallocated page */ if (shmem_getpage(mapping->host, index, &page, - SGP_NOHUGE)) { + SGP_NOALLOC)) { result = SCAN_FAIL; goto xa_unlocked; } diff --git a/mm/shmem.c b/mm/shmem.c index 2df6a5370cd7e..867cb404090a7 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1854,26 +1854,31 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, return error; } - if (page) + if (page) { hindex = page->index; - if (page && sgp == SGP_WRITE) - mark_page_accessed(page); - - /* fallocated page? */ - if (page && !PageUptodate(page)) { + if (sgp == SGP_WRITE) + mark_page_accessed(page); + if (PageUptodate(page)) + goto out; + /* fallocated page */ if (sgp != SGP_READ) goto clear; unlock_page(page); put_page(page); - page = NULL; - hindex = index; } - if (page || sgp == SGP_READ) - goto out; /* - * Fast cache lookup did not find it: - * bring it back from swap or allocate. + * SGP_READ: succeed on hole, with NULL page, letting caller zero. + * SGP_NOALLOC: fail on hole, with NULL page, letting caller fail. + */ + *pagep = NULL; + if (sgp == SGP_READ) + return 0; + if (sgp == SGP_NOALLOC) + return -ENOENT; + + /* + * Fast cache lookup and swap lookup did not find it: allocate. */ if (vma && userfaultfd_missing(vma)) { From 1be74d79c470b379761613868c4583761c3006cb Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 24 Aug 2021 09:59:13 +1000 Subject: [PATCH 102/365] huge tmpfs: shmem_is_huge(vma, inode, index) Extend shmem_huge_enabled(vma) to shmem_is_huge(vma, inode, index), so that a consistent set of checks can be applied, even when the inode is accessed through read/write syscalls (with NULL vma) instead of mmaps (the index argument is seldom of interest, but required by mount option "huge=within_size"). Clean up and rearrange the checks a little. This then replaces the checks which shmem_fault() and shmem_getpage_gfp() were making, and eliminates the SGP_HUGE and SGP_NOHUGE modes. Replace a couple of 0s by explicit SHMEM_HUGE_NEVERs; and replace the obscure !shmem_mapping() symlink check by explicit S_ISLNK() - nothing else needs that symlink check, so leave it there in shmem_getpage_gfp(). Link: https://lkml.kernel.org/r/23a77889-2ddc-b030-75cd-44ca27fd4d1@google.com Signed-off-by: Hugh Dickins Reviewed-by: Yang Shi Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Kravetz Cc: Rik van Riel Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/shmem_fs.h | 9 +++-- mm/shmem.c | 84 ++++++++++++---------------------------- 2 files changed, 31 insertions(+), 62 deletions(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index a3f4502ec8a9d..166158b6e917a 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -86,7 +86,12 @@ extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); extern int shmem_unuse(unsigned int type, bool frontswap, unsigned long *fs_pages_to_unuse); -extern bool shmem_huge_enabled(struct vm_area_struct *vma); +extern bool shmem_is_huge(struct vm_area_struct *vma, + struct inode *inode, pgoff_t index); +static inline bool shmem_huge_enabled(struct vm_area_struct *vma) +{ + return shmem_is_huge(vma, file_inode(vma->vm_file), vma->vm_pgoff); +} extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end); @@ -96,8 +101,6 @@ enum sgp_type { SGP_READ, /* don't exceed i_size, don't allocate page */ SGP_NOALLOC, /* similar, but fail on hole or use fallocated page */ SGP_CACHE, /* don't exceed i_size, may allocate page */ - SGP_NOHUGE, /* like SGP_CACHE, but no huge pages */ - SGP_HUGE, /* like SGP_CACHE, huge pages preferred */ SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */ SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ }; diff --git a/mm/shmem.c b/mm/shmem.c index 867cb404090a7..69c9788a00945 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -471,39 +471,35 @@ static bool shmem_confirm_swap(struct address_space *mapping, #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* ifdef here to avoid bloating shmem.o when not necessary */ -static int shmem_huge __read_mostly; +static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER; -bool shmem_huge_enabled(struct vm_area_struct *vma) +bool shmem_is_huge(struct vm_area_struct *vma, + struct inode *inode, pgoff_t index) { - struct inode *inode = file_inode(vma->vm_file); - struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); loff_t i_size; - pgoff_t off; - if ((vma->vm_flags & VM_NOHUGEPAGE) || - test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) - return false; - if (shmem_huge == SHMEM_HUGE_FORCE) - return true; if (shmem_huge == SHMEM_HUGE_DENY) return false; - switch (sbinfo->huge) { - case SHMEM_HUGE_NEVER: + if (vma && ((vma->vm_flags & VM_NOHUGEPAGE) || + test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))) return false; + if (shmem_huge == SHMEM_HUGE_FORCE) + return true; + + switch (SHMEM_SB(inode->i_sb)->huge) { case SHMEM_HUGE_ALWAYS: return true; case SHMEM_HUGE_WITHIN_SIZE: - off = round_up(vma->vm_pgoff, HPAGE_PMD_NR); + index = round_up(index, HPAGE_PMD_NR); i_size = round_up(i_size_read(inode), PAGE_SIZE); - if (i_size >= HPAGE_PMD_SIZE && - i_size >> PAGE_SHIFT >= off) + if (i_size >= HPAGE_PMD_SIZE && (i_size >> PAGE_SHIFT) >= index) return true; fallthrough; case SHMEM_HUGE_ADVISE: - /* TODO: implement fadvise() hints */ - return (vma->vm_flags & VM_HUGEPAGE); + if (vma && (vma->vm_flags & VM_HUGEPAGE)) + return true; + fallthrough; default: - VM_BUG_ON(1); return false; } } @@ -677,6 +673,12 @@ static long shmem_unused_huge_count(struct super_block *sb, #define shmem_huge SHMEM_HUGE_DENY +bool shmem_is_huge(struct vm_area_struct *vma, + struct inode *inode, pgoff_t index) +{ + return false; +} + static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, struct shrink_control *sc, unsigned long nr_to_split) { @@ -1812,7 +1814,6 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct shmem_sb_info *sbinfo; struct mm_struct *charge_mm; struct page *page; - enum sgp_type sgp_huge = sgp; pgoff_t hindex = index; gfp_t huge_gfp; int error; @@ -1821,8 +1822,6 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT)) return -EFBIG; - if (sgp == SGP_NOHUGE || sgp == SGP_HUGE) - sgp = SGP_CACHE; repeat: if (sgp <= SGP_CACHE && ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { @@ -1886,36 +1885,12 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, return 0; } - /* shmem_symlink() */ - if (!shmem_mapping(mapping)) - goto alloc_nohuge; - if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE) + /* Never use a huge page for shmem_symlink() */ + if (S_ISLNK(inode->i_mode)) goto alloc_nohuge; - if (shmem_huge == SHMEM_HUGE_FORCE) - goto alloc_huge; - switch (sbinfo->huge) { - case SHMEM_HUGE_NEVER: + if (!shmem_is_huge(vma, inode, index)) goto alloc_nohuge; - case SHMEM_HUGE_WITHIN_SIZE: { - loff_t i_size; - pgoff_t off; - - off = round_up(index, HPAGE_PMD_NR); - i_size = round_up(i_size_read(inode), PAGE_SIZE); - if (i_size >= HPAGE_PMD_SIZE && - i_size >> PAGE_SHIFT >= off) - goto alloc_huge; - fallthrough; - } - case SHMEM_HUGE_ADVISE: - if (sgp_huge == SGP_HUGE) - goto alloc_huge; - /* TODO: implement fadvise() hints */ - goto alloc_nohuge; - } - -alloc_huge: huge_gfp = vma_thp_gfp_mask(vma); huge_gfp = limit_gfp_mask(huge_gfp, gfp); page = shmem_alloc_and_acct_page(huge_gfp, inode, index, true); @@ -2071,7 +2046,6 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; struct inode *inode = file_inode(vma->vm_file); gfp_t gfp = mapping_gfp_mask(inode->i_mapping); - enum sgp_type sgp; int err; vm_fault_t ret = VM_FAULT_LOCKED; @@ -2134,15 +2108,7 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf) spin_unlock(&inode->i_lock); } - sgp = SGP_CACHE; - - if ((vma->vm_flags & VM_NOHUGEPAGE) || - test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) - sgp = SGP_NOHUGE; - else if (vma->vm_flags & VM_HUGEPAGE) - sgp = SGP_HUGE; - - err = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp, + err = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, SGP_CACHE, gfp, vma, vmf, &ret); if (err) return vmf_error(err); @@ -3950,7 +3916,7 @@ int __init shmem_init(void) if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY) SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; else - shmem_huge = 0; /* just in case it was patched */ + shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */ #endif return 0; From c01e81c15869130780faa1568df6fe750bde0a57 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 24 Aug 2021 09:59:13 +1000 Subject: [PATCH 103/365] huge tmpfs: decide stat.st_blksize by shmem_is_huge() 4.18 commit 89fdcd262fd4 ("mm: shmem: make stat.st_blksize return huge page size if THP is on") added is_huge_enabled() to decide st_blksize: if hugeness is to be defined per file, that will need to be replaced by shmem_is_huge(). This does give a different answer (No) for small files on a "huge=within_size" mount: but that can be considered a minor bugfix. And a different answer (No) for default files on a "huge=advise" mount: I'm reluctant to complicate it, just to reproduce the same debatable answer as before. Link: https://lkml.kernel.org/r/af7fb3f9-4415-9e8e-fdac-b1a5253ad21@google.com Signed-off-by: Hugh Dickins Reviewed-by: Yang Shi Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Kravetz Cc: Rik van Riel Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/shmem.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 69c9788a00945..69aa932c08da5 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -686,15 +686,6 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo) -{ - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && - (shmem_huge == SHMEM_HUGE_FORCE || sbinfo->huge) && - shmem_huge != SHMEM_HUGE_DENY) - return true; - return false; -} - /* * Like add_to_page_cache_locked, but error if expected item has gone. */ @@ -1075,7 +1066,6 @@ static int shmem_getattr(struct user_namespace *mnt_userns, { struct inode *inode = path->dentry->d_inode; struct shmem_inode_info *info = SHMEM_I(inode); - struct shmem_sb_info *sb_info = SHMEM_SB(inode->i_sb); if (info->alloced - info->swapped != inode->i_mapping->nrpages) { spin_lock_irq(&info->lock); @@ -1084,7 +1074,7 @@ static int shmem_getattr(struct user_namespace *mnt_userns, } generic_fillattr(&init_user_ns, inode, stat); - if (is_huge_enabled(sb_info)) + if (shmem_is_huge(NULL, inode, 0)) stat->blksize = HPAGE_PMD_SIZE; return 0; From 56cd66ffe5304a8a11042618215c2186ed9596db Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 24 Aug 2021 09:59:13 +1000 Subject: [PATCH 104/365] shmem: shmem_writepage() split unlikely i915 THP drivers/gpu/drm/i915/gem/i915_gem_shmem.c contains a shmem_writeback() which calls shmem_writepage() from a shrinker: that usually works well enough; but if /sys/kernel/mm/transparent_hugepage/shmem_enabled has been set to "always" (intended to be usable) or "force" (forces huge everywhere for easy testing), shmem_writepage() is surprised to be called with a huge page, and crashes on the VM_BUG_ON_PAGE(PageCompound) (I did not find out where the crash happens when CONFIG_DEBUG_VM is off). LRU page reclaim always splits the shmem huge page first: I'd prefer not to demand that of i915, so check and split compound in shmem_writepage(). Patch history: when first sent last year http://lkml.kernel.org/r/alpine.LSU.2.11.2008301401390.5954@eggly.anvils https://lore.kernel.org/linux-mm/20200919042009.bomzxmrg7%25akpm@linux-foundation.org/ Matthew Wilcox noticed that tail pages were wrongly left clean. This version brackets the split with Set and Clear PageDirty as he suggested: which works very well, even if it falls short of our aspirations. And recently I realized that the crash is not limited to the testing option "force", but affects "always" too: which is more important to fix. Link: https://lkml.kernel.org/r/bac6158c-8b3d-4dca-cffc-4982f58d9794@google.com Fixes: 2d6692e642e7 ("drm/i915: Start writeback from the shrinker") Signed-off-by: Hugh Dickins Reviewed-by: Shakeel Butt Acked-by: Yang Shi Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Kravetz Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/shmem.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index 69aa932c08da5..21c29f7e38943 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1344,7 +1344,19 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) swp_entry_t swap; pgoff_t index; - VM_BUG_ON_PAGE(PageCompound(page), page); + /* + * If /sys/kernel/mm/transparent_hugepage/shmem_enabled is "always" or + * "force", drivers/gpu/drm/i915/gem/i915_gem_shmem.c gets huge pages, + * and its shmem_writeback() needs them to be split when swapping. + */ + if (PageTransCompound(page)) { + /* Ensure the subpages are still dirty */ + SetPageDirty(page); + if (split_huge_page(page) < 0) + goto redirty; + ClearPageDirty(page); + } + BUG_ON(!PageLocked(page)); mapping = page->mapping; index = page->index; From 80bd10a19b99e5f3ef05fd999044268efa39a2d0 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 24 Aug 2021 09:59:13 +1000 Subject: [PATCH 105/365] mm, memcg: add mem_cgroup_disabled checks in vmpressure and swap-related functions Add mem_cgroup_disabled check in vmpressure, mem_cgroup_uncharge_swap and cgroup_throttle_swaprate functions. This minimizes the memcg overhead in the pagefault and exit_mmap paths when memcgs are disabled using cgroup_disable=memory command-line option. This change results in ~2.1% overhead reduction when running PFT test [1] comparing {CONFIG_MEMCG=n, CONFIG_MEMCG_SWAP=n} against {CONFIG_MEMCG=y, CONFIG_MEMCG_SWAP=y, cgroup_disable=memory} configuration on an 8-core ARM64 Android device. [1] https://lkml.org/lkml/2006/8/29/294 also used in mmtests suite Link: https://lkml.kernel.org/r/20210713010934.299876-1-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Shakeel Butt Reviewed-by: Muchun Song Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Alex Shi Cc: Alistair Popple Cc: David Hildenbrand Cc: Jens Axboe Cc: Joonsoo Kim Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Minchan Kim Cc: Roman Gushchin Cc: Tejun Heo Cc: Wei Yang Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memcontrol.c | 3 +++ mm/swapfile.c | 3 +++ mm/vmpressure.c | 7 ++++++- 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1047f0271ff8a..211f3911228f7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7297,6 +7297,9 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) struct mem_cgroup *memcg; unsigned short id; + if (mem_cgroup_disabled()) + return; + id = swap_cgroup_record(entry, 0, nr_pages); rcu_read_lock(); memcg = mem_cgroup_from_id(id); diff --git a/mm/swapfile.c b/mm/swapfile.c index 7527afd95284e..627b16aed1dc8 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3784,6 +3784,9 @@ void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) struct swap_info_struct *si, *next; int nid = page_to_nid(page); + if (mem_cgroup_disabled()) + return; + if (!(gfp_mask & __GFP_IO)) return; diff --git a/mm/vmpressure.c b/mm/vmpressure.c index d69019fc37898..9b172561fded7 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -240,7 +240,12 @@ static void vmpressure_work_fn(struct work_struct *work) void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, unsigned long scanned, unsigned long reclaimed) { - struct vmpressure *vmpr = memcg_to_vmpressure(memcg); + struct vmpressure *vmpr; + + if (mem_cgroup_disabled()) + return; + + vmpr = memcg_to_vmpressure(memcg); /* * Here we only want to account pressure that userland is able to From 3f0de3fa5ed59d81bfbab49fd80cc512b95691ae Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 24 Aug 2021 09:59:13 +1000 Subject: [PATCH 106/365] mm, memcg: inline mem_cgroup_{charge/uncharge} to improve disabled memcg config Inline mem_cgroup_{charge/uncharge} and mem_cgroup_uncharge_list functions functions to perform mem_cgroup_disabled static key check inline before calling the main body of the function. This minimizes the memcg overhead in the pagefault and exit_mmap paths when memcgs are disabled using cgroup_disable=memory command-line option. This change results in ~0.4% overhead reduction when running PFT test [1] comparing {CONFIG_MEMCG=n} against {CONFIG_MEMCG=y, cgroup_disable=memory} configuration on an 8-core ARM64 Android device. [1] https://lkml.org/lkml/2006/8/29/294 also used in mmtests suite Link: https://lkml.kernel.org/r/20210713010934.299876-2-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Shakeel Butt Reviewed-by: Muchun Song Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Alex Shi Cc: Alistair Popple Cc: David Hildenbrand Cc: Jens Axboe Cc: Joonsoo Kim Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Minchan Kim Cc: Roman Gushchin Cc: Tejun Heo Cc: Wei Yang Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/memcontrol.h | 28 +++++++++++++++++++++++++--- mm/memcontrol.c | 33 ++++++++++++--------------------- 2 files changed, 37 insertions(+), 24 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 3403ec77528ac..a00bf337567bc 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -693,13 +693,35 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg) page_counter_read(&memcg->memory); } -int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); +int __mem_cgroup_charge(struct page *page, struct mm_struct *mm, + gfp_t gfp_mask); +static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm, + gfp_t gfp_mask) +{ + if (mem_cgroup_disabled()) + return 0; + return __mem_cgroup_charge(page, mm, gfp_mask); +} + int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry); void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry); -void mem_cgroup_uncharge(struct page *page); -void mem_cgroup_uncharge_list(struct list_head *page_list); +void __mem_cgroup_uncharge(struct page *page); +static inline void mem_cgroup_uncharge(struct page *page) +{ + if (mem_cgroup_disabled()) + return; + __mem_cgroup_uncharge(page); +} + +void __mem_cgroup_uncharge_list(struct list_head *page_list); +static inline void mem_cgroup_uncharge_list(struct list_head *page_list) +{ + if (mem_cgroup_disabled()) + return; + __mem_cgroup_uncharge_list(page_list); +} void mem_cgroup_migrate(struct page *oldpage, struct page *newpage); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 211f3911228f7..33bb8434eea05 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6693,8 +6693,7 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root, atomic_long_read(&parent->memory.children_low_usage))); } -static int __mem_cgroup_charge(struct page *page, struct mem_cgroup *memcg, - gfp_t gfp) +static int charge_memcg(struct page *page, struct mem_cgroup *memcg, gfp_t gfp) { unsigned int nr_pages = thp_nr_pages(page); int ret; @@ -6715,7 +6714,7 @@ static int __mem_cgroup_charge(struct page *page, struct mem_cgroup *memcg, } /** - * mem_cgroup_charge - charge a newly allocated page to a cgroup + * __mem_cgroup_charge - charge a newly allocated page to a cgroup * @page: page to charge * @mm: mm context of the victim * @gfp_mask: reclaim mode @@ -6728,16 +6727,14 @@ static int __mem_cgroup_charge(struct page *page, struct mem_cgroup *memcg, * * Returns 0 on success. Otherwise, an error code is returned. */ -int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) +int __mem_cgroup_charge(struct page *page, struct mm_struct *mm, + gfp_t gfp_mask) { struct mem_cgroup *memcg; int ret; - if (mem_cgroup_disabled()) - return 0; - memcg = get_mem_cgroup_from_mm(mm); - ret = __mem_cgroup_charge(page, memcg, gfp_mask); + ret = charge_memcg(page, memcg, gfp_mask); css_put(&memcg->css); return ret; @@ -6772,7 +6769,7 @@ int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm, memcg = get_mem_cgroup_from_mm(mm); rcu_read_unlock(); - ret = __mem_cgroup_charge(page, memcg, gfp); + ret = charge_memcg(page, memcg, gfp); css_put(&memcg->css); return ret; @@ -6908,18 +6905,15 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) } /** - * mem_cgroup_uncharge - uncharge a page + * __mem_cgroup_uncharge - uncharge a page * @page: page to uncharge * - * Uncharge a page previously charged with mem_cgroup_charge(). + * Uncharge a page previously charged with __mem_cgroup_charge(). */ -void mem_cgroup_uncharge(struct page *page) +void __mem_cgroup_uncharge(struct page *page) { struct uncharge_gather ug; - if (mem_cgroup_disabled()) - return; - /* Don't touch page->lru of any random page, pre-check: */ if (!page_memcg(page)) return; @@ -6930,20 +6924,17 @@ void mem_cgroup_uncharge(struct page *page) } /** - * mem_cgroup_uncharge_list - uncharge a list of page + * __mem_cgroup_uncharge_list - uncharge a list of page * @page_list: list of pages to uncharge * * Uncharge a list of pages previously charged with - * mem_cgroup_charge(). + * __mem_cgroup_charge(). */ -void mem_cgroup_uncharge_list(struct list_head *page_list) +void __mem_cgroup_uncharge_list(struct list_head *page_list) { struct uncharge_gather ug; struct page *page; - if (mem_cgroup_disabled()) - return; - uncharge_gather_clear(&ug); list_for_each_entry(page, page_list, lru) uncharge_page(page, &ug); From 9cc0b1127bbf1149ecf80efe9517aae62f4a4fe2 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 24 Aug 2021 09:59:14 +1000 Subject: [PATCH 107/365] mm, memcg: inline swap-related functions to improve disabled memcg config Inline mem_cgroup_try_charge_swap, mem_cgroup_uncharge_swap and cgroup_throttle_swaprate functions to perform mem_cgroup_disabled static key check inline before calling the main body of the function. This minimizes the memcg overhead in the pagefault and exit_mmap paths when memcgs are disabled using cgroup_disable=memory command-line option. This change results in ~1% overhead reduction when running PFT test [1] comparing {CONFIG_MEMCG=n} against {CONFIG_MEMCG=y, cgroup_disable=memory} configuration on an 8-core ARM64 Android device. [1] https://lkml.org/lkml/2006/8/29/294 also used in mmtests suite Link: https://lkml.kernel.org/r/20210713010934.299876-3-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Shakeel Butt Reviewed-by: Muchun Song Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Cc: Roman Gushchin Cc: Yang Shi Cc: Alex Shi Cc: Wei Yang Cc: Jens Axboe Cc: Joonsoo Kim Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Alistair Popple Cc: Minchan Kim Cc: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/swap.h | 26 +++++++++++++++++++++++--- mm/memcontrol.c | 14 ++++---------- mm/swapfile.c | 5 +---- 3 files changed, 28 insertions(+), 17 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 6f5a43251593c..f30d26b0f71db 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -721,7 +721,13 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) #endif #if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) -extern void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask); +extern void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask); +static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) +{ + if (mem_cgroup_disabled()) + return; + __cgroup_throttle_swaprate(page, gfp_mask); +} #else static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) { @@ -730,8 +736,22 @@ static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) #ifdef CONFIG_MEMCG_SWAP extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry); -extern int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry); -extern void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages); +extern int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry); +static inline int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) +{ + if (mem_cgroup_disabled()) + return 0; + return __mem_cgroup_try_charge_swap(page, entry); +} + +extern void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages); +static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) +{ + if (mem_cgroup_disabled()) + return; + __mem_cgroup_uncharge_swap(entry, nr_pages); +} + extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg); extern bool mem_cgroup_swap_full(struct page *page); #else diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 33bb8434eea05..81e15a67391b2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7226,7 +7226,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) } /** - * mem_cgroup_try_charge_swap - try charging swap space for a page + * __mem_cgroup_try_charge_swap - try charging swap space for a page * @page: page being added to swap * @entry: swap entry to charge * @@ -7234,16 +7234,13 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) * * Returns 0 on success, -ENOMEM on failure. */ -int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) +int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) { unsigned int nr_pages = thp_nr_pages(page); struct page_counter *counter; struct mem_cgroup *memcg; unsigned short oldid; - if (mem_cgroup_disabled()) - return 0; - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) return 0; @@ -7279,18 +7276,15 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) } /** - * mem_cgroup_uncharge_swap - uncharge swap space + * __mem_cgroup_uncharge_swap - uncharge swap space * @entry: swap entry to uncharge * @nr_pages: the amount of swap space to uncharge */ -void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) +void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) { struct mem_cgroup *memcg; unsigned short id; - if (mem_cgroup_disabled()) - return; - id = swap_cgroup_record(entry, 0, nr_pages); rcu_read_lock(); memcg = mem_cgroup_from_id(id); diff --git a/mm/swapfile.c b/mm/swapfile.c index 627b16aed1dc8..22d10f7138487 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3779,14 +3779,11 @@ static void free_swap_count_continuations(struct swap_info_struct *si) } #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) -void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) +void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) { struct swap_info_struct *si, *next; int nid = page_to_nid(page); - if (mem_cgroup_disabled()) - return; - if (!(gfp_mask & __GFP_IO)) return; From 1e2c711720d7882dd75d282d6cee75ba80565d55 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Tue, 24 Aug 2021 09:59:14 +1000 Subject: [PATCH 108/365] memcg: enable accounting for pids in nested pid namespaces MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 5d097056c9a0 ("kmemcg: account certain kmem allocations to memcg") enabled memcg accounting for pids allocated from init_pid_ns.pid_cachep, but forgot to adjust the setting for nested pid namespaces. As a result, pid memory is not accounted exactly where it is really needed, inside memcg-limited containers with their own pid namespaces. Pid was one the first kernel objects enabled for memcg accounting. init_pid_ns.pid_cachep marked by SLAB_ACCOUNT and we can expect that any new pids in the system are memcg-accounted. Though recently I've noticed that it is wrong. nested pid namespaces creates own slab caches for pid objects, nested pids have increased size because contain id both for all parent and for own pid namespaces. The problem is that these slab caches are _NOT_ marked by SLAB_ACCOUNT, as a result any pids allocated in nested pid namespaces are not memcg-accounted. Pid struct in nested pid namespace consumes up to 500 bytes memory, 100000 such objects gives us up to ~50Mb unaccounted memory, this allow container to exceed assigned memcg limits. Link: https://lkml.kernel.org/r/8b6de616-fd1a-02c6-cbdb-976ecdcfa604@virtuozzo.com Fixes: 5d097056c9a0 ("kmemcg: account certain kmem allocations to memcg") Cc: stable@vger.kernel.org Signed-off-by: Vasily Averin Reviewed-by: Michal Koutný Reviewed-by: Shakeel Butt Acked-by: Christian Brauner Acked-by: Roman Gushchin Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- kernel/pid_namespace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index ca43239a255ad..cb5a25a8a0cc7 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -51,7 +51,8 @@ static struct kmem_cache *create_pid_cachep(unsigned int level) mutex_lock(&pid_caches_mutex); /* Name collision forces to do allocation under mutex. */ if (!*pkc) - *pkc = kmem_cache_create(name, len, 0, SLAB_HWCACHE_ALIGN, 0); + *pkc = kmem_cache_create(name, len, 0, + SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, 0); mutex_unlock(&pid_caches_mutex); /* current can fail, but someone else can succeed. */ return READ_ONCE(*pkc); From c5e3a6e1a4fafcdd752999d86f945580297396df Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Tue, 24 Aug 2021 09:59:14 +1000 Subject: [PATCH 109/365] memcg: switch lruvec stats to rstat MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit 2d146aa3aa84 ("mm: memcontrol: switch to rstat") switched memcg stats to rstat infrastructure but skipped the conversion of the lruvec stats as such stats are read in the performance critical code paths and flushing stats may have impacted the performances of the applications. This patch converts the lruvec stats to rstat and later patches add mechanisms to keep the performance impact to minimum. The rstat conversion comes with the price i.e. memory cost. Effectively this patch reverts the savings done by the commit f3344adf38bd ("mm: memcontrol: optimize per-lruvec stats counter memory usage"). However this cost is justified due to negative impact of the inaccurate lruvec stats on many heuristics. One such case is reported in [1]. The memory reclaim code is filled with plethora of heuristics and many of those heuristics reads the lruvec stats. So, inaccurate stats can make such heuristics ineffective. [1] reports the impact of inaccurate lruvec stats on the "cache trim mode" heuristic. Inaccurate lruvec stats can impact the deactivation and aging anon heuristics as well. [1] https://lore.kernel.org/linux-mm/20210311004449.1170308-1-ying.huang@intel.com/ Link: https://lkml.kernel.org/r/20210716212137.1391164-1-shakeelb@google.com Link: https://lkml.kernel.org/r/20210714013948.270662-1-shakeelb@google.com Signed-off-by: Shakeel Butt Cc: Tejun Heo Cc: Johannes Weiner Cc: Muchun Song Cc: Michal Hocko Cc: Roman Gushchin Cc: Huang Ying Cc: Hillf Danton Cc: Michal Koutný Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/memcontrol.h | 42 +++++++------- mm/memcontrol.c | 114 +++++++++++++------------------------ 2 files changed, 58 insertions(+), 98 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index a00bf337567bc..47d35bef9f6ee 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -105,14 +105,6 @@ struct mem_cgroup_reclaim_iter { unsigned int generation; }; -struct lruvec_stat { - long count[NR_VM_NODE_STAT_ITEMS]; -}; - -struct batched_lruvec_stat { - s32 count[NR_VM_NODE_STAT_ITEMS]; -}; - /* * Bitmap and deferred work of shrinker::id corresponding to memcg-aware * shrinkers, which have elements charged to this memcg. @@ -123,24 +115,30 @@ struct shrinker_info { unsigned long *map; }; +struct lruvec_stats_percpu { + /* Local (CPU and cgroup) state */ + long state[NR_VM_NODE_STAT_ITEMS]; + + /* Delta calculation for lockless upward propagation */ + long state_prev[NR_VM_NODE_STAT_ITEMS]; +}; + +struct lruvec_stats { + /* Aggregated (CPU and subtree) state */ + long state[NR_VM_NODE_STAT_ITEMS]; + + /* Pending child counts during tree propagation */ + long state_pending[NR_VM_NODE_STAT_ITEMS]; +}; + /* * per-node information in memory controller. */ struct mem_cgroup_per_node { struct lruvec lruvec; - /* - * Legacy local VM stats. This should be struct lruvec_stat and - * cannot be optimized to struct batched_lruvec_stat. Because - * the threshold of the lruvec_stat_cpu can be as big as - * MEMCG_CHARGE_BATCH * PAGE_SIZE. It can fit into s32. But this - * filed has no upper limit. - */ - struct lruvec_stat __percpu *lruvec_stat_local; - - /* Subtree VM stats (batched updates) */ - struct batched_lruvec_stat __percpu *lruvec_stat_cpu; - atomic_long_t lruvec_stat[NR_VM_NODE_STAT_ITEMS]; + struct lruvec_stats_percpu __percpu *lruvec_stats_percpu; + struct lruvec_stats lruvec_stats; unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; @@ -997,7 +995,7 @@ static inline unsigned long lruvec_page_state(struct lruvec *lruvec, return node_page_state(lruvec_pgdat(lruvec), idx); pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - x = atomic_long_read(&pn->lruvec_stat[idx]); + x = READ_ONCE(pn->lruvec_stats.state[idx]); #ifdef CONFIG_SMP if (x < 0) x = 0; @@ -1017,7 +1015,7 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); for_each_possible_cpu(cpu) - x += per_cpu(pn->lruvec_stat_local->count[idx], cpu); + x += per_cpu(pn->lruvec_stats_percpu->state[idx], cpu); #ifdef CONFIG_SMP if (x < 0) x = 0; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 81e15a67391b2..400d210e030ac 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -660,23 +660,11 @@ static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx) return x; } -static struct mem_cgroup_per_node * -parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid) -{ - struct mem_cgroup *parent; - - parent = parent_mem_cgroup(pn->memcg); - if (!parent) - return NULL; - return parent->nodeinfo[nid]; -} - void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { struct mem_cgroup_per_node *pn; struct mem_cgroup *memcg; - long x, threshold = MEMCG_CHARGE_BATCH; pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); memcg = pn->memcg; @@ -685,21 +673,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, __mod_memcg_state(memcg, idx, val); /* Update lruvec */ - __this_cpu_add(pn->lruvec_stat_local->count[idx], val); - - if (vmstat_item_in_bytes(idx)) - threshold <<= PAGE_SHIFT; - - x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]); - if (unlikely(abs(x) > threshold)) { - pg_data_t *pgdat = lruvec_pgdat(lruvec); - struct mem_cgroup_per_node *pi; - - for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id)) - atomic_long_add(x, &pi->lruvec_stat[idx]); - x = 0; - } - __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x); + __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val); } /** @@ -2278,40 +2252,13 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) mutex_unlock(&percpu_charge_mutex); } -static void memcg_flush_lruvec_page_state(struct mem_cgroup *memcg, int cpu) -{ - int nid; - - for_each_node(nid) { - struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid]; - unsigned long stat[NR_VM_NODE_STAT_ITEMS]; - struct batched_lruvec_stat *lstatc; - int i; - - lstatc = per_cpu_ptr(pn->lruvec_stat_cpu, cpu); - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { - stat[i] = lstatc->count[i]; - lstatc->count[i] = 0; - } - - do { - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) - atomic_long_add(stat[i], &pn->lruvec_stat[i]); - } while ((pn = parent_nodeinfo(pn, nid))); - } -} - static int memcg_hotplug_cpu_dead(unsigned int cpu) { struct memcg_stock_pcp *stock; - struct mem_cgroup *memcg; stock = &per_cpu(memcg_stock, cpu); drain_stock(stock); - for_each_mem_cgroup(memcg) - memcg_flush_lruvec_page_state(memcg, cpu); - return 0; } @@ -5118,17 +5065,9 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) if (!pn) return 1; - pn->lruvec_stat_local = alloc_percpu_gfp(struct lruvec_stat, - GFP_KERNEL_ACCOUNT); - if (!pn->lruvec_stat_local) { - kfree(pn); - return 1; - } - - pn->lruvec_stat_cpu = alloc_percpu_gfp(struct batched_lruvec_stat, - GFP_KERNEL_ACCOUNT); - if (!pn->lruvec_stat_cpu) { - free_percpu(pn->lruvec_stat_local); + pn->lruvec_stats_percpu = alloc_percpu_gfp(struct lruvec_stats_percpu, + GFP_KERNEL_ACCOUNT); + if (!pn->lruvec_stats_percpu) { kfree(pn); return 1; } @@ -5149,8 +5088,7 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) if (!pn) return; - free_percpu(pn->lruvec_stat_cpu); - free_percpu(pn->lruvec_stat_local); + free_percpu(pn->lruvec_stats_percpu); kfree(pn); } @@ -5166,15 +5104,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) static void mem_cgroup_free(struct mem_cgroup *memcg) { - int cpu; - memcg_wb_domain_exit(memcg); - /* - * Flush percpu lruvec stats to guarantee the value - * correctness on parent's and all ancestor levels. - */ - for_each_online_cpu(cpu) - memcg_flush_lruvec_page_state(memcg, cpu); __mem_cgroup_free(memcg); } @@ -5407,7 +5337,7 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) struct mem_cgroup *parent = parent_mem_cgroup(memcg); struct memcg_vmstats_percpu *statc; long delta, v; - int i; + int i, nid; statc = per_cpu_ptr(memcg->vmstats_percpu, cpu); @@ -5455,6 +5385,36 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) if (parent) parent->vmstats.events_pending[i] += delta; } + + for_each_node_state(nid, N_MEMORY) { + struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid]; + struct mem_cgroup_per_node *ppn = NULL; + struct lruvec_stats_percpu *lstatc; + + if (parent) + ppn = parent->nodeinfo[nid]; + + lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu); + + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { + delta = pn->lruvec_stats.state_pending[i]; + if (delta) + pn->lruvec_stats.state_pending[i] = 0; + + v = READ_ONCE(lstatc->state[i]); + if (v != lstatc->state_prev[i]) { + delta += v - lstatc->state_prev[i]; + lstatc->state_prev[i] = v; + } + + if (!delta) + continue; + + pn->lruvec_stats.state[i] += delta; + if (ppn) + ppn->lruvec_stats.state_pending[i] += delta; + } + } } #ifdef CONFIG_MMU @@ -6388,6 +6348,8 @@ static int memory_numa_stat_show(struct seq_file *m, void *v) int i; struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + cgroup_rstat_flush(memcg->css.cgroup); + for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { int nid; From d9b24bea132ca480bc92798707e2120c6028b2de Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Tue, 24 Aug 2021 09:59:14 +1000 Subject: [PATCH 110/365] memcg: infrastructure to flush memcg stats MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At the moment memcg stats are read in four contexts: 1. memcg stat user interfaces 2. dirty throttling 3. page fault 4. memory reclaim Currently the kernel flushes the stats for first two cases. Flushing the stats for remaining two casese may have performance impact. Always flushing the memcg stats on the page fault code path may negatively impacts the performance of the applications. In addition flushing in the memory reclaim code path, though treated as slowpath, can become the source of contention for the global lock taken for stat flushing because when system or memcg is under memory pressure, many tasks may enter the reclaim path. This patch uses following mechanisms to solve these challenges: 1. Periodically flush the stats from root memcg every 2 seconds. This will time limit the out of sync stats. 2. Asynchronously flush the stats after fixed number of stat updates. In the worst case the stat can be out of sync by O(nr_cpus * BATCH) for 2 seconds. 3. For avoiding thundering herd to flush the stats particularly from the memory reclaim context, introduce memcg local spinlock and let only one flusher active at a time. This could have been done through cgroup_rstat_lock lock but that lock is used by other subsystem and for userspace reading memcg stats. So, it is better to keep flushers introduced by this patch decoupled from cgroup_rstat_lock. However we would have to use irqsafe version of rstat flush but that is fine as this code path will be flushing for whole tree and do the work for everyone. No one will be waiting for that worker. Link: https://lkml.kernel.org/r/20210714013948.270662-2-shakeelb@google.com Signed-off-by: Shakeel Butt Cc: Hillf Danton Cc: Huang Ying Cc: Johannes Weiner Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Roman Gushchin Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/memcontrol.h | 6 ++++++ mm/memcontrol.c | 34 ++++++++++++++++++++++++++++++++++ mm/vmscan.c | 6 ++++++ 3 files changed, 46 insertions(+) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 47d35bef9f6ee..c4c5b652d3afa 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1023,6 +1023,8 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, return x; } +void mem_cgroup_flush_stats(void); + void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val); @@ -1438,6 +1440,10 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, return node_page_state(lruvec_pgdat(lruvec), idx); } +static inline void mem_cgroup_flush_stats(void) +{ +} + static inline void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 400d210e030ac..3dc6bc48b73f5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -103,6 +103,14 @@ static bool do_memsw_account(void) return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap; } +/* memcg and lruvec stats flushing */ +static void flush_memcg_stats_dwork(struct work_struct *w); +static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork); +static void flush_memcg_stats_work(struct work_struct *w); +static DECLARE_WORK(stats_flush_work, flush_memcg_stats_work); +static DEFINE_PER_CPU(unsigned int, stats_flush_threshold); +static DEFINE_SPINLOCK(stats_flush_lock); + #define THRESHOLDS_EVENTS_TARGET 128 #define SOFTLIMIT_EVENTS_TARGET 1024 @@ -674,6 +682,8 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, /* Update lruvec */ __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val); + if (!(__this_cpu_inc_return(stats_flush_threshold) % MEMCG_CHARGE_BATCH)) + queue_work(system_unbound_wq, &stats_flush_work); } /** @@ -5240,6 +5250,10 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) /* Online state pins memcg ID, memcg ID pins CSS */ refcount_set(&memcg->id.ref, 1); css_get(css); + + if (unlikely(mem_cgroup_is_root(memcg))) + queue_delayed_work(system_unbound_wq, &stats_flush_dwork, + 2UL*HZ); return 0; } @@ -5331,6 +5345,26 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) memcg_wb_domain_size_changed(memcg); } +void mem_cgroup_flush_stats(void) +{ + if (!spin_trylock(&stats_flush_lock)) + return; + + cgroup_rstat_flush(root_mem_cgroup->css.cgroup); + spin_unlock(&stats_flush_lock); +} + +static void flush_memcg_stats_dwork(struct work_struct *w) +{ + mem_cgroup_flush_stats(); + queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ); +} + +static void flush_memcg_stats_work(struct work_struct *w) +{ + mem_cgroup_flush_stats(); +} + static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); diff --git a/mm/vmscan.c b/mm/vmscan.c index 888e284418c07..b51fdd75d717c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2904,6 +2904,12 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); again: + /* + * Flush the memory cgroup stats, so that we read accurate per-memcg + * lruvec stats for heuristics. + */ + mem_cgroup_flush_stats(); + memset(&sc->nr, 0, sizeof(sc->nr)); nr_reclaimed = sc->nr_reclaimed; From e334c7ff1c432a929123573a5652171f177ec7ee Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Tue, 24 Aug 2021 09:59:14 +1000 Subject: [PATCH 111/365] memcg-infrastructure-to-flush-memcg-stats-v5 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix sleep-in-wrong context bug Link: https://lkml.kernel.org/r/20210716212137.1391164-2-shakeelb@google.com Signed-off-by: Shakeel Butt Reported-by: Yang Yingliang Reported-by: Marek Szyprowski Tested-by: Marek Szyprowski Cc: Stephen Rothwell Cc: Hillf Danton Cc: Huang Ying Cc: Johannes Weiner Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Roman Gushchin Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3dc6bc48b73f5..4d8c9afecf985 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5350,7 +5350,7 @@ void mem_cgroup_flush_stats(void) if (!spin_trylock(&stats_flush_lock)) return; - cgroup_rstat_flush(root_mem_cgroup->css.cgroup); + cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup); spin_unlock(&stats_flush_lock); } From d5e0dff2214d0fb3ace8062e647f80a426426b1f Mon Sep 17 00:00:00 2001 From: Yutian Yang Date: Tue, 24 Aug 2021 09:59:15 +1000 Subject: [PATCH 112/365] memcg: charge fs_context and legacy_fs_context This patch adds accounting flags to fs_context and legacy_fs_context allocation sites so that kernel could correctly charge these objects. We have written a PoC to demonstrate the effect of the missing-charging bugs. The PoC takes around 1,200MB unaccounted memory, while it is charged for only 362MB memory usage. We evaluate the PoC on QEMU x86_64 v5.2.90 + Linux kernel v5.10.19 + Debian buster. All the limitations including ulimits and sysctl variables are set as default. Specifically, the hard NOFILE limit and nr_open in sysctl are both 1,048,576. /*------------------------- POC code ----------------------------*/ #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #include #include #define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); \ } while (0) #define STACK_SIZE (8 * 1024) #ifndef __NR_fsopen #define __NR_fsopen 430 #endif static inline int fsopen(const char *fs_name, unsigned int flags) { return syscall(__NR_fsopen, fs_name, flags); } static char thread_stack[512][STACK_SIZE]; int thread_fn(void* arg) { for (int i = 0; i< 800000; ++i) { int fsfd = fsopen("nfs", FSOPEN_CLOEXEC); if (fsfd == -1) { errExit("fsopen"); } } while(1); return 0; } int main(int argc, char *argv[]) { int thread_pid; for (int i = 0; i < 1; ++i) { thread_pid = clone(thread_fn, thread_stack[i] + STACK_SIZE, \ SIGCHLD, NULL); } while(1); return 0; } /*-------------------------- end --------------------------------*/ Link: https://lkml.kernel.org/r/1626517201-24086-1-git-send-email-nglaive@gmail.com Signed-off-by: Yutian Yang Reviewed-by: Shakeel Butt Cc: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Cc: Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/fs_context.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/fs_context.c b/fs/fs_context.c index de1985eae535f..b7e43a780a625 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -254,7 +254,7 @@ static struct fs_context *alloc_fs_context(struct file_system_type *fs_type, struct fs_context *fc; int ret = -ENOMEM; - fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL); + fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL_ACCOUNT); if (!fc) return ERR_PTR(-ENOMEM); @@ -649,7 +649,7 @@ const struct fs_context_operations legacy_fs_context_ops = { */ static int legacy_init_fs_context(struct fs_context *fc) { - fc->fs_private = kzalloc(sizeof(struct legacy_fs_context), GFP_KERNEL); + fc->fs_private = kzalloc(sizeof(struct legacy_fs_context), GFP_KERNEL_ACCOUNT); if (!fc->fs_private) return -ENOMEM; fc->ops = &legacy_fs_context_ops; From 659a7bdb513c60384ada59901ab3482e4b7d29c5 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Tue, 24 Aug 2021 09:59:15 +1000 Subject: [PATCH 113/365] memcg: enable accounting for mnt_cache entries Patch series "memcg accounting from OpenVZ", v7. OpenVZ uses memory accounting 20+ years since v2.2.x linux kernels. Initially we used our own accounting subsystem, then partially committed it to upstream, and a few years ago switched to cgroups v1. Now we're rebasing again, revising our old patches and trying to push them upstream. We try to protect the host system from any misuse of kernel memory allocation triggered by untrusted users inside the containers. Patch-set is addressed mostly to cgroups maintainers and cgroups@ mailing list, though I would be very grateful for any comments from maintainersi of affected subsystems or other people added in cc: Compared to the upstream, we additionally account the following kernel objects: - network devices and its Tx/Rx queues - ipv4/v6 addresses and routing-related objects - inet_bind_bucket cache objects - VLAN group arrays - ipv6/sit: ip_tunnel_prl - scm_fp_list objects used by SCM_RIGHTS messages of Unix sockets - nsproxy and namespace objects itself - IPC objects: semaphores, message queues and share memory segments - mounts - pollfd and select bits arrays - signals and posix timers - file lock - fasync_struct used by the file lease code and driver's fasync queues - tty objects - per-mm LDT We have an incorrect/incomplete/obsoleted accounting for few other kernel objects: sk_filter, af_packets, netlink and xt_counters for iptables. They require rework and probably will be dropped at all. Also we're going to add an accounting for nft, however it is not ready yet. We have not tested performance on upstream, however, our performance team compares our current RHEL7-based production kernel and reports that they are at least not worse as the according original RHEL7 kernel. This patch (of 10): The kernel allocates ~400 bytes of 'struct mount' for any new mount. Creating a new mount namespace clones most of the parent mounts, and this can be repeated many times. Additionally, each mount allocates up to PATH_MAX=4096 bytes for mnt->mnt_devname. It makes sense to account for these allocations to restrict the host's memory consumption from inside the memcg-limited container. Link: https://lkml.kernel.org/r/045db11f-4a45-7c9b-2664-5b32c2b44943@virtuozzo.com Signed-off-by: Vasily Averin Reviewed-by: Shakeel Butt Acked-by: Christian Brauner Cc: Tejun Heo Cc: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Cc: Roman Gushchin Cc: Yutian Yang Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrei Vagin Cc: Borislav Petkov Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Jens Axboe Cc: Jiri Slaby Cc: Kirill Tkhai Cc: Oleg Nesterov Cc: Serge Hallyn Cc: Thomas Gleixner Cc: Zefan Li Cc: Borislav Petkov Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/namespace.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 97adcb5ab5d55..e51b63ae233b5 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -203,7 +203,8 @@ static struct mount *alloc_vfsmnt(const char *name) goto out_free_cache; if (name) { - mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL); + mnt->mnt_devname = kstrdup_const(name, + GFP_KERNEL_ACCOUNT); if (!mnt->mnt_devname) goto out_free_id; } @@ -4240,7 +4241,7 @@ void __init mnt_init(void) int err; mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount), - 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); mount_hashtable = alloc_large_system_hash("Mount-cache", sizeof(struct hlist_head), From e045404479675a1e9d6f1d28ce538e7a2ba1238d Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Tue, 24 Aug 2021 09:59:15 +1000 Subject: [PATCH 114/365] memcg: enable accounting for pollfd and select bits arrays User can call select/poll system calls with a large number of assigned file descriptors and force kernel to allocate up to several pages of memory till end of these sleeping system calls. We have here long-living unaccounted per-task allocations. It makes sense to account for these allocations to restrict the host's memory consumption from inside the memcg-limited container. Link: https://lkml.kernel.org/r/56e31cb5-6e1e-bdba-d7ca-be64b9842363@virtuozzo.com Signed-off-by: Vasily Averin Reviewed-by: Shakeel Butt Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrei Vagin Cc: Borislav Petkov Cc: Borislav Petkov Cc: Christian Brauner Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Jens Axboe Cc: Jiri Slaby Cc: Johannes Weiner Cc: Kirill Tkhai Cc: Michal Hocko Cc: Oleg Nesterov Cc: Roman Gushchin Cc: Serge Hallyn Cc: Tejun Heo Cc: Thomas Gleixner Cc: Vladimir Davydov Cc: Yutian Yang Cc: Zefan Li Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/select.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/select.c b/fs/select.c index 945896d0ac9e7..e83e563a351db 100644 --- a/fs/select.c +++ b/fs/select.c @@ -655,7 +655,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, goto out_nofds; alloc_size = 6 * size; - bits = kvmalloc(alloc_size, GFP_KERNEL); + bits = kvmalloc(alloc_size, GFP_KERNEL_ACCOUNT); if (!bits) goto out_nofds; } @@ -1000,7 +1000,7 @@ static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, len = min(todo, POLLFD_PER_PAGE); walk = walk->next = kmalloc(struct_size(walk, entries, len), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!walk) { err = -ENOMEM; goto out_fds; From 04022e1b8a391f1b5d6bb0d7cf02d92a6759fa0b Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Tue, 24 Aug 2021 09:59:15 +1000 Subject: [PATCH 115/365] memcg: enable accounting for file lock caches User can create file locks for each open file and force kernel to allocate small but long-living objects per each open file. It makes sense to account for these objects to limit the host's memory consumption from inside the memcg-limited container. Link: https://lkml.kernel.org/r/b009f4c7-f0ab-c0ec-8e83-918f47d677da@virtuozzo.com Signed-off-by: Vasily Averin Reviewed-by: Shakeel Butt Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrei Vagin Cc: Borislav Petkov Cc: Borislav Petkov Cc: Christian Brauner Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Jens Axboe Cc: Jiri Slaby Cc: Johannes Weiner Cc: Kirill Tkhai Cc: Michal Hocko Cc: Oleg Nesterov Cc: Roman Gushchin Cc: Serge Hallyn Cc: Tejun Heo Cc: Thomas Gleixner Cc: Vladimir Davydov Cc: Yutian Yang Cc: Zefan Li Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/locks.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/locks.c b/fs/locks.c index 74b2a1dfe8d87..1bc7ede75f189 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -3056,10 +3056,12 @@ static int __init filelock_init(void) int i; flctx_cache = kmem_cache_create("file_lock_ctx", - sizeof(struct file_lock_context), 0, SLAB_PANIC, NULL); + sizeof(struct file_lock_context), 0, + SLAB_PANIC | SLAB_ACCOUNT, NULL); filelock_cache = kmem_cache_create("file_lock_cache", - sizeof(struct file_lock), 0, SLAB_PANIC, NULL); + sizeof(struct file_lock), 0, + SLAB_PANIC | SLAB_ACCOUNT, NULL); for_each_possible_cpu(i) { struct file_lock_list_struct *fll = per_cpu_ptr(&file_lock_list, i); From 19c26dde17c10eb83a2dcf99667cadcfc60dd4b8 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Tue, 24 Aug 2021 09:59:15 +1000 Subject: [PATCH 116/365] memcg: enable accounting for fasync_cache fasync_struct is used by almost all character device drivers to set up the fasync queue, and for regular files by the file lease code. This structure is quite small but long-living and it can be assigned for any open file. It makes sense to account for its allocations to restrict the host's memory consumption from inside the memcg-limited container. Link: https://lkml.kernel.org/r/1b408625-d71c-0b26-b0b6-9baf00f93e69@virtuozzo.com Signed-off-by: Vasily Averin Reviewed-by: Shakeel Butt Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrei Vagin Cc: Borislav Petkov Cc: Borislav Petkov Cc: Christian Brauner Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Jens Axboe Cc: Jiri Slaby Cc: Johannes Weiner Cc: Kirill Tkhai Cc: Michal Hocko Cc: Oleg Nesterov Cc: Roman Gushchin Cc: Serge Hallyn Cc: Tejun Heo Cc: Thomas Gleixner Cc: Vladimir Davydov Cc: Yutian Yang Cc: Zefan Li Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/fcntl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/fcntl.c b/fs/fcntl.c index f946bec8f1f1b..714e7c9a5fc44 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -1049,7 +1049,8 @@ static int __init fcntl_init(void) __FMODE_EXEC | __FMODE_NONOTIFY)); fasync_cache = kmem_cache_create("fasync_cache", - sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL); + sizeof(struct fasync_struct), 0, + SLAB_PANIC | SLAB_ACCOUNT, NULL); return 0; } From ed05413c0fb207116456419e4f40444999a9bcc0 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Tue, 24 Aug 2021 09:59:16 +1000 Subject: [PATCH 117/365] memcg: enable accounting for new namesapces and struct nsproxy Container admin can create new namespaces and force kernel to allocate up to several pages of memory for the namespaces and its associated structures. Net and uts namespaces have enabled accounting for such allocations. It makes sense to account for rest ones to restrict the host's memory consumption from inside the memcg-limited container. Link: https://lkml.kernel.org/r/5525bcbf-533e-da27-79b7-158686c64e13@virtuozzo.com Signed-off-by: Vasily Averin Acked-by: Serge Hallyn Acked-by: Christian Brauner Acked-by: Kirill Tkhai Reviewed-by: Shakeel Butt Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrei Vagin Cc: Borislav Petkov Cc: Borislav Petkov Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Jens Axboe Cc: Jiri Slaby Cc: Johannes Weiner Cc: Michal Hocko Cc: Oleg Nesterov Cc: Roman Gushchin Cc: Tejun Heo Cc: Thomas Gleixner Cc: Vladimir Davydov Cc: Yutian Yang Cc: Zefan Li Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/namespace.c | 2 +- ipc/namespace.c | 2 +- kernel/cgroup/namespace.c | 2 +- kernel/nsproxy.c | 2 +- kernel/pid_namespace.c | 2 +- kernel/time/namespace.c | 4 ++-- kernel/user_namespace.c | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index e51b63ae233b5..94a9817851cc4 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3307,7 +3307,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a if (!ucounts) return ERR_PTR(-ENOSPC); - new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL); + new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL_ACCOUNT); if (!new_ns) { dec_mnt_namespaces(ucounts); return ERR_PTR(-ENOMEM); diff --git a/ipc/namespace.c b/ipc/namespace.c index 7bd0766ddc3b1..ae83f0f2651b7 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -42,7 +42,7 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, goto fail; err = -ENOMEM; - ns = kzalloc(sizeof(struct ipc_namespace), GFP_KERNEL); + ns = kzalloc(sizeof(struct ipc_namespace), GFP_KERNEL_ACCOUNT); if (ns == NULL) goto fail_dec; diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index f5e8828c109cf..0d5c29879a50b 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -24,7 +24,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void) struct cgroup_namespace *new_ns; int ret; - new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL); + new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT); if (!new_ns) return ERR_PTR(-ENOMEM); ret = ns_alloc_inum(&new_ns->ns); diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index abc01fcad8c7e..eec72ca962e24 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -568,6 +568,6 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags) int __init nsproxy_cache_init(void) { - nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); + nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC|SLAB_ACCOUNT); return 0; } diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index cb5a25a8a0cc7..a46a3723bc662 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -450,7 +450,7 @@ const struct proc_ns_operations pidns_for_children_operations = { static __init int pid_namespaces_init(void) { - pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); + pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC | SLAB_ACCOUNT); #ifdef CONFIG_CHECKPOINT_RESTORE register_sysctl_paths(kern_path, pid_ns_ctl_table); diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 12eab0d2ae28d..aec832801c26c 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -88,13 +88,13 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, goto fail; err = -ENOMEM; - ns = kmalloc(sizeof(*ns), GFP_KERNEL); + ns = kmalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT); if (!ns) goto fail_dec; refcount_set(&ns->ns.count, 1); - ns->vvar_page = alloc_page(GFP_KERNEL | __GFP_ZERO); + ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!ns->vvar_page) goto fail_free; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index ef82d401dde83..6b2e3ca7ee993 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -1385,7 +1385,7 @@ const struct proc_ns_operations userns_operations = { static __init int user_namespaces_init(void) { - user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); + user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC | SLAB_ACCOUNT); return 0; } subsys_initcall(user_namespaces_init); From 21f99dd2069340685772027da36fd023da66a8d2 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Tue, 24 Aug 2021 09:59:16 +1000 Subject: [PATCH 118/365] memcg: enable accounting of ipc resources When user creates IPC objects it forces kernel to allocate memory for these long-living objects. It makes sense to account them to restrict the host's memory consumption from inside the memcg-limited container. This patch enables accounting for IPC shared memory segments, messages semaphores and semaphore's undo lists. Link: https://lkml.kernel.org/r/d6507b06-4df6-78f8-6c54-3ae86e3b5339@virtuozzo.com Signed-off-by: Vasily Averin Reviewed-by: Shakeel Butt Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrei Vagin Cc: Borislav Petkov Cc: Borislav Petkov Cc: Christian Brauner Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Jens Axboe Cc: Jiri Slaby Cc: Johannes Weiner Cc: Kirill Tkhai Cc: Michal Hocko Cc: Oleg Nesterov Cc: Roman Gushchin Cc: Serge Hallyn Cc: Tejun Heo Cc: Thomas Gleixner Cc: Vladimir Davydov Cc: Yutian Yang Cc: Zefan Li Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- ipc/msg.c | 2 +- ipc/sem.c | 9 +++++---- ipc/shm.c | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/ipc/msg.c b/ipc/msg.c index 6810276d6bb98..a0d05775af2c5 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -147,7 +147,7 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params) key_t key = params->key; int msgflg = params->flg; - msq = kmalloc(sizeof(*msq), GFP_KERNEL); + msq = kmalloc(sizeof(*msq), GFP_KERNEL_ACCOUNT); if (unlikely(!msq)) return -ENOMEM; diff --git a/ipc/sem.c b/ipc/sem.c index 971e75d28364e..1a8b9f0ac047b 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -514,7 +514,7 @@ static struct sem_array *sem_alloc(size_t nsems) if (nsems > (INT_MAX - sizeof(*sma)) / sizeof(sma->sems[0])) return NULL; - sma = kvzalloc(struct_size(sma, sems, nsems), GFP_KERNEL); + sma = kvzalloc(struct_size(sma, sems, nsems), GFP_KERNEL_ACCOUNT); if (unlikely(!sma)) return NULL; @@ -1855,7 +1855,7 @@ static inline int get_undo_list(struct sem_undo_list **undo_listp) undo_list = current->sysvsem.undo_list; if (!undo_list) { - undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL); + undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL_ACCOUNT); if (undo_list == NULL) return -ENOMEM; spin_lock_init(&undo_list->lock); @@ -1941,7 +1941,7 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid) /* step 2: allocate new undo structure */ new = kvzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!new) { ipc_rcu_putref(&sma->sem_perm, sem_rcu_free); return ERR_PTR(-ENOMEM); @@ -2005,7 +2005,8 @@ static long do_semtimedop(int semid, struct sembuf __user *tsops, if (nsops > ns->sc_semopm) return -E2BIG; if (nsops > SEMOPM_FAST) { - sops = kvmalloc_array(nsops, sizeof(*sops), GFP_KERNEL); + sops = kvmalloc_array(nsops, sizeof(*sops), + GFP_KERNEL_ACCOUNT); if (sops == NULL) return -ENOMEM; } diff --git a/ipc/shm.c b/ipc/shm.c index 748933e376cad..ab749be6d8b71 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -619,7 +619,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) ns->shm_tot + numpages > ns->shm_ctlall) return -ENOSPC; - shp = kmalloc(sizeof(*shp), GFP_KERNEL); + shp = kmalloc(sizeof(*shp), GFP_KERNEL_ACCOUNT); if (unlikely(!shp)) return -ENOMEM; From 8cf9fdfa4f6f6e85d7bbefb179df252fed4f3c10 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Tue, 24 Aug 2021 09:59:16 +1000 Subject: [PATCH 119/365] memcg: enable accounting for signals When a user send a signal to any another processes it forces the kernel to allocate memory for 'struct sigqueue' objects. The number of signals is limited by RLIMIT_SIGPENDING resource limit, but even the default settings allow each user to consume up to several megabytes of memory. It makes sense to account for these allocations to restrict the host's memory consumption from inside the memcg-limited container. Link: https://lkml.kernel.org/r/e34e958c-e785-712e-a62a-2c7b66c646c7@virtuozzo.com Signed-off-by: Vasily Averin Reviewed-by: Shakeel Butt Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrei Vagin Cc: Borislav Petkov Cc: Borislav Petkov Cc: Christian Brauner Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Jens Axboe Cc: Jiri Slaby Cc: Johannes Weiner Cc: Kirill Tkhai Cc: Michal Hocko Cc: Oleg Nesterov Cc: Roman Gushchin Cc: Serge Hallyn Cc: Tejun Heo Cc: Thomas Gleixner Cc: Vladimir Davydov Cc: Yutian Yang Cc: Zefan Li Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/signal.c b/kernel/signal.c index a3229add44554..8921c4a8419f1 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -4663,7 +4663,7 @@ void __init signals_init(void) { siginfo_buildtime_checks(); - sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC); + sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC | SLAB_ACCOUNT); } #ifdef CONFIG_KGDB_KDB From 737c9efae0fdf84e26377c67a808332093939e86 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Tue, 24 Aug 2021 09:59:16 +1000 Subject: [PATCH 120/365] memcg: enable accounting for posix_timers_cache slab A program may create multiple interval timers using timer_create(). For each timer the kernel preallocates a "queued real-time signal", Consequently, the number of timers is limited by the RLIMIT_SIGPENDING resource limit. The allocated object is quite small, ~250 bytes, but even the default signal limits allow to consume up to 100 megabytes per user. It makes sense to account for them to limit the host's memory consumption from inside the memcg-limited container. Link: https://lkml.kernel.org/r/57795560-025c-267c-6b1a-dea852d95530@virtuozzo.com Signed-off-by: Vasily Averin Reviewed-by: Thomas Gleixner Reviewed-by: Shakeel Butt Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrei Vagin Cc: Borislav Petkov Cc: Borislav Petkov Cc: Christian Brauner Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Jens Axboe Cc: Jiri Slaby Cc: Johannes Weiner Cc: Kirill Tkhai Cc: Michal Hocko Cc: Oleg Nesterov Cc: Roman Gushchin Cc: Serge Hallyn Cc: Tejun Heo Cc: Vladimir Davydov Cc: Yutian Yang Cc: Zefan Li Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- kernel/time/posix-timers.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index dd5697d7347b1..7363f81dc31ad 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -273,8 +273,8 @@ static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp) static __init int init_posix_timers(void) { posix_timers_cache = kmem_cache_create("posix_timers_cache", - sizeof (struct k_itimer), 0, SLAB_PANIC, - NULL); + sizeof(struct k_itimer), 0, + SLAB_PANIC | SLAB_ACCOUNT, NULL); return 0; } __initcall(init_posix_timers); From 5c7b6c912b44154771b3719feb1f718ff084f038 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Tue, 24 Aug 2021 09:59:16 +1000 Subject: [PATCH 121/365] memcg: enable accounting for ldt_struct objects Each task can request own LDT and force the kernel to allocate up to 64Kb memory per-mm. There are legitimate workloads with hundreds of processes and there can be hundreds of workloads running on large machines. The unaccounted memory can cause isolation issues between the workloads particularly on highly utilized machines. It makes sense to account for this objects to restrict the host's memory consumption from inside the memcg-limited container. Link: https://lkml.kernel.org/r/38010594-50fe-c06d-7cb0-d1f77ca422f3@virtuozzo.com Signed-off-by: Vasily Averin Acked-by: Borislav Petkov Reviewed-by: Shakeel Butt Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrei Vagin Cc: Borislav Petkov Cc: Christian Brauner Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Jens Axboe Cc: Jiri Slaby Cc: Johannes Weiner Cc: Kirill Tkhai Cc: Michal Hocko Cc: Oleg Nesterov Cc: Roman Gushchin Cc: Serge Hallyn Cc: Tejun Heo Cc: Thomas Gleixner Cc: Vladimir Davydov Cc: Yutian Yang Cc: Zefan Li Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/x86/kernel/ldt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index aa15132228da5..525876e7b9f4a 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -154,7 +154,7 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries) if (num_entries > LDT_ENTRIES) return NULL; - new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL); + new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL_ACCOUNT); if (!new_ldt) return NULL; @@ -168,9 +168,9 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries) * than PAGE_SIZE. */ if (alloc_size > PAGE_SIZE) - new_ldt->entries = vzalloc(alloc_size); + new_ldt->entries = __vmalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); else - new_ldt->entries = (void *)get_zeroed_page(GFP_KERNEL); + new_ldt->entries = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!new_ldt->entries) { kfree(new_ldt); From 9edc42a386cc6d763f4ace85cfca0829bc339fb8 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Tue, 24 Aug 2021 09:59:17 +1000 Subject: [PATCH 122/365] memcg: cleanup racy sum avoidance code We used to have per-cpu memcg and lruvec stats and the readers have to traverse and sum the stats from each cpu. This summing was racy and may expose transient negative values. So, an explicit check was added to avoid such scenarios. Now these stats are moved to rstat infrastructure and are no more per-cpu, so we can remove the fixup for transient negative values. Link: https://lkml.kernel.org/r/20210728012243.3369123-1-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Roman Gushchin Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/memcontrol.h | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index c4c5b652d3afa..2ee20509ac719 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -977,30 +977,19 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg, static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { - long x = READ_ONCE(memcg->vmstats.state[idx]); -#ifdef CONFIG_SMP - if (x < 0) - x = 0; -#endif - return x; + return READ_ONCE(memcg->vmstats.state[idx]); } static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) { struct mem_cgroup_per_node *pn; - long x; if (mem_cgroup_disabled()) return node_page_state(lruvec_pgdat(lruvec), idx); pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - x = READ_ONCE(pn->lruvec_stats.state[idx]); -#ifdef CONFIG_SMP - if (x < 0) - x = 0; -#endif - return x; + return READ_ONCE(pn->lruvec_stats.state[idx]); } static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, From db82343a9748d3746dd1cc32c5e6ea981ec07f22 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Tue, 24 Aug 2021 09:59:17 +1000 Subject: [PATCH 123/365] memcg: replace in_interrupt() by !in_task() in active_memcg() set_active_memcg() uses in_interrupt() check to select proper storage for cgroup: pointer on task struct or per-cpu pointer. It isn't fully correct: obsoleted in_interrupt() includes tasks with disabled BH. It's better to use '!in_task()' instead. Link: https://lkml.org/lkml/2021/7/26/487 Link: https://lkml.kernel.org/r/ed4448b0-4970-616f-7368-ef9dd3cb628d@virtuozzo.com Fixes: 37d5985c003d ("mm: kmem: prepare remote memcg charging infra for interrupt contexts") Signed-off-by: Vasily Averin Reviewed-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/sched/mm.h | 2 +- mm/memcontrol.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 8894825cc4db2..5561486fddef7 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -306,7 +306,7 @@ set_active_memcg(struct mem_cgroup *memcg) { struct mem_cgroup *old; - if (in_interrupt()) { + if (!in_task()) { old = this_cpu_read(int_active_memcg); this_cpu_write(int_active_memcg, memcg); } else { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4d8c9afecf985..b09f2639f28b1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -878,7 +878,7 @@ EXPORT_SYMBOL(mem_cgroup_from_task); static __always_inline struct mem_cgroup *active_memcg(void) { - if (in_interrupt()) + if (!in_task()) return this_cpu_read(int_active_memcg); else return current->active_memcg; From 56d1becc100f6b0443cffd3692dc7fced589963d Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 24 Aug 2021 09:59:17 +1000 Subject: [PATCH 124/365] mm: memcontrol: set the correct memcg swappiness restriction Since commit c843966c556d ("mm: allow swappiness that prefers reclaiming anon over the file workingset") has expended the swappiness value to make swap to be preferred in some systems. We should also change the memcg swappiness restriction to allow memcg swap-preferred. Link: https://lkml.kernel.org/r/d77469b90c45c49953ccbc51e54a1d465bc18f70.1627626255.git.baolin.wang@linux.alibaba.com Fixes: c843966c556d ("mm: allow swappiness that prefers reclaiming anon over the file workingset") Signed-off-by: Baolin Wang Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b09f2639f28b1..80840e026c2f7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4062,7 +4062,7 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - if (val > 100) + if (val > 200) return -EINVAL; if (!mem_cgroup_is_root(memcg)) From 1db5f63c5fb1cc09cea3df8e53ab73c46639e132 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 24 Aug 2021 09:59:17 +1000 Subject: [PATCH 125/365] mm, memcg: remove unused functions Since commit 2d146aa3aa84 ("mm: memcontrol: switch to rstat"), last user of memcg_stat_item_in_bytes() is gone. And since commit fa40d1ee9f15 ("mm: vmscan: memcontrol: remove mem_cgroup_select_victim_node()"), only the declaration of mem_cgroup_select_victim_node() is remained here. Remove them. Link: https://lkml.kernel.org/r/20210807082835.61281-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Shakeel Butt Reviewed-by: Muchun Song Acked-by: Roman Gushchin Acked-by: Michal Hocko Cc: Alex Shi Cc: Matthew Wilcox (Oracle) Cc: Vladimir Davydov Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/memcontrol.h | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 2ee20509ac719..69e6c5462c43e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -593,13 +593,6 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page) } #endif -static __always_inline bool memcg_stat_item_in_bytes(int idx) -{ - if (idx == MEMCG_PERCPU_B) - return true; - return vmstat_item_in_bytes(idx); -} - static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { return (memcg == root_mem_cgroup); @@ -904,11 +897,6 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg) return !!(memcg->css.flags & CSS_ONLINE); } -/* - * For memory reclaim. - */ -int mem_cgroup_select_victim_node(struct mem_cgroup *memcg); - void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, int zid, int nr_pages); From 0e990e60cc4b9f3fb2c79a64783584733a653b66 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 24 Aug 2021 09:59:17 +1000 Subject: [PATCH 126/365] mm, memcg: save some atomic ops when flush is already true Add 'else' to save some atomic ops in obj_stock_flush_required() when flush is already true. No functional change intended here. Link: https://lkml.kernel.org/r/20210807082835.61281-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Shakeel Butt Reviewed-by: Muchun Song Acked-by: Roman Gushchin Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: Muchun Song Cc: Matthew Wilcox (Oracle) Cc: Alex Shi Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 80840e026c2f7..894a24d5ec559 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2246,7 +2246,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) if (memcg && stock->nr_pages && mem_cgroup_is_descendant(memcg, root_memcg)) flush = true; - if (obj_stock_flush_required(stock, root_memcg)) + else if (obj_stock_flush_required(stock, root_memcg)) flush = true; rcu_read_unlock(); From 280b08005af54c262abf5bbb0f9603a0302bc2f8 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 24 Aug 2021 09:59:18 +1000 Subject: [PATCH 127/365] memcg: fix up drain_local_stock comment Thomas and Vlastimil have noticed that the comment in drain_local_stock doesn't quite make sense. It talks about a synchronization with the memory hotplug but there is no actual memory hotplug involvement here. I meant to talk about cpu hotplug here. Fix that up and hopefuly make the comment more helpful by referencing the cpu hotplug callback as well. Link: https://lkml.kernel.org/r/YRDwOhVglJmY7ES5@dhcp22.suse.cz Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memcontrol.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 894a24d5ec559..81fa83133c2c4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2178,8 +2178,9 @@ static void drain_local_stock(struct work_struct *dummy) unsigned long flags; /* - * The only protection from memory hotplug vs. drain_stock races is - * that we always operate on local CPU stock here with IRQ disabled + * The only protection from cpu hotplug (memcg_hotplug_cpu_dead) vs. + * drain_stock races is that we always operate on local CPU stock + * here with IRQ disabled */ local_irq_save(flags); From 3d32d8de91ff84c836f44ed2be33854b53811e6d Mon Sep 17 00:00:00 2001 From: Po-Hsu Lin Date: Tue, 24 Aug 2021 09:59:18 +1000 Subject: [PATCH 128/365] selftests/vm: use kselftest skip code for skipped tests There are several test cases in the vm directory are still using exit 0 when they need to be skipped. Use the kselftest framework to skip code instead so it can help us to distinguish the return status. Criterion to filter out what should be fixed in vm directory: grep -r "exit 0" -B1 | grep -i skip This change might cause some false-positives if people are running these test scripts directly and only checking their return codes, which will change from 0 to 4. However I think the impact should be small as most of our scripts here are already using this skip code. And there will be no such issue if running them with the kselftest framework. Link: https://lkml.kernel.org/r/20210823073433.37653-1-po-hsu.lin@canonical.com Signed-off-by: Po-Hsu Lin Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- tools/testing/selftests/vm/charge_reserved_hugetlb.sh | 5 ++++- tools/testing/selftests/vm/hugetlb_reparenting_test.sh | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/vm/charge_reserved_hugetlb.sh b/tools/testing/selftests/vm/charge_reserved_hugetlb.sh index 18d33684faade..fe8fcfb334e06 100644 --- a/tools/testing/selftests/vm/charge_reserved_hugetlb.sh +++ b/tools/testing/selftests/vm/charge_reserved_hugetlb.sh @@ -1,11 +1,14 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0 +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + set -e if [[ $(id -u) -ne 0 ]]; then echo "This test must be run as root. Skipping..." - exit 0 + exit $ksft_skip fi fault_limit_file=limit_in_bytes diff --git a/tools/testing/selftests/vm/hugetlb_reparenting_test.sh b/tools/testing/selftests/vm/hugetlb_reparenting_test.sh index d11d1febccc3b..4a9a3afe9fd4d 100644 --- a/tools/testing/selftests/vm/hugetlb_reparenting_test.sh +++ b/tools/testing/selftests/vm/hugetlb_reparenting_test.sh @@ -1,11 +1,14 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + set -e if [[ $(id -u) -ne 0 ]]; then echo "This test must be run as root. Skipping..." - exit 0 + exit $ksft_skip fi usage_file=usage_in_bytes From 0e0cf30edccbf9cdb926f7ea8a60665d65e7f587 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 24 Aug 2021 09:59:18 +1000 Subject: [PATCH 129/365] lazy tlb: introduce lazy mm refcount helper functions Patch series "shoot lazy tlbs", v4. On a 16-socket 192-core POWER8 system, a context switching benchmark with as many software threads as CPUs (so each switch will go in and out of idle), upstream can achieve a rate of about 1 million context switches per second. After this series it goes up to 118 million. This patch (of 4): Add explicit _lazy_tlb annotated functions for lazy mm refcounting. This makes lazy mm references more obvious, and allows explicit refcounting to be removed if it is not used. If a kernel thread's current lazy tlb mm happens to be the one it wants to use, then kthread_use_mm() cleverly transfers the mm refcount from the lazy tlb mm reference to the returned reference. If the lazy tlb mm reference is no longer identical to a normal reference, this trick does not work, so that is changed to be explicit about the two references. Link: https://lkml.kernel.org/r/20210605014216.446867-1-npiggin@gmail.com Link: https://lkml.kernel.org/r/20210605014216.446867-2-npiggin@gmail.com Signed-off-by: Nicholas Piggin Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Andy Lutomirski Cc: Anton Blanchard Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/arm/mach-rpc/ecard.c | 2 +- arch/powerpc/kernel/smp.c | 2 +- arch/powerpc/mm/book3s64/radix_tlb.c | 4 ++-- fs/exec.c | 4 ++-- include/linux/sched/mm.h | 11 +++++++++++ kernel/cpu.c | 2 +- kernel/exit.c | 2 +- kernel/kthread.c | 11 +++++++---- kernel/sched/core.c | 15 ++++++++------- 9 files changed, 34 insertions(+), 19 deletions(-) diff --git a/arch/arm/mach-rpc/ecard.c b/arch/arm/mach-rpc/ecard.c index 827b50f1c73e6..1b4a41aad7932 100644 --- a/arch/arm/mach-rpc/ecard.c +++ b/arch/arm/mach-rpc/ecard.c @@ -253,7 +253,7 @@ static int ecard_init_mm(void) current->mm = mm; current->active_mm = mm; activate_mm(active_mm, mm); - mmdrop(active_mm); + mmdrop_lazy_tlb(active_mm); ecard_init_pgtables(mm); return 0; } diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 447b78a87c8f2..a492170b5ba0b 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1556,7 +1556,7 @@ void start_secondary(void *unused) if (IS_ENABLED(CONFIG_PPC32)) setup_kup(); - mmgrab(&init_mm); + mmgrab_lazy_tlb(&init_mm); current->active_mm = &init_mm; smp_store_cpu_info(cpu); diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index aefc100d79a7c..2710a61d7ef20 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -785,10 +785,10 @@ void exit_lazy_flush_tlb(struct mm_struct *mm, bool always_flush) if (current->active_mm == mm) { WARN_ON_ONCE(current->mm != NULL); /* Is a kernel thread and is using mm as the lazy tlb */ - mmgrab(&init_mm); + mmgrab_lazy_tlb(&init_mm); current->active_mm = &init_mm; switch_mm_irqs_off(mm, &init_mm, current); - mmdrop(mm); + mmdrop_lazy_tlb(mm); } /* diff --git a/fs/exec.c b/fs/exec.c index 38f63451b9282..17ddaad5462fe 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1026,9 +1026,9 @@ static int exec_mmap(struct mm_struct *mm) setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm); mm_update_next_owner(old_mm); mmput(old_mm); - return 0; + } else { + mmdrop_lazy_tlb(active_mm); } - mmdrop(active_mm); return 0; } diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 5561486fddef7..f7a0b347fecbd 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -49,6 +49,17 @@ static inline void mmdrop(struct mm_struct *mm) __mmdrop(mm); } +/* Helpers for lazy TLB mm refcounting */ +static inline void mmgrab_lazy_tlb(struct mm_struct *mm) +{ + mmgrab(mm); +} + +static inline void mmdrop_lazy_tlb(struct mm_struct *mm) +{ + mmdrop(mm); +} + /** * mmget() - Pin the address space associated with a &struct mm_struct. * @mm: The address space to pin. diff --git a/kernel/cpu.c b/kernel/cpu.c index 804b847912dc0..79882ce1f2b53 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -603,7 +603,7 @@ static int finish_cpu(unsigned int cpu) */ if (mm != &init_mm) idle->active_mm = &init_mm; - mmdrop(mm); + mmdrop_lazy_tlb(mm); return 0; } diff --git a/kernel/exit.c b/kernel/exit.c index 9a89e7f36acb4..3e9ec041a4e59 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -475,7 +475,7 @@ static void exit_mm(void) __set_current_state(TASK_RUNNING); mmap_read_lock(mm); } - mmgrab(mm); + mmgrab_lazy_tlb(mm); BUG_ON(mm != current->active_mm); /* more a memory barrier than a real lock */ task_lock(current); diff --git a/kernel/kthread.c b/kernel/kthread.c index 5b37a8567168b..e82a17863b098 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1350,14 +1350,14 @@ void kthread_use_mm(struct mm_struct *mm) WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(tsk->mm); + mmgrab(mm); + task_lock(tsk); /* Hold off tlb flush IPIs while switching mm's */ local_irq_disable(); active_mm = tsk->active_mm; - if (active_mm != mm) { - mmgrab(mm); + if (active_mm != mm) tsk->active_mm = mm; - } tsk->mm = mm; membarrier_update_current_mm(mm); switch_mm_irqs_off(active_mm, mm, tsk); @@ -1377,7 +1377,7 @@ void kthread_use_mm(struct mm_struct *mm) * mmdrop(), or explicitly with smp_mb(). */ if (active_mm != mm) - mmdrop(active_mm); + mmdrop_lazy_tlb(active_mm); else smp_mb(); @@ -1411,10 +1411,13 @@ void kthread_unuse_mm(struct mm_struct *mm) local_irq_disable(); tsk->mm = NULL; membarrier_update_current_mm(NULL); + mmgrab_lazy_tlb(mm); /* active_mm is still 'mm' */ enter_lazy_tlb(mm, tsk); local_irq_enable(); task_unlock(tsk); + + mmdrop(mm); } EXPORT_SYMBOL_GPL(kthread_unuse_mm); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 20ffcc0441344..030348d3e6d29 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4572,13 +4572,14 @@ static struct rq *finish_task_switch(struct task_struct *prev) * rq->curr, before returning to userspace, so provide them here: * * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly - * provided by mmdrop(), + * provided by mmdrop_lazy_tlb(), * - a sync_core for SYNC_CORE. */ if (mm) { membarrier_mm_sync_core_before_usermode(mm); - mmdrop(mm); + mmdrop_lazy_tlb(mm); } + if (unlikely(prev_state == TASK_DEAD)) { if (prev->sched_class->task_dead) prev->sched_class->task_dead(prev); @@ -4641,9 +4642,9 @@ context_switch(struct rq *rq, struct task_struct *prev, /* * kernel -> kernel lazy + transfer active - * user -> kernel lazy + mmgrab() active + * user -> kernel lazy + mmgrab_lazy_tlb() active * - * kernel -> user switch + mmdrop() active + * kernel -> user switch + mmdrop_lazy_tlb() active * user -> user switch */ if (!next->mm) { // to kernel @@ -4651,7 +4652,7 @@ context_switch(struct rq *rq, struct task_struct *prev, next->active_mm = prev->active_mm; if (prev->mm) // from user - mmgrab(prev->active_mm); + mmgrab_lazy_tlb(prev->active_mm); else prev->active_mm = NULL; } else { // to user @@ -4667,7 +4668,7 @@ context_switch(struct rq *rq, struct task_struct *prev, switch_mm_irqs_off(prev->active_mm, next->mm, next); if (!prev->mm) { // from kernel - /* will mmdrop() in finish_task_switch(). */ + /* will mmdrop_lazy_tlb() in finish_task_switch(). */ rq->prev_mm = prev->active_mm; prev->active_mm = NULL; } @@ -9037,7 +9038,7 @@ void __init sched_init(void) /* * The boot idle thread does lazy MMU switching as well: */ - mmgrab(&init_mm); + mmgrab_lazy_tlb(&init_mm); enter_lazy_tlb(&init_mm, current); /* From bfb4fd128488fb453844f6802259876ad9595d89 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 24 Aug 2021 09:59:18 +1000 Subject: [PATCH 130/365] lazy-tlb-introduce-lazy-mm-refcount-helper-functions-fix Fix a refcounting bug in kthread_use_mm (the mm reference is increased unconditionally now, but the lazy tlb refcount is still only dropped only if mm != active_mm). Link: https://lkml.kernel.org/r/1623125298.bx63h3mopj.astroid@bobo.none Signed-off-by: Nicholas Piggin Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- kernel/kthread.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/kernel/kthread.c b/kernel/kthread.c index e82a17863b098..83ed75d531b4b 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1350,6 +1350,11 @@ void kthread_use_mm(struct mm_struct *mm) WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(tsk->mm); + /* + * It's possible that tsk->active_mm == mm here, but we must + * still mmgrab(mm) and mmdrop_lazy_tlb(active_mm), because lazy + * mm may not have its own refcount (see mmgrab/drop_lazy_tlb()). + */ mmgrab(mm); task_lock(tsk); @@ -1374,12 +1379,9 @@ void kthread_use_mm(struct mm_struct *mm) * memory barrier after storing to tsk->mm, before accessing * user-space memory. A full memory barrier for membarrier * {PRIVATE,GLOBAL}_EXPEDITED is implicitly provided by - * mmdrop(), or explicitly with smp_mb(). + * mmdrop_lazy_tlb(). */ - if (active_mm != mm) - mmdrop_lazy_tlb(active_mm); - else - smp_mb(); + mmdrop_lazy_tlb(active_mm); to_kthread(tsk)->oldfs = force_uaccess_begin(); } From ccb3587b3ce93b2b959dbace94200d02ae260a5e Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 24 Aug 2021 09:59:18 +1000 Subject: [PATCH 131/365] lazy tlb: allow lazy tlb mm refcounting to be configurable Add CONFIG_MMU_TLB_REFCOUNT which enables refcounting of the lazy tlb mm when it is context switched. This can be disabled by architectures that don't require this refcounting if they clean up lazy tlb mms when the last refcount is dropped. Currently this is always enabled, which is what existing code does, so the patch is effectively a no-op. Rename rq->prev_mm to rq->prev_lazy_mm, because that's what it is. Link: https://lkml.kernel.org/r/20210605014216.446867-3-npiggin@gmail.com Signed-off-by: Nicholas Piggin Cc: Andy Lutomirski Cc: Anton Blanchard Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/Kconfig | 4 ++++ include/linux/sched/mm.h | 13 +++++++++++-- kernel/sched/core.c | 22 ++++++++++++++++++---- kernel/sched/sched.h | 4 +++- 4 files changed, 36 insertions(+), 7 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 129df498a8e12..9dddd38d2bf41 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -425,6 +425,10 @@ config ARCH_WANT_IRQS_OFF_ACTIVATE_MM irqs disabled over activate_mm. Architectures that do IPI based TLB shootdowns should enable this. +# Use normal mm refcounting for MMU_LAZY_TLB kernel thread references. +config MMU_LAZY_TLB_REFCOUNT + def_bool y + config ARCH_HAVE_NMI_SAFE_CMPXCHG bool diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index f7a0b347fecbd..939dee9f5bc83 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -52,12 +52,21 @@ static inline void mmdrop(struct mm_struct *mm) /* Helpers for lazy TLB mm refcounting */ static inline void mmgrab_lazy_tlb(struct mm_struct *mm) { - mmgrab(mm); + if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) + mmgrab(mm); } static inline void mmdrop_lazy_tlb(struct mm_struct *mm) { - mmdrop(mm); + if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) { + mmdrop(mm); + } else { + /* + * mmdrop_lazy_tlb must provide a full memory barrier, see the + * membarrier comment finish_task_switch which relies on this. + */ + smp_mb(); + } } /** diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 030348d3e6d29..df4002813f023 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4513,7 +4513,7 @@ static struct rq *finish_task_switch(struct task_struct *prev) __releases(rq->lock) { struct rq *rq = this_rq(); - struct mm_struct *mm = rq->prev_mm; + struct mm_struct *mm = NULL; long prev_state; /* @@ -4532,7 +4532,10 @@ static struct rq *finish_task_switch(struct task_struct *prev) current->comm, current->pid, preempt_count())) preempt_count_set(FORK_PREEMPT_COUNT); - rq->prev_mm = NULL; +#ifdef CONFIG_MMU_LAZY_TLB_REFCOUNT + mm = rq->prev_lazy_mm; + rq->prev_lazy_mm = NULL; +#endif /* * A task struct has one reference for the use as "current". @@ -4668,9 +4671,20 @@ context_switch(struct rq *rq, struct task_struct *prev, switch_mm_irqs_off(prev->active_mm, next->mm, next); if (!prev->mm) { // from kernel - /* will mmdrop_lazy_tlb() in finish_task_switch(). */ - rq->prev_mm = prev->active_mm; +#ifdef CONFIG_MMU_LAZY_TLB_REFCOUNT + /* Will mmdrop_lazy_tlb() in finish_task_switch(). */ + rq->prev_lazy_mm = prev->active_mm; prev->active_mm = NULL; +#else + /* + * Without MMU_LAZY_TLB_REFCOUNT there is no lazy + * tracking (because no rq->prev_lazy_mm) in + * finish_task_switch, so no mmdrop_lazy_tlb(), so no + * memory barrier for membarrier (see the membarrier + * comment in finish_task_switch()). Do it here. + */ + smp_mb(); +#endif } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 14a41a243f7ba..1a13bb32eba2c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -967,7 +967,9 @@ struct rq { struct task_struct *idle; struct task_struct *stop; unsigned long next_balance; - struct mm_struct *prev_mm; +#ifdef CONFIG_MMU_LAZY_TLB_REFCOUNT + struct mm_struct *prev_lazy_mm; +#endif unsigned int clock_update_flags; u64 clock; From b5b6794d485c12e424efdba25a70b9da0d01b7d1 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 24 Aug 2021 09:59:19 +1000 Subject: [PATCH 132/365] lazy-tlb-allow-lazy-tlb-mm-refcounting-to-be-configurable-fix fix comment Cc: Nicholas Piggin Cc: Andy Lutomirski Cc: Anton Blanchard Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/sched/mm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 939dee9f5bc83..fd6e4d14f477d 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -63,7 +63,8 @@ static inline void mmdrop_lazy_tlb(struct mm_struct *mm) } else { /* * mmdrop_lazy_tlb must provide a full memory barrier, see the - * membarrier comment finish_task_switch which relies on this. + * membarrier comment in finish_task_switch which relies on + * this. */ smp_mb(); } From 2b4ab2e7181a6be0a1348d4d9a57adec830a1723 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 24 Aug 2021 09:59:19 +1000 Subject: [PATCH 133/365] lazy-tlb-allow-lazy-tlb-mm-refcounting-to-be-configurable-fix-2 Explain the requirements for lazy tlb mm refcounting in the comment, to help with archs that may want to disable this by some means other than MMU_LAZY_TLB_SHOOTDOWN. Link: https://lkml.kernel.org/r/1623121605.j47gdpccep.astroid@bobo.none Signed-off-by: Nicholas Piggin Cc: Anton Blanchard Cc: Andy Lutomirski Cc: Randy Dunlap Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/Kconfig | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/Kconfig b/arch/Kconfig index 9dddd38d2bf41..16e7a3eddb663 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -426,6 +426,16 @@ config ARCH_WANT_IRQS_OFF_ACTIVATE_MM shootdowns should enable this. # Use normal mm refcounting for MMU_LAZY_TLB kernel thread references. +# MMU_LAZY_TLB_REFCOUNT=n can improve the scalability of context switching +# to/from kernel threads when the same mm is running on a lot of CPUs (a large +# multi-threaded application), by reducing contention on the mm refcount. +# +# This can be disabled if the architecture ensures no CPUs are using an mm as a +# "lazy tlb" beyond its final refcount (i.e., by the time __mmdrop frees the mm +# or its kernel page tables). This could be arranged by arch_exit_mmap(), or +# final exit(2) TLB flush, for example. arch code must also ensure the +# _lazy_tlb variants of mmgrab/mmdrop are used when dropping the lazy reference +# to a kthread ->active_mm (non-arch code has been converted already). config MMU_LAZY_TLB_REFCOUNT def_bool y From f88cd7cbf28955aabf5a4b131c02998cf9a232cd Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 24 Aug 2021 09:59:19 +1000 Subject: [PATCH 134/365] lazy tlb: shoot lazies, a non-refcounting lazy tlb option On big systems, the mm refcount can become highly contented when doing a lot of context switching with threaded applications (particularly switching between the idle thread and an application thread). Abandoning lazy tlb slows switching down quite a bit in the important user->idle->user cases, so instead implement a non-refcounted scheme that causes __mmdrop() to IPI all CPUs in the mm_cpumask and shoot down any remaining lazy ones. Shootdown IPIs are some concern, but they have not been observed to be a big problem with this scheme (the powerpc implementation generated 314 additional interrupts on a 144 CPU system during a kernel compile). There are a number of strategies that could be employed to reduce IPIs if they turn out to be a problem for some workload. Link: https://lkml.kernel.org/r/20210605014216.446867-4-npiggin@gmail.com Signed-off-by: Nicholas Piggin Cc: Anton Blanchard Cc: Andy Lutomirski Cc: Randy Dunlap Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/Kconfig | 13 +++++++++++++ kernel/fork.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) diff --git a/arch/Kconfig b/arch/Kconfig index 16e7a3eddb663..57477e18e5a99 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -438,6 +438,19 @@ config ARCH_WANT_IRQS_OFF_ACTIVATE_MM # to a kthread ->active_mm (non-arch code has been converted already). config MMU_LAZY_TLB_REFCOUNT def_bool y + depends on !MMU_LAZY_TLB_SHOOTDOWN + +# Instead of refcounting the lazy mm struct for kernel thread references +# (which can cause contention with multi-threaded apps on large multiprocessor +# systems), this option causes __mmdrop to IPI all CPUs in the mm_cpumask and +# switch to init_mm if they were using the to-be-freed mm as the lazy tlb. To +# implement this, architectures must use _lazy_tlb variants of mm refcounting +# when releasing kernel thread mm references, and mm_cpumask must include at +# least all possible CPUs in which the mm might be lazy, at the time of the +# final mmdrop. mmgrab/mmdrop in arch/ code must be switched to _lazy_tlb +# postfix as necessary. +config MMU_LAZY_TLB_SHOOTDOWN + bool config ARCH_HAVE_NMI_SAFE_CMPXCHG bool diff --git a/kernel/fork.c b/kernel/fork.c index bc94b2cc59956..abed9591f82c0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -674,6 +674,53 @@ static void check_mm(struct mm_struct *mm) #define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) #define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) +static void do_shoot_lazy_tlb(void *arg) +{ + struct mm_struct *mm = arg; + + if (current->active_mm == mm) { + WARN_ON_ONCE(current->mm); + current->active_mm = &init_mm; + switch_mm(mm, &init_mm, current); + } +} + +static void do_check_lazy_tlb(void *arg) +{ + struct mm_struct *mm = arg; + + WARN_ON_ONCE(current->active_mm == mm); +} + +static void shoot_lazy_tlbs(struct mm_struct *mm) +{ + if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) { + /* + * IPI overheads have not found to be expensive, but they could + * be reduced in a number of possible ways, for example (in + * roughly increasing order of complexity): + * - A batch of mms requiring IPIs could be gathered and freed + * at once. + * - CPUs could store their active mm somewhere that can be + * remotely checked without a lock, to filter out + * false-positives in the cpumask. + * - After mm_users or mm_count reaches zero, switching away + * from the mm could clear mm_cpumask to reduce some IPIs + * (some batching or delaying would help). + * - A delayed freeing and RCU-like quiescing sequence based on + * mm switching to avoid IPIs completely. + */ + on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1); + if (IS_ENABLED(CONFIG_DEBUG_VM)) + on_each_cpu(do_check_lazy_tlb, (void *)mm, 1); + } else { + /* + * In this case, lazy tlb mms are refounted and would not reach + * __mmdrop until all CPUs have switched away and mmdrop()ed. + */ + } +} + /* * Called when the last reference to the mm * is dropped: either by a lazy thread or by @@ -683,6 +730,10 @@ void __mmdrop(struct mm_struct *mm) { BUG_ON(mm == &init_mm); WARN_ON_ONCE(mm == current->mm); + + /* Ensure no CPUs are using this as their lazy tlb mm */ + shoot_lazy_tlbs(mm); + WARN_ON_ONCE(mm == current->active_mm); mm_free_pgd(mm); destroy_context(mm); From e61756bf70dbcd62a7c921bb38e173e508f6efc4 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 24 Aug 2021 09:59:19 +1000 Subject: [PATCH 135/365] lazy-tlb-shoot-lazies-a-non-refcounting-lazy-tlb-option-fix Update the comment to be clearer, and account for the improvement to MMU_LAZY_TLB_REFCOUNT comment. Link: https://lkml.kernel.org/r/1623121901.mszkmmum0n.astroid@bobo.none Signed-off-by: Nicholas Piggin Cc: Anton Blanchard Cc: Andy Lutomirski Cc: Randy Dunlap Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/Kconfig | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 57477e18e5a99..1baac7bfdd441 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -440,15 +440,16 @@ config MMU_LAZY_TLB_REFCOUNT def_bool y depends on !MMU_LAZY_TLB_SHOOTDOWN -# Instead of refcounting the lazy mm struct for kernel thread references -# (which can cause contention with multi-threaded apps on large multiprocessor -# systems), this option causes __mmdrop to IPI all CPUs in the mm_cpumask and -# switch to init_mm if they were using the to-be-freed mm as the lazy tlb. To -# implement this, architectures must use _lazy_tlb variants of mm refcounting -# when releasing kernel thread mm references, and mm_cpumask must include at -# least all possible CPUs in which the mm might be lazy, at the time of the -# final mmdrop. mmgrab/mmdrop in arch/ code must be switched to _lazy_tlb -# postfix as necessary. +# This option allows MMU_LAZY_TLB_REFCOUNT=n. It ensures no CPUs are using an +# mm as a lazy tlb beyond its last reference count, by shooting down these +# users before the mm is deallocated. __mmdrop() first IPIs all CPUs that may +# be using the mm as a lazy tlb, so that they may switch themselves to using +# init_mm for their active mm. mm_cpumask(mm) is used to determine which CPUs +# may be using mm as a lazy tlb mm. +# +# To implement this, an arch must ensure mm_cpumask(mm) contains at least all +# possible CPUs in which the mm is lazy, and it must meet the requirements for +# MMU_LAZY_TLB_REFCOUNT=n (see above). config MMU_LAZY_TLB_SHOOTDOWN bool From c7779fb1e6082716650f4e66a631886a35fd1c80 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 24 Aug 2021 09:59:19 +1000 Subject: [PATCH 136/365] powerpc/64s: enable MMU_LAZY_TLB_SHOOTDOWN On a 16-socket 192-core POWER8 system, a context switching benchmark with as many software threads as CPUs (so each switch will go in and out of idle), upstream can achieve a rate of about 1 million context switches per second. After this patch it goes up to 118 million. No real datya for real world workloads unfortunately. I think it's always been a "known" cacheline, it just showed up badly on will-it-scale tests recently when Anton was doing a sweep of low hanging scalability issues on big systems. We have some very big systems running certain in-memory databases that get into very high contention conditions on mutexes that push context switch rates right up and with idle times pretty high, which would get a lot of parallel context switching between user and idle thread, we might be getting a bit of this contention there. It's not something at the top of profiles though. And on multi-threaded workloads like this, the normal refcounting of the user mm still has fundmaental contention. It's tricky to get the change tested on these workloads (machine time is very limited and I can't drive the software). I suspect it could also show in things that do high net or disk IO rates (enough to need a lot of cores), and do some user processing steps along the way. You'd potentially get a lot of idle switching. This infrastructure could be beneficial to other architectures. The cacheline is going to bounce in the same situations on other archs, so I would say yes. Rik at one stage had some patches to try avoid it for x86 some years ago, I don't know what happened to those. The way powerpc has to maintain mm_cpumask for its TLB flushing makes it relatively easy to do this shootdown, and we decided the additional IPIs were less of a concern than the bouncing. Others have different concerns, but I tried to make it generic and add comments explaining what other archs can do, or possibly different ways it might be achieved. Link: https://lkml.kernel.org/r/20210605014216.446867-5-npiggin@gmail.com Signed-off-by: Nicholas Piggin Cc: Andy Lutomirski Cc: Anton Blanchard Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/powerpc/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 663766fbf5055..2e213ec6ec05b 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -253,6 +253,7 @@ config PPC select IRQ_FORCED_THREADING select MMU_GATHER_PAGE_SIZE select MMU_GATHER_RCU_TABLE_FREE + select MMU_LAZY_TLB_SHOOTDOWN if PPC_BOOK3S_64 select MODULES_USE_ELF_RELA select NEED_DMA_MAP_STATE if PPC64 || NOT_COHERENT_CACHE select NEED_SG_DMA_LENGTH From d4e2d12443269c8df1e82c5fe868051b3bb92a0d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Aug 2021 09:59:20 +1000 Subject: [PATCH 137/365] mmc: JZ4740: remove the flush_kernel_dcache_page call in jz4740_mmc_read_data Patch series "_kernel_dcache_page fixes and removal". While looking to convert the block layer away from kmap_atomic towards kmap_local_page and prefeably the helpers that abstract it away I noticed that a few block drivers directly or implicitly call flush_kernel_dcache_page before kunmapping a page that has been written to. flush_kernel_dcache_page is documented to to be used in such cases, but flush_dcache_page is actually required when the page could be in the page cache and mapped to userspace, which is pretty much always the case when kmapping an arbitrary page. Unfortunately the documentation doesn't exactly make that clear, which lead to this misused. And it turns out that only the copy_strings / copy_string_kernel in the exec code were actually correct users of flush_kernel_dcache_page, which is why I think we should just remove it and eat the very minor overhead in exec rather than confusing poor driver writers. This patch (of 6): MIPS now implements flush_kernel_dcache_page (as an alias to flush_dcache_page). Link: https://lkml.kernel.org/r/20210712060928.4161649-1-hch@lst.de Link: https://lkml.kernel.org/r/20210712060928.4161649-2-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Linus Torvalds Cc: "James E.J. Bottomley" Cc: Russell King Cc: Guo Ren Cc: Thomas Bogendoerfer Cc: Nick Hu Cc: Greentime Hu Cc: Vincent Chen Cc: Helge Deller Cc: Yoshinori Sato Cc: Rich Felker Cc: Geoff Levand Cc: Paul Cercueil Cc: Ulf Hansson Cc: Alex Shi Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- drivers/mmc/host/jz4740_mmc.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/mmc/host/jz4740_mmc.c b/drivers/mmc/host/jz4740_mmc.c index cb1a64a5c256f..80a2c270d502e 100644 --- a/drivers/mmc/host/jz4740_mmc.c +++ b/drivers/mmc/host/jz4740_mmc.c @@ -578,10 +578,6 @@ static bool jz4740_mmc_read_data(struct jz4740_mmc_host *host, } } data->bytes_xfered += miter->length; - - /* This can go away once MIPS implements - * flush_kernel_dcache_page */ - flush_dcache_page(miter->page); } sg_miter_stop(miter); From 870408139763c561c9c2ed9eef4bbb024f4750cf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Aug 2021 09:59:20 +1000 Subject: [PATCH 138/365] mmc: mmc_spi: replace flush_kernel_dcache_page with flush_dcache_page Pages passed to block drivers can be mapped page cache pages, so we must use flush_dcache_page here instead of the more limited flush_kernel_dcache_page that is intended for highmem pages only. Link: https://lkml.kernel.org/r/20210712060928.4161649-3-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Linus Torvalds Cc: Alex Shi Cc: Geoff Levand Cc: Greentime Hu Cc: Guo Ren Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: Nick Hu Cc: Paul Cercueil Cc: Rich Felker Cc: Russell King Cc: Thomas Bogendoerfer Cc: Ulf Hansson Cc: Vincent Chen Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- drivers/mmc/host/mmc_spi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/mmc_spi.c b/drivers/mmc/host/mmc_spi.c index 65c65bb5737fc..3d28a3d3001be 100644 --- a/drivers/mmc/host/mmc_spi.c +++ b/drivers/mmc/host/mmc_spi.c @@ -948,7 +948,7 @@ mmc_spi_data_do(struct mmc_spi_host *host, struct mmc_command *cmd, /* discard mappings */ if (direction == DMA_FROM_DEVICE) - flush_kernel_dcache_page(sg_page(sg)); + flush_dcache_page(sg_page(sg)); kunmap(sg_page(sg)); if (dma_dev) dma_unmap_page(dma_dev, dma_addr, PAGE_SIZE, dir); From a942212945866e9fe5ff5ed99528035dfb809e43 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Aug 2021 09:59:20 +1000 Subject: [PATCH 139/365] scatterlist: replace flush_kernel_dcache_page with flush_dcache_page Pages used in scatterlist can be mapped page cache pages (and often are), so we must use flush_dcache_page here instead of the more limited flush_kernel_dcache_page that is intended for highmem pages only. Also remove the PageSlab check given that page_mapping_file as used by the flush_dcache_page implementations already contains that check. Link: https://lkml.kernel.org/r/20210712060928.4161649-5-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Linus Torvalds Cc: Alex Shi Cc: Geoff Levand Cc: Greentime Hu Cc: Guo Ren Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: Nick Hu Cc: Paul Cercueil Cc: Rich Felker Cc: Russell King Cc: Thomas Bogendoerfer Cc: Ulf Hansson Cc: Vincent Chen Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- lib/scatterlist.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lib/scatterlist.c b/lib/scatterlist.c index 27efa61781538..627aa84f8bbd5 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -887,9 +887,8 @@ void sg_miter_stop(struct sg_mapping_iter *miter) miter->__offset += miter->consumed; miter->__remaining -= miter->consumed; - if ((miter->__flags & SG_MITER_TO_SG) && - !PageSlab(miter->page)) - flush_kernel_dcache_page(miter->page); + if (miter->__flags & SG_MITER_TO_SG) + flush_dcache_page(miter->page); if (miter->__flags & SG_MITER_ATOMIC) { WARN_ON_ONCE(preemptible()); From e1a7557249f9ea04a7ffa98fae9ad34c4e5992ce Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Aug 2021 09:59:20 +1000 Subject: [PATCH 140/365] mm: remove flush_kernel_dcache_page flush_kernel_dcache_page is a rather confusing interface that implements a subset of flush_dcache_page by not being able to properly handle page cache mapped pages. The only callers left are in the exec code as all other previous callers were incorrect as they could have dealt with page cache pages. Replace the calls to flush_kernel_dcache_page with calls to flush_dcache_page, which for all architectures does either exactly the same thing, can contains one or more of the following: 1) an optimization to defer the cache flush for page cache pages not mapped into userspace 2) additional flushing for mapped page cache pages if cache aliases are possible Link: https://lkml.kernel.org/r/20210712060928.4161649-7-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Linus Torvalds Reviewed-by: Ira Weiny Cc: Alex Shi Cc: Geoff Levand Cc: Greentime Hu Cc: Guo Ren Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: Nick Hu Cc: Paul Cercueil Cc: Rich Felker Cc: Russell King Cc: Thomas Bogendoerfer Cc: Ulf Hansson Cc: Vincent Chen Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- Documentation/core-api/cachetlb.rst | 86 ++++++++----------- .../translations/zh_CN/core-api/cachetlb.rst | 9 -- arch/arm/include/asm/cacheflush.h | 4 +- arch/arm/mm/flush.c | 33 ------- arch/arm/mm/nommu.c | 6 -- arch/csky/abiv1/cacheflush.c | 11 --- arch/csky/abiv1/inc/abi/cacheflush.h | 4 +- arch/mips/include/asm/cacheflush.h | 8 +- arch/nds32/include/asm/cacheflush.h | 3 +- arch/nds32/mm/cacheflush.c | 9 -- arch/parisc/include/asm/cacheflush.h | 8 +- arch/parisc/kernel/cache.c | 3 +- arch/sh/include/asm/cacheflush.h | 8 +- block/blk-map.c | 2 +- fs/exec.c | 6 +- include/linux/highmem.h | 5 +- tools/testing/scatterlist/linux/mm.h | 1 - 17 files changed, 51 insertions(+), 155 deletions(-) diff --git a/Documentation/core-api/cachetlb.rst b/Documentation/core-api/cachetlb.rst index fe4290e267296..8aed9103e48a3 100644 --- a/Documentation/core-api/cachetlb.rst +++ b/Documentation/core-api/cachetlb.rst @@ -271,10 +271,15 @@ maps this page at its virtual address. ``void flush_dcache_page(struct page *page)`` - Any time the kernel writes to a page cache page, _OR_ - the kernel is about to read from a page cache page and - user space shared/writable mappings of this page potentially - exist, this routine is called. + This routines must be called when: + + a) the kernel did write to a page that is in the page cache page + and / or in high memory + b) the kernel is about to read from a page cache page and user space + shared/writable mappings of this page potentially exist. Note + that {get,pin}_user_pages{_fast} already call flush_dcache_page + on any page found in the user address space and thus driver + code rarely needs to take this into account. .. note:: @@ -284,38 +289,34 @@ maps this page at its virtual address. handling vfs symlinks in the page cache need not call this interface at all. - The phrase "kernel writes to a page cache page" means, - specifically, that the kernel executes store instructions - that dirty data in that page at the page->virtual mapping - of that page. It is important to flush here to handle - D-cache aliasing, to make sure these kernel stores are - visible to user space mappings of that page. - - The corollary case is just as important, if there are users - which have shared+writable mappings of this file, we must make - sure that kernel reads of these pages will see the most recent - stores done by the user. - - If D-cache aliasing is not an issue, this routine may - simply be defined as a nop on that architecture. - - There is a bit set aside in page->flags (PG_arch_1) as - "architecture private". The kernel guarantees that, - for pagecache pages, it will clear this bit when such - a page first enters the pagecache. - - This allows these interfaces to be implemented much more - efficiently. It allows one to "defer" (perhaps indefinitely) - the actual flush if there are currently no user processes - mapping this page. See sparc64's flush_dcache_page and - update_mmu_cache implementations for an example of how to go - about doing this. - - The idea is, first at flush_dcache_page() time, if - page->mapping->i_mmap is an empty tree, just mark the architecture - private page flag bit. Later, in update_mmu_cache(), a check is - made of this flag bit, and if set the flush is done and the flag - bit is cleared. + The phrase "kernel writes to a page cache page" means, specifically, + that the kernel executes store instructions that dirty data in that + page at the page->virtual mapping of that page. It is important to + flush here to handle D-cache aliasing, to make sure these kernel stores + are visible to user space mappings of that page. + + The corollary case is just as important, if there are users which have + shared+writable mappings of this file, we must make sure that kernel + reads of these pages will see the most recent stores done by the user. + + If D-cache aliasing is not an issue, this routine may simply be defined + as a nop on that architecture. + + There is a bit set aside in page->flags (PG_arch_1) as "architecture + private". The kernel guarantees that, for pagecache pages, it will + clear this bit when such a page first enters the pagecache. + + This allows these interfaces to be implemented much more efficiently. + It allows one to "defer" (perhaps indefinitely) the actual flush if + there are currently no user processes mapping this page. See sparc64's + flush_dcache_page and update_mmu_cache implementations for an example + of how to go about doing this. + + The idea is, first at flush_dcache_page() time, if page_file_mapping() + returns a mapping, and mapping_mapped on that mapping returns %false, + just mark the architecture private page flag bit. Later, in + update_mmu_cache(), a check is made of this flag bit, and if set the + flush is done and the flag bit is cleared. .. important:: @@ -351,19 +352,6 @@ maps this page at its virtual address. architectures). For incoherent architectures, it should flush the cache of the page at vmaddr. - ``void flush_kernel_dcache_page(struct page *page)`` - - When the kernel needs to modify a user page is has obtained - with kmap, it calls this function after all modifications are - complete (but before kunmapping it) to bring the underlying - page up to date. It is assumed here that the user has no - incoherent cached copies (i.e. the original page was obtained - from a mechanism like get_user_pages()). The default - implementation is a nop and should remain so on all coherent - architectures. On incoherent architectures, this should flush - the kernel cache for page (using page_address(page)). - - ``void flush_icache_range(unsigned long start, unsigned long end)`` When the kernel stores into addresses that it will execute diff --git a/Documentation/translations/zh_CN/core-api/cachetlb.rst b/Documentation/translations/zh_CN/core-api/cachetlb.rst index 8376485a534d1..55827b8a7c535 100644 --- a/Documentation/translations/zh_CN/core-api/cachetlb.rst +++ b/Documentation/translations/zh_CN/core-api/cachetlb.rst @@ -298,15 +298,6 @@ HyperSparc cpu就是这样一个具有这种属性的cpu。 用。默认的实现是nop(对于所有相干的架构应该保持这样)。对于不一致性 的架构,它应该刷新vmaddr处的页面缓存。 - ``void flush_kernel_dcache_page(struct page *page)`` - - 当内核需要修改一个用kmap获得的用户页时,它会在所有修改完成后(但在 - kunmapping之前)调用这个函数,以使底层页面达到最新状态。这里假定用 - 户没有不一致性的缓存副本(即原始页面是从类似get_user_pages()的机制 - 中获得的)。默认的实现是一个nop,在所有相干的架构上都应该如此。在不 - 一致性的架构上,这应该刷新内核缓存中的页面(使用page_address(page))。 - - ``void flush_icache_range(unsigned long start, unsigned long end)`` 当内核存储到它将执行的地址中时(例如在加载模块时),这个函数被调用。 diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h index 2e24e765e6d3a..5e56288e343bb 100644 --- a/arch/arm/include/asm/cacheflush.h +++ b/arch/arm/include/asm/cacheflush.h @@ -291,6 +291,7 @@ extern void flush_cache_page(struct vm_area_struct *vma, unsigned long user_addr #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 extern void flush_dcache_page(struct page *); +#define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1 static inline void flush_kernel_vmap_range(void *addr, int size) { if ((cache_is_vivt() || cache_is_vipt_aliasing())) @@ -312,9 +313,6 @@ static inline void flush_anon_page(struct vm_area_struct *vma, __flush_anon_page(vma, page, vmaddr); } -#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE -extern void flush_kernel_dcache_page(struct page *); - #define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages) #define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages) diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c index 6d89db7895d14..7ff9feea13a6a 100644 --- a/arch/arm/mm/flush.c +++ b/arch/arm/mm/flush.c @@ -345,39 +345,6 @@ void flush_dcache_page(struct page *page) } EXPORT_SYMBOL(flush_dcache_page); -/* - * Ensure cache coherency for the kernel mapping of this page. We can - * assume that the page is pinned via kmap. - * - * If the page only exists in the page cache and there are no user - * space mappings, this is a no-op since the page was already marked - * dirty at creation. Otherwise, we need to flush the dirty kernel - * cache lines directly. - */ -void flush_kernel_dcache_page(struct page *page) -{ - if (cache_is_vivt() || cache_is_vipt_aliasing()) { - struct address_space *mapping; - - mapping = page_mapping_file(page); - - if (!mapping || mapping_mapped(mapping)) { - void *addr; - - addr = page_address(page); - /* - * kmap_atomic() doesn't set the page virtual - * address for highmem pages, and - * kunmap_atomic() takes care of cache - * flushing already. - */ - if (!IS_ENABLED(CONFIG_HIGHMEM) || addr) - __cpuc_flush_dcache_area(addr, PAGE_SIZE); - } - } -} -EXPORT_SYMBOL(flush_kernel_dcache_page); - /* * Flush an anonymous page so that users of get_user_pages() * can safely access the data. The expected sequence is: diff --git a/arch/arm/mm/nommu.c b/arch/arm/mm/nommu.c index 8b3d7191e2b88..2658f52903da6 100644 --- a/arch/arm/mm/nommu.c +++ b/arch/arm/mm/nommu.c @@ -166,12 +166,6 @@ void flush_dcache_page(struct page *page) } EXPORT_SYMBOL(flush_dcache_page); -void flush_kernel_dcache_page(struct page *page) -{ - __cpuc_flush_dcache_area(page_address(page), PAGE_SIZE); -} -EXPORT_SYMBOL(flush_kernel_dcache_page); - void copy_to_user_page(struct vm_area_struct *vma, struct page *page, unsigned long uaddr, void *dst, const void *src, unsigned long len) diff --git a/arch/csky/abiv1/cacheflush.c b/arch/csky/abiv1/cacheflush.c index 07ff17ea33dee..fb91b069dc69f 100644 --- a/arch/csky/abiv1/cacheflush.c +++ b/arch/csky/abiv1/cacheflush.c @@ -56,17 +56,6 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, } } -void flush_kernel_dcache_page(struct page *page) -{ - struct address_space *mapping; - - mapping = page_mapping_file(page); - - if (!mapping || mapping_mapped(mapping)) - dcache_wbinv_all(); -} -EXPORT_SYMBOL(flush_kernel_dcache_page); - void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { diff --git a/arch/csky/abiv1/inc/abi/cacheflush.h b/arch/csky/abiv1/inc/abi/cacheflush.h index 6cab7afae9626..ed62e2066ba76 100644 --- a/arch/csky/abiv1/inc/abi/cacheflush.h +++ b/arch/csky/abiv1/inc/abi/cacheflush.h @@ -14,12 +14,10 @@ extern void flush_dcache_page(struct page *); #define flush_cache_page(vma, page, pfn) cache_wbinv_all() #define flush_cache_dup_mm(mm) cache_wbinv_all() -#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE -extern void flush_kernel_dcache_page(struct page *); - #define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages) #define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages) +#define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1 static inline void flush_kernel_vmap_range(void *addr, int size) { dcache_wbinv_all(); diff --git a/arch/mips/include/asm/cacheflush.h b/arch/mips/include/asm/cacheflush.h index d687b40b9fbbf..b3dc9c589442a 100644 --- a/arch/mips/include/asm/cacheflush.h +++ b/arch/mips/include/asm/cacheflush.h @@ -125,13 +125,7 @@ static inline void kunmap_noncoherent(void) kunmap_coherent(); } -#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE -static inline void flush_kernel_dcache_page(struct page *page) -{ - BUG_ON(cpu_has_dc_aliases && PageHighMem(page)); - flush_dcache_page(page); -} - +#define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1 /* * For now flush_kernel_vmap_range and invalidate_kernel_vmap_range both do a * cache writeback and invalidate operation. diff --git a/arch/nds32/include/asm/cacheflush.h b/arch/nds32/include/asm/cacheflush.h index 7d6824f7c0e8d..c2a222ebfa2af 100644 --- a/arch/nds32/include/asm/cacheflush.h +++ b/arch/nds32/include/asm/cacheflush.h @@ -36,8 +36,7 @@ void copy_from_user_page(struct vm_area_struct *vma, struct page *page, void flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vaddr); -#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE -void flush_kernel_dcache_page(struct page *page); +#define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1 void flush_kernel_vmap_range(void *addr, int size); void invalidate_kernel_vmap_range(void *addr, int size); #define flush_dcache_mmap_lock(mapping) xa_lock_irq(&(mapping)->i_pages) diff --git a/arch/nds32/mm/cacheflush.c b/arch/nds32/mm/cacheflush.c index ad5344ef5d334..07aac65d1cab4 100644 --- a/arch/nds32/mm/cacheflush.c +++ b/arch/nds32/mm/cacheflush.c @@ -318,15 +318,6 @@ void flush_anon_page(struct vm_area_struct *vma, local_irq_restore(flags); } -void flush_kernel_dcache_page(struct page *page) -{ - unsigned long flags; - local_irq_save(flags); - cpu_dcache_wbinval_page((unsigned long)page_address(page)); - local_irq_restore(flags); -} -EXPORT_SYMBOL(flush_kernel_dcache_page); - void flush_kernel_vmap_range(void *addr, int size) { unsigned long flags; diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h index 99663fc1f997f..eef0096db5f88 100644 --- a/arch/parisc/include/asm/cacheflush.h +++ b/arch/parisc/include/asm/cacheflush.h @@ -36,16 +36,12 @@ void flush_cache_all_local(void); void flush_cache_all(void); void flush_cache_mm(struct mm_struct *mm); -#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE void flush_kernel_dcache_page_addr(void *addr); -static inline void flush_kernel_dcache_page(struct page *page) -{ - flush_kernel_dcache_page_addr(page_address(page)); -} #define flush_kernel_dcache_range(start,size) \ flush_kernel_dcache_range_asm((start), (start)+(size)); +#define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1 void flush_kernel_vmap_range(void *vaddr, int size); void invalidate_kernel_vmap_range(void *vaddr, int size); @@ -59,7 +55,7 @@ extern void flush_dcache_page(struct page *page); #define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages) #define flush_icache_page(vma,page) do { \ - flush_kernel_dcache_page(page); \ + flush_kernel_dcache_page_addr(page_address(page)); \ flush_kernel_icache_page(page_address(page)); \ } while (0) diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c index 86a1a63563fd5..39e02227e2310 100644 --- a/arch/parisc/kernel/cache.c +++ b/arch/parisc/kernel/cache.c @@ -334,7 +334,7 @@ void flush_dcache_page(struct page *page) return; } - flush_kernel_dcache_page(page); + flush_kernel_dcache_page_addr(page_address(page)); if (!mapping) return; @@ -375,7 +375,6 @@ EXPORT_SYMBOL(flush_dcache_page); /* Defined in arch/parisc/kernel/pacache.S */ EXPORT_SYMBOL(flush_kernel_dcache_range_asm); -EXPORT_SYMBOL(flush_kernel_dcache_page_asm); EXPORT_SYMBOL(flush_data_cache_local); EXPORT_SYMBOL(flush_kernel_icache_range_asm); diff --git a/arch/sh/include/asm/cacheflush.h b/arch/sh/include/asm/cacheflush.h index 4486a865ff62f..372afa82fee62 100644 --- a/arch/sh/include/asm/cacheflush.h +++ b/arch/sh/include/asm/cacheflush.h @@ -63,6 +63,8 @@ static inline void flush_anon_page(struct vm_area_struct *vma, if (boot_cpu_data.dcache.n_aliases && PageAnon(page)) __flush_anon_page(page, vmaddr); } + +#define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1 static inline void flush_kernel_vmap_range(void *addr, int size) { __flush_wback_region(addr, size); @@ -72,12 +74,6 @@ static inline void invalidate_kernel_vmap_range(void *addr, int size) __flush_invalidate_region(addr, size); } -#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE -static inline void flush_kernel_dcache_page(struct page *page) -{ - flush_dcache_page(page); -} - extern void copy_to_user_page(struct vm_area_struct *vma, struct page *page, unsigned long vaddr, void *dst, const void *src, unsigned long len); diff --git a/block/blk-map.c b/block/blk-map.c index 3743158ddaeb7..4639bc6b5c62f 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -309,7 +309,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, static void bio_invalidate_vmalloc_pages(struct bio *bio) { -#ifdef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE +#ifdef ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE if (bio->bi_private && !op_is_write(bio_op(bio))) { unsigned long i, len = 0; diff --git a/fs/exec.c b/fs/exec.c index 17ddaad5462fe..eb2a99793018d 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -574,7 +574,7 @@ static int copy_strings(int argc, struct user_arg_ptr argv, } if (kmapped_page) { - flush_kernel_dcache_page(kmapped_page); + flush_dcache_page(kmapped_page); kunmap(kmapped_page); put_arg_page(kmapped_page); } @@ -592,7 +592,7 @@ static int copy_strings(int argc, struct user_arg_ptr argv, ret = 0; out: if (kmapped_page) { - flush_kernel_dcache_page(kmapped_page); + flush_dcache_page(kmapped_page); kunmap(kmapped_page); put_arg_page(kmapped_page); } @@ -634,7 +634,7 @@ int copy_string_kernel(const char *arg, struct linux_binprm *bprm) kaddr = kmap_atomic(page); flush_arg_page(bprm, pos & PAGE_MASK, page); memcpy(kaddr + offset_in_page(pos), arg, bytes_to_copy); - flush_kernel_dcache_page(page); + flush_dcache_page(page); kunmap_atomic(kaddr); put_arg_page(page); } diff --git a/include/linux/highmem.h b/include/linux/highmem.h index d9a606a9fc64a..b4c49f9cc379e 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -130,10 +130,7 @@ static inline void flush_anon_page(struct vm_area_struct *vma, struct page *page } #endif -#ifndef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE -static inline void flush_kernel_dcache_page(struct page *page) -{ -} +#ifndef ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE static inline void flush_kernel_vmap_range(void *vaddr, int size) { } diff --git a/tools/testing/scatterlist/linux/mm.h b/tools/testing/scatterlist/linux/mm.h index f9a12005fcea8..16ec895bbe5ff 100644 --- a/tools/testing/scatterlist/linux/mm.h +++ b/tools/testing/scatterlist/linux/mm.h @@ -127,7 +127,6 @@ kmalloc_array(unsigned int n, unsigned int size, unsigned int flags) #define kmemleak_free(a) #define PageSlab(p) (0) -#define flush_kernel_dcache_page(p) #define MAX_ERRNO 4095 From b061aaa729ccc3a45e23f8e08917be84217e5f17 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 24 Aug 2021 09:59:20 +1000 Subject: [PATCH 141/365] mm,do_huge_pmd_numa_page: remove unnecessary TLB flushing code Before commit c5b5a3dd2c1f ("mm: thp: refactor NUMA fault handling"), the TLB flushing is done in do_huge_pmd_numa_page() itself via flush_tlb_range(). But after commit c5b5a3dd2c1f ("mm: thp: refactor NUMA fault handling"), the TLB flushing is done in migrate_pages() as in the following code path anyway. do_huge_pmd_numa_page migrate_misplaced_page migrate_pages So now, the TLB flushing code in do_huge_pmd_numa_page() becomes unnecessary. So the code is deleted in this patch to simplify the code. This is only code cleanup, there's no visible performance difference. The mmu_notifier_invalidate_range() in do_huge_pmd_numa_page() is deleted too. Because migrate_pages() takes care of that too when CPU TLB is flushed. Link: https://lkml.kernel.org/r/20210720065529.716031-1-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Zi Yan Reviewed-by: Yang Shi Cc: Dan Carpenter Cc: Mel Gorman Cc: Christian Borntraeger Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Hugh Dickins Cc: Andrea Arcangeli Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Vasily Gorbik Cc: Paolo Bonzini Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/huge_memory.c | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 890fb73ac89bc..5e9ef0fc261e9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1440,32 +1440,6 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) goto out; } - /* - * Since we took the NUMA fault, we must have observed the !accessible - * bit. Make sure all other CPUs agree with that, to avoid them - * modifying the page we're about to migrate. - * - * Must be done under PTL such that we'll observe the relevant - * inc_tlb_flush_pending(). - * - * We are not sure a pending tlb flush here is for a huge page - * mapping or not. Hence use the tlb range variant - */ - if (mm_tlb_flush_pending(vma->vm_mm)) { - flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE); - /* - * change_huge_pmd() released the pmd lock before - * invalidating the secondary MMUs sharing the primary - * MMU pagetables (with ->invalidate_range()). The - * mmu_notifier_invalidate_range_end() (which - * internally calls ->invalidate_range()) in - * change_pmd_range() will run after us, so we can't - * rely on it here and we need an explicit invalidate. - */ - mmu_notifier_invalidate_range(vma->vm_mm, haddr, - haddr + HPAGE_PMD_SIZE); - } - pmd = pmd_modify(oldpmd, vma->vm_page_prot); page = vm_normal_page_pmd(vma, haddr, pmd); if (!page) From a8829ea678f708d0648270e8637b6730184b8fdb Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 24 Aug 2021 09:59:21 +1000 Subject: [PATCH 142/365] mm: change fault_in_pages_* to have an unsigned size parameter fault_in_pages_writeable() and fault_in_pages_readable() treat the size parameter as unsigned, doing pointer math with the value, so make this explicit and set it to be a size_t type which all callers currently treat it as anyway. This solves the issue where static checkers get nervous seeing pointer arithmetic happening with a signed value. Link: https://lkml.kernel.org/r/20210727111136.457638-1-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman Reported-by: Jordy Zomer Cc: Matthew Wilcox Cc: David Howells Cc: William Kucharski Cc: "Darrick J. Wong" Cc: Hugh Dickins Cc: Mauro Carvalho Chehab Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/pagemap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index b071babc66299..45f5ab7496cc6 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -786,7 +786,7 @@ extern void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter); /* * Fault everything in given userspace address range in. */ -static inline int fault_in_pages_writeable(char __user *uaddr, int size) +static inline int fault_in_pages_writeable(char __user *uaddr, size_t size) { char __user *end = uaddr + size - 1; @@ -813,7 +813,7 @@ static inline int fault_in_pages_writeable(char __user *uaddr, int size) return 0; } -static inline int fault_in_pages_readable(const char __user *uaddr, int size) +static inline int fault_in_pages_readable(const char __user *uaddr, size_t size) { volatile char c; const char __user *end = uaddr + size - 1; From 78ed3bb6526904930fc6344dcb728f06b4091812 Mon Sep 17 00:00:00 2001 From: Luigi Rizzo Date: Tue, 24 Aug 2021 09:59:21 +1000 Subject: [PATCH 143/365] mm/pagemap: add mmap_assert_locked() annotations to find_vma*() find_vma() and variants need protection when used. This patch adds mmap_assert_lock() calls in the functions. To make sure the invariant is satisfied, we also need to add a mmap_read_loc() around the get_user_pages_remote() call in get_arg_page(). The lock is not strictly necessary because the mm has been newly created, but the extra cost is limited because the same mutex was also acquired shortly before in __bprm_mm_init(), so it is hot and uncontended. Link: https://lkml.kernel.org/r/20210731175341.3458608-1-lrizzo@google.com Signed-off-by: Luigi Rizzo Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/exec.c | 2 ++ mm/mmap.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/fs/exec.c b/fs/exec.c index eb2a99793018d..816c7e347c9c1 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -217,8 +217,10 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, * We are doing an exec(). 'current' is the process * doing the exec and bprm->mm is the new process's mm. */ + mmap_read_lock(bprm->mm); ret = get_user_pages_remote(bprm->mm, pos, 1, gup_flags, &page, NULL, NULL); + mmap_read_unlock(bprm->mm); if (ret <= 0) return NULL; diff --git a/mm/mmap.c b/mm/mmap.c index ca54d36d203ae..79f4f8ae43ecd 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -534,6 +534,7 @@ static int find_vma_links(struct mm_struct *mm, unsigned long addr, { struct rb_node **__rb_link, *__rb_parent, *rb_prev; + mmap_assert_locked(mm); __rb_link = &mm->mm_rb.rb_node; rb_prev = __rb_parent = NULL; @@ -2303,6 +2304,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) struct rb_node *rb_node; struct vm_area_struct *vma; + mmap_assert_locked(mm); /* Check the cache first. */ vma = vmacache_find(mm, addr); if (likely(vma)) From 07a3667ea227e43aa9b71e3b2769a7f5d27ae47c Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Tue, 24 Aug 2021 09:59:21 +1000 Subject: [PATCH 144/365] add-mmap_assert_locked-annotations-to-find_vma-fix TOMOYO needs the same protection which get_arg_page() needs. Link: https://lkml.kernel.org/r/58bb6bf7-a57e-8a40-e74b-39584b415152@i-love.sakura.ne.jp Cc: Luigi Rizzo Cc: Kentaro Takeda Cc: James Morris Cc: "Serge E. Hallyn" Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- security/tomoyo/domain.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c index 98d985895ec8a..31af29f669d24 100644 --- a/security/tomoyo/domain.c +++ b/security/tomoyo/domain.c @@ -897,6 +897,9 @@ bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos, struct tomoyo_page_dump *dump) { struct page *page; +#ifdef CONFIG_MMU + int ret; +#endif /* dump->data is released by tomoyo_find_next_domain(). */ if (!dump->data) { @@ -909,11 +912,13 @@ bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos, /* * This is called at execve() time in order to dig around * in the argv/environment of the new proceess - * (represented by bprm). 'current' is the process doing - * the execve(). + * (represented by bprm). */ - if (get_user_pages_remote(bprm->mm, pos, 1, - FOLL_FORCE, &page, NULL, NULL) <= 0) + mmap_read_lock(bprm->mm); + ret = get_user_pages_remote(bprm->mm, pos, 1, + FOLL_FORCE, &page, NULL, NULL); + mmap_read_unlock(bprm->mm); + if (ret <= 0) return false; #else page = bprm->page[pos / PAGE_SIZE]; From 5207c39c5efaa2d7aa40b1935892fe225997b199 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 24 Aug 2021 09:59:21 +1000 Subject: [PATCH 145/365] remap_file_pages: Use vma_lookup() instead of find_vma() Using vma_lookup() verifies the start address is contained in the found vma. This results in easier to read code. Link: https://lkml.kernel.org/r/20210817135234.1550204-1-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/mmap.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 79f4f8ae43ecd..52fed230dc21a 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2994,14 +2994,11 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, if (mmap_write_lock_killable(mm)) return -EINTR; - vma = find_vma(mm, start); + vma = vma_lookup(mm, start); if (!vma || !(vma->vm_flags & VM_SHARED)) goto out; - if (start < vma->vm_start) - goto out; - if (start + size > vma->vm_end) { struct vm_area_struct *next; From 78103aabe95c5f80cc8e693723a70d9d8200caac Mon Sep 17 00:00:00 2001 From: Chen Wandun Date: Tue, 24 Aug 2021 09:59:21 +1000 Subject: [PATCH 146/365] mm/mremap: fix memory account on do_munmap() failure mremap will account the delta between new_len and old_len in vma_to_resize, and then call move_vma when expanding an existing memory mapping. In function move_vma, there are two scenarios when calling do_munmap: 1. move_page_tables from old_addr to new_addr success 2. move_page_tables from old_addr to new_addr fail In first scenario, it should account old_len if do_munmap fail, because the delta has already been accounted. In second scenario, new_addr/new_len will assign to old_addr/old_len if move_page_table fail, so do_munmap is try to unmap new_addr actually, if do_munmap fail, it should account the new_len, because error code will be return from move_vma, and delta will be unaccounted. What'more, because of new_len == old_len, so account old_len also is OK. In summary, account old_len will be correct if do_munmap fail. Link: https://lkml.kernel.org/r/20210717101942.120607-1-chenwandun@huawei.com Fixes: 51df7bcb6151 ("mm/mremap: account memory on do_munmap() failure") Signed-off-by: Chen Wandun Acked-by: Dmitry Safonov Cc: Kefeng Wang Cc: Wei Yongjun Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/mremap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mremap.c b/mm/mremap.c index 5989d39900204..badfe17ade1f0 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -686,7 +686,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) { /* OOM: unable to split vma, just get accounts right */ if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) - vm_acct_memory(new_len >> PAGE_SHIFT); + vm_acct_memory(old_len >> PAGE_SHIFT); excess = 0; } From ba875c6c47796c2904ca896027de123db2e36c91 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Tue, 24 Aug 2021 09:59:22 +1000 Subject: [PATCH 147/365] mm/mremap: don't account pages in vma_to_resize() All this vm_unacct_memory(charged) dance seems to complicate the life without a good reason. Furthermore, it seems not always done right on error-pathes in mremap_to(). And worse than that: this `charged' difference is sometimes double-accounted for growing MREMAP_DONTUNMAP mremap()s in move_vma(): if (security_vm_enough_memory_mm(mm, new_len >> PAGE_SHIFT)) Let's not do this. Account memory in mremap() fast-path for growing VMAs or in move_vma() for actually moving things. The same simpler way as it's done by vm_stat_account(), but with a difference to call security_vm_enough_memory_mm() before copying/adjusting VMA. Originally noticed by Chen Wandun: https://lkml.kernel.org/r/20210717101942.120607-1-chenwandun@huawei.com Link: https://lkml.kernel.org/r/20210721131320.522061-1-dima@arista.com Fixes: e346b3813067 ("mm/mremap: add MREMAP_DONTUNMAP to mremap()") Signed-off-by: Dmitry Safonov Acked-by: Brian Geffon Cc: Alexander Viro Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Chen Wandun Cc: Dan Carpenter Cc: Dan Williams Cc: Dave Jiang Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kefeng Wang Cc: "Kirill A. Shutemov" Cc: Mike Kravetz Cc: Minchan Kim Cc: Ralph Campbell Cc: Russell King Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vishal Verma Cc: Vlastimil Babka Cc: Wei Yongjun Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/mremap.c | 50 ++++++++++++++++++++++---------------------------- 1 file changed, 22 insertions(+), 28 deletions(-) diff --git a/mm/mremap.c b/mm/mremap.c index badfe17ade1f0..c0b6c41b7b78f 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -565,6 +565,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, bool *locked, unsigned long flags, struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap) { + long to_account = new_len - old_len; struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *new_vma; unsigned long vm_flags = vma->vm_flags; @@ -583,6 +584,9 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (mm->map_count >= sysctl_max_map_count - 3) return -ENOMEM; + if (unlikely(flags & MREMAP_DONTUNMAP)) + to_account = new_len; + if (vma->vm_ops && vma->vm_ops->may_split) { if (vma->vm_start != old_addr) err = vma->vm_ops->may_split(vma, old_addr); @@ -604,8 +608,8 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (err) return err; - if (unlikely(flags & MREMAP_DONTUNMAP && vm_flags & VM_ACCOUNT)) { - if (security_vm_enough_memory_mm(mm, new_len >> PAGE_SHIFT)) + if (vm_flags & VM_ACCOUNT) { + if (security_vm_enough_memory_mm(mm, to_account >> PAGE_SHIFT)) return -ENOMEM; } @@ -613,8 +617,8 @@ static unsigned long move_vma(struct vm_area_struct *vma, new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff, &need_rmap_locks); if (!new_vma) { - if (unlikely(flags & MREMAP_DONTUNMAP && vm_flags & VM_ACCOUNT)) - vm_unacct_memory(new_len >> PAGE_SHIFT); + if (vm_flags & VM_ACCOUNT) + vm_unacct_memory(to_account >> PAGE_SHIFT); return -ENOMEM; } @@ -708,8 +712,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, } static struct vm_area_struct *vma_to_resize(unsigned long addr, - unsigned long old_len, unsigned long new_len, unsigned long flags, - unsigned long *p) + unsigned long old_len, unsigned long new_len, unsigned long flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; @@ -768,13 +771,6 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, (new_len - old_len) >> PAGE_SHIFT)) return ERR_PTR(-ENOMEM); - if (vma->vm_flags & VM_ACCOUNT) { - unsigned long charged = (new_len - old_len) >> PAGE_SHIFT; - if (security_vm_enough_memory_mm(mm, charged)) - return ERR_PTR(-ENOMEM); - *p = charged; - } - return vma; } @@ -787,7 +783,6 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long ret = -EINVAL; - unsigned long charged = 0; unsigned long map_flags = 0; if (offset_in_page(new_addr)) @@ -830,7 +825,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, old_len = new_len; } - vma = vma_to_resize(addr, old_len, new_len, flags, &charged); + vma = vma_to_resize(addr, old_len, new_len, flags); if (IS_ERR(vma)) { ret = PTR_ERR(vma); goto out; @@ -853,7 +848,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, ((addr - vma->vm_start) >> PAGE_SHIFT), map_flags); if (IS_ERR_VALUE(ret)) - goto out1; + goto out; /* We got a new mapping */ if (!(flags & MREMAP_FIXED)) @@ -862,12 +857,6 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf, uf_unmap); - if (!(offset_in_page(ret))) - goto out; - -out1: - vm_unacct_memory(charged); - out: return ret; } @@ -899,7 +888,6 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long ret = -EINVAL; - unsigned long charged = 0; bool locked = false; bool downgraded = false; struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX; @@ -981,7 +969,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, /* * Ok, we need to grow.. */ - vma = vma_to_resize(addr, old_len, new_len, flags, &charged); + vma = vma_to_resize(addr, old_len, new_len, flags); if (IS_ERR(vma)) { ret = PTR_ERR(vma); goto out; @@ -992,10 +980,18 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, if (old_len == vma->vm_end - addr) { /* can we just expand the current mapping? */ if (vma_expandable(vma, new_len - old_len)) { - int pages = (new_len - old_len) >> PAGE_SHIFT; + long pages = (new_len - old_len) >> PAGE_SHIFT; + + if (vma->vm_flags & VM_ACCOUNT) { + if (security_vm_enough_memory_mm(mm, pages)) { + ret = -ENOMEM; + goto out; + } + } if (vma_adjust(vma, vma->vm_start, addr + new_len, vma->vm_pgoff, NULL)) { + vm_unacct_memory(pages); ret = -ENOMEM; goto out; } @@ -1034,10 +1030,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, &locked, flags, &uf, &uf_unmap); } out: - if (offset_in_page(ret)) { - vm_unacct_memory(charged); + if (offset_in_page(ret)) locked = false; - } if (downgraded) mmap_read_unlock(current->mm); else From ce2a5719348e91fb3005ce76d305c42c80f7c410 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 24 Aug 2021 09:59:22 +1000 Subject: [PATCH 148/365] mm/bootmem_info.c: mark __init on register_page_bootmem_info_section register_page_bootmem_info_section() is only called from __init functions, so mark it __init as well. Link: https://lkml.kernel.org/r/20210817042221.77172-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: David Hildenbrand Cc: Oscar Salvador Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/bootmem_info.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/bootmem_info.c b/mm/bootmem_info.c index 5b152dba7344f..f03f42f426f69 100644 --- a/mm/bootmem_info.c +++ b/mm/bootmem_info.c @@ -39,7 +39,7 @@ void put_page_bootmem(struct page *page) } #ifndef CONFIG_SPARSEMEM_VMEMMAP -static void register_page_bootmem_info_section(unsigned long start_pfn) +static void __init register_page_bootmem_info_section(unsigned long start_pfn) { unsigned long mapsize, section_nr, i; struct mem_section *ms; @@ -74,7 +74,7 @@ static void register_page_bootmem_info_section(unsigned long start_pfn) } #else /* CONFIG_SPARSEMEM_VMEMMAP */ -static void register_page_bootmem_info_section(unsigned long start_pfn) +static void __init register_page_bootmem_info_section(unsigned long start_pfn) { unsigned long mapsize, section_nr, i; struct mem_section *ms; From 4c2cd9de65979a82a6a70b6483989776cb28a943 Mon Sep 17 00:00:00 2001 From: Ohhoon Kwon Date: Tue, 24 Aug 2021 09:59:22 +1000 Subject: [PATCH 149/365] mm: sparse: pass section_nr to section_mark_present Patch series "mm: sparse: remove __section_nr() function", v4. This patch (of 3): With CONFIG_SPARSEMEM_EXTREME enabled, __section_nr() which converts mem_section to section_nr could be costly since it iterates all section roots to check if the given mem_section is in its range. Since both callers of section_mark_present already know section_nr, let's also pass section_nr as well as mem_section in order to reduce costly translation. Link: https://lkml.kernel.org/r/20210707150212.855-1-ohoono.kwon@samsung.com Link: https://lkml.kernel.org/r/20210707150212.855-2-ohoono.kwon@samsung.com Signed-off-by: Ohhoon Kwon Acked-by: Mike Rapoport Acked-by: Michal Hocko Reviewed-by: David Hildenbrand Cc: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/sparse.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/sparse.c b/mm/sparse.c index 6326cdf36c4f2..8018ee7fcda52 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -187,10 +187,9 @@ void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn, * those loops early. */ unsigned long __highest_present_section_nr; -static void section_mark_present(struct mem_section *ms) +static void __section_mark_present(struct mem_section *ms, + unsigned long section_nr) { - unsigned long section_nr = __section_nr(ms); - if (section_nr > __highest_present_section_nr) __highest_present_section_nr = section_nr; @@ -280,7 +279,7 @@ static void __init memory_present(int nid, unsigned long start, unsigned long en if (!ms->section_mem_map) { ms->section_mem_map = sparse_encode_early_nid(nid) | SECTION_IS_ONLINE; - section_mark_present(ms); + __section_mark_present(ms, section); } } } @@ -934,7 +933,7 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn, ms = __nr_to_section(section_nr); set_section_nid(section_nr, nid); - section_mark_present(ms); + __section_mark_present(ms, section_nr); /* Align memmap to section boundary in the subsection case */ if (section_nr_to_pfn(section_nr) != start_pfn) From 331e90bd32978b07702366d1efd6477b6057121b Mon Sep 17 00:00:00 2001 From: Ohhoon Kwon Date: Tue, 24 Aug 2021 09:59:22 +1000 Subject: [PATCH 150/365] mm: sparse: pass section_nr to find_memory_block With CONFIG_SPARSEMEM_EXTREME enabled, __section_nr() which converts mem_section to section_nr could be costly since it iterates all section roots to check if the given mem_section is in its range. On the other hand, __nr_to_section() which converts section_nr to mem_section can be done in O(1). Let's pass section_nr instead of mem_section ptr to find_memory_block() in order to reduce needless iterations. Link: https://lkml.kernel.org/r/20210707150212.855-3-ohoono.kwon@samsung.com Signed-off-by: Ohhoon Kwon Acked-by: Michal Hocko Acked-by: Mike Rapoport Reviewed-by: David Hildenbrand Cc: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/powerpc/platforms/pseries/hotplug-memory.c | 4 +--- drivers/base/memory.c | 4 ++-- include/linux/memory.h | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index 377d852f5a9aa..d4f28ee4d5dce 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -211,13 +211,11 @@ static int update_lmb_associativity_index(struct drmem_lmb *lmb) static struct memory_block *lmb_to_memblock(struct drmem_lmb *lmb) { unsigned long section_nr; - struct mem_section *mem_sect; struct memory_block *mem_block; section_nr = pfn_to_section_nr(PFN_DOWN(lmb->base_addr)); - mem_sect = __nr_to_section(section_nr); - mem_block = find_memory_block(mem_sect); + mem_block = find_memory_block(section_nr); return mem_block; } diff --git a/drivers/base/memory.c b/drivers/base/memory.c index aa31a21f33d7d..e3fd2dbf4eea8 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -578,9 +578,9 @@ static struct memory_block *find_memory_block_by_id(unsigned long block_id) /* * Called under device_hotplug_lock. */ -struct memory_block *find_memory_block(struct mem_section *section) +struct memory_block *find_memory_block(unsigned long section_nr) { - unsigned long block_id = memory_block_id(__section_nr(section)); + unsigned long block_id = memory_block_id(section_nr); return find_memory_block_by_id(block_id); } diff --git a/include/linux/memory.h b/include/linux/memory.h index 97e92e8b556a3..d9a0b61cd4329 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -90,7 +90,7 @@ int create_memory_block_devices(unsigned long start, unsigned long size, void remove_memory_block_devices(unsigned long start, unsigned long size); extern void memory_dev_init(void); extern int memory_notify(unsigned long val, void *v); -extern struct memory_block *find_memory_block(struct mem_section *); +extern struct memory_block *find_memory_block(unsigned long section_nr); typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *); extern int walk_memory_blocks(unsigned long start, unsigned long size, void *arg, walk_memory_blocks_func_t func); From 9684bd2ac3343afe1d80a4b1e0309fd2a1ad5546 Mon Sep 17 00:00:00 2001 From: Ohhoon Kwon Date: Tue, 24 Aug 2021 09:59:22 +1000 Subject: [PATCH 151/365] mm: sparse: remove __section_nr() function As the last users of __section_nr() are gone, let's remove unused function __section_nr(). Link: https://lkml.kernel.org/r/20210707150212.855-4-ohoono.kwon@samsung.com Signed-off-by: Ohhoon Kwon Acked-by: Michal Hocko Acked-by: Mike Rapoport Reviewed-by: David Hildenbrand Cc: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/mmzone.h | 1 - mm/sparse.c | 26 -------------------------- 2 files changed, 27 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fcb535560028f..8827f4d081d43 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1342,7 +1342,6 @@ static inline struct mem_section *__nr_to_section(unsigned long nr) return NULL; return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK]; } -extern unsigned long __section_nr(struct mem_section *ms); extern size_t mem_section_usage_size(void); /* diff --git a/mm/sparse.c b/mm/sparse.c index 8018ee7fcda52..d85655242ed98 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -109,32 +109,6 @@ static inline int sparse_index_init(unsigned long section_nr, int nid) } #endif -#ifdef CONFIG_SPARSEMEM_EXTREME -unsigned long __section_nr(struct mem_section *ms) -{ - unsigned long root_nr; - struct mem_section *root = NULL; - - for (root_nr = 0; root_nr < NR_SECTION_ROOTS; root_nr++) { - root = __nr_to_section(root_nr * SECTIONS_PER_ROOT); - if (!root) - continue; - - if ((ms >= root) && (ms < (root + SECTIONS_PER_ROOT))) - break; - } - - VM_BUG_ON(!root); - - return (root_nr * SECTIONS_PER_ROOT) + (ms - root); -} -#else -unsigned long __section_nr(struct mem_section *ms) -{ - return (unsigned long)(ms - mem_section[0]); -} -#endif - /* * During early boot, before section_mem_map is used for an actual * mem_map, we use section_mem_map to store the section's NUMA From bf1cbb42dbd214322f695fd9e438106e4938f844 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Tue, 24 Aug 2021 09:59:22 +1000 Subject: [PATCH 152/365] mm/sparse: set SECTION_NID_SHIFT to 6 Currently SECTION_NID_SHIFT is set to 3, which is incorrect because bit 3 and 4 can be overlapped by sub-field for early NID, and can be unexpectedly set on NUMA systems. There are a few non-critical issues related to this: - Having SECTION_TAINT_ZONE_DEVICE set for wrong sections forces pfn_to_online_page() through the slow path, but doesn't actually break the kernel. - A kdump generation tool like makedumpfile uses this field to calculate the physical address to read. So wrong bits can make the tool access to wrong address and fail to create kdump. This can be avoided by the tool, so it's not critical. To fix it, set SECTION_NID_SHIFT to 6 which is the minimum number of available bits of section flag field. Link: https://lkml.kernel.org/r/20210707045548.810271-1-naoya.horiguchi@linux.dev Fixes: 1f90a3477df3 ("mm: teach pfn_to_online_page() about ZONE_DEVICE section collisions") Signed-off-by: Naoya Horiguchi Reported-by: Kazuhito Hagio Suggested-by: Dan Williams Acked-by: David Hildenbrand Cc: Oscar Salvador Cc: Wang Wensheng Cc: Rui Xiang Cc: Kazu Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/mmzone.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 8827f4d081d43..59bad25ce78e0 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1364,7 +1364,7 @@ extern size_t mem_section_usage_size(void); #define SECTION_TAINT_ZONE_DEVICE (1UL<<4) #define SECTION_MAP_LAST_BIT (1UL<<5) #define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1)) -#define SECTION_NID_SHIFT 3 +#define SECTION_NID_SHIFT 6 static inline struct page *__section_mem_map_addr(struct mem_section *section) { From 32a617e795a714d12cb732e8ad01e4246eaafc01 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 24 Aug 2021 09:59:23 +1000 Subject: [PATCH 153/365] include/linux/mmzone.h: avoid a warning in sparse memory support cppcheck warns that we're possibly losing information by shifting an int. It's a false positive, because we don't allow for a NUMA node ID that large, but if we ever change SECTION_NID_SHIFT, it could become a problem, and in any case this is usually a legitimate warning. Fix it by adding the necessary cast, which makes the compiler generate the right code. Link: https://lkml.kernel.org/r/YOya+aBZFFmC476e@casper.infradead.org Link: https://lkml.kernel.org/r/202107130348.6LsVT9Nc-lkp@intel.com Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/sparse.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/sparse.c b/mm/sparse.c index d85655242ed98..be7936e65b6a2 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -117,7 +117,7 @@ static inline int sparse_index_init(unsigned long section_nr, int nid) */ static inline unsigned long sparse_encode_early_nid(int nid) { - return (nid << SECTION_NID_SHIFT); + return ((unsigned long)nid << SECTION_NID_SHIFT); } static inline int sparse_early_nid(struct mem_section *section) From 11c7136554425d1f4b1ca50437751cfbe9be2855 Mon Sep 17 00:00:00 2001 From: Miles Chen Date: Tue, 24 Aug 2021 09:59:23 +1000 Subject: [PATCH 154/365] mm/sparse: clarify pgdat_to_phys Clarify pgdat_to_phys() by testing if pgdat == &contig_page_data when CONFIG_NUMA=n. We only expect contig_page_data in such case, so we use &contig_page_data directly instead of pgdat. No functional change intended when CONFIG_BUG_VM=n. Comment from Mark [1]: " ... and I reckon it'd be clearer and more robust to define pgdat_to_phys() in the same ifdefs as contig_page_data so that these, stay in-sync. e.g. have: | #ifdef CONFIG_NUMA | #define pgdat_to_phys(x) virt_to_phys(x) | #else /* CONFIG_NUMA */ | | extern struct pglist_data contig_page_data; | ... | #define pgdat_to_phys(x) __pa_symbol(&contig_page_data) | | #endif /* CONIFIG_NUMA */ " [1] https://lore.kernel.org/linux-arm-kernel/20210615131902.GB47121@C02TD0UTHF1T.local/ Link: https://lkml.kernel.org/r/20210723123342.26406-1-miles.chen@mediatek.com Signed-off-by: Miles Chen Reviewed-by: David Hildenbrand Acked-by: Mike Rapoport Cc: Mark Rutland Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/sparse.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/sparse.c b/mm/sparse.c index be7936e65b6a2..fd29b3abffa02 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -321,7 +321,8 @@ size_t mem_section_usage_size(void) static inline phys_addr_t pgdat_to_phys(struct pglist_data *pgdat) { #ifndef CONFIG_NUMA - return __pa_symbol(pgdat); + VM_BUG_ON(pgdat != &contig_page_data); + return __pa_symbol(&contig_page_data); #else return __pa(pgdat); #endif From 28d7e9fe82df6d5dc131c39ea2175260a4a16547 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 24 Aug 2021 09:59:23 +1000 Subject: [PATCH 155/365] mm/vmalloc: use batched page requests in bulk-allocator In case of simultaneous vmalloc allocations, for example it is 1GB and 12 CPUs my system is able to hit "BUG: soft lockup" for !CONFIG_PREEMPT kernel. [ 62.512621] RIP: 0010:__alloc_pages_bulk+0xa9f/0xbb0 [ 62.512628] Code: ff 8b 44 24 48 44 29 f8 83 f8 01 0f 84 ea fe ff ff e9 07 f6 ff ff 48 8b 44 24 60 48 89 28 e9 00 f9 ff ff fb 66 0f 1f 44 00 00 e8 fd ff ff 65 48 01 51 10 e9 3e fe ff ff 48 8b 44 24 78 4d 89 [ 62.512629] RSP: 0018:ffffa7bfc29ffd20 EFLAGS: 00000206 [ 62.512631] RAX: 0000000000000200 RBX: ffffcd5405421888 RCX: ffff8c36ffdeb928 [ 62.512632] RDX: 0000000000040000 RSI: ffffa896f06b2ff8 RDI: ffffcd5405421880 [ 62.512633] RBP: ffffcd5405421880 R08: 000000000000007d R09: ffffffffffffffff [ 62.512634] R10: ffffffff9d63c084 R11: 00000000ffffffff R12: ffff8c373ffaeb80 [ 62.512635] R13: ffff8c36ffdf65f8 R14: ffff8c373ffaeb80 R15: 0000000000040000 [ 62.512637] FS: 0000000000000000(0000) GS:ffff8c36ffdc0000(0000) knlGS:0000000000000000 [ 62.512638] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 62.512639] CR2: 000055c8e2fe8610 CR3: 0000000c13e10000 CR4: 00000000000006e0 [ 62.512641] Call Trace: [ 62.512646] __vmalloc_node_range+0x11c/0x2d0 [ 62.512649] ? full_fit_alloc_test+0x140/0x140 [test_vmalloc] [ 62.512654] __vmalloc_node+0x4b/0x70 [ 62.512656] ? fix_size_alloc_test+0x44/0x60 [test_vmalloc] [ 62.512659] fix_size_alloc_test+0x44/0x60 [test_vmalloc] [ 62.512662] test_func+0xe7/0x1f0 [test_vmalloc] [ 62.512666] ? fix_align_alloc_test+0x50/0x50 [test_vmalloc] [ 62.512668] kthread+0x11a/0x140 [ 62.512671] ? set_kthread_struct+0x40/0x40 [ 62.512672] ret_from_fork+0x22/0x30 To address this issue invoke a bulk-allocator many times until all pages are obtained, i.e. do batched page requests adding cond_resched() meanwhile to reschedule. Batched value is hard-coded and is 100 pages per call. Link: https://lkml.kernel.org/r/20210707182639.31282-1-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Acked-by: Michal Hocko Cc: Christoph Hellwig Cc: Hillf Danton Cc: Matthew Wilcox Cc: Mel Gorman Cc: Nicholas Piggin Cc: Oleksiy Avramchenko Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/vmalloc.c | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d5cd528051496..24bc65f02d040 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2779,7 +2779,7 @@ EXPORT_SYMBOL_GPL(vmap_pfn); static inline unsigned int vm_area_alloc_pages(gfp_t gfp, int nid, - unsigned int order, unsigned long nr_pages, struct page **pages) + unsigned int order, unsigned int nr_pages, struct page **pages) { unsigned int nr_allocated = 0; @@ -2789,10 +2789,32 @@ vm_area_alloc_pages(gfp_t gfp, int nid, * to fails, fallback to a single page allocator that is * more permissive. */ - if (!order) - nr_allocated = alloc_pages_bulk_array_node( - gfp, nid, nr_pages, pages); - else + if (!order) { + while (nr_allocated < nr_pages) { + unsigned int nr, nr_pages_request; + + /* + * A maximum allowed request is hard-coded and is 100 + * pages per call. That is done in order to prevent a + * long preemption off scenario in the bulk-allocator + * so the range is [1:100]. + */ + nr_pages_request = min(100U, nr_pages - nr_allocated); + + nr = alloc_pages_bulk_array_node(gfp, nid, + nr_pages_request, pages + nr_allocated); + + nr_allocated += nr; + cond_resched(); + + /* + * If zero or pages were obtained partly, + * fallback to a single page allocator. + */ + if (nr != nr_pages_request) + break; + } + } else /* * Compound pages required for remap_vmalloc_page if * high-order pages. From 08a141abc0cc64f6e64c13a16c0d15e06cd035f2 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 24 Aug 2021 09:59:23 +1000 Subject: [PATCH 156/365] mm/vmalloc: remove gfpflags_allow_blocking() check Get rid of gfpflags_allow_blocking() check from the vmalloc() path as it is supposed to be sleepable anyway. Thus remove it from the alloc_vmap_area() as well as from the vm_area_alloc_pages(). Link: https://lkml.kernel.org/r/20210707182639.31282-2-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Acked-by: Michal Hocko Cc: Mel Gorman Cc: Christoph Hellwig Cc: Matthew Wilcox Cc: Nicholas Piggin Cc: Hillf Danton Cc: Oleksiy Avramchenko Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/vmalloc.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 24bc65f02d040..5dcb65dd9bf31 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1479,6 +1479,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, int node, gfp_t gfp_mask) { struct vmap_area *va; + unsigned long freed; unsigned long addr; int purged = 0; int ret; @@ -1542,13 +1543,12 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, goto retry; } - if (gfpflags_allow_blocking(gfp_mask)) { - unsigned long freed = 0; - blocking_notifier_call_chain(&vmap_notify_list, 0, &freed); - if (freed > 0) { - purged = 0; - goto retry; - } + freed = 0; + blocking_notifier_call_chain(&vmap_notify_list, 0, &freed); + + if (freed > 0) { + purged = 0; + goto retry; } if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) @@ -2838,9 +2838,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid, for (i = 0; i < (1U << order); i++) pages[nr_allocated + i] = page + i; - if (gfpflags_allow_blocking(gfp)) - cond_resched(); - + cond_resched(); nr_allocated += 1U << order; } From fe05b7833c404142926b74baa163022975b29e54 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 24 Aug 2021 09:59:23 +1000 Subject: [PATCH 157/365] lib/test_vmalloc.c: add a new 'nr_pages' parameter In order to simulate different fixed sizes for vmalloc allocation introduce a new parameter that sets number of pages to be allocated for the "fix_size_alloc_test" test. By default 1 page is used unless a different number is specified over the new parameter. Link: https://lkml.kernel.org/r/20210710194151.21370-1-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Cc: Mel Gorman Cc: Christoph Hellwig Cc: Matthew Wilcox Cc: Nicholas Piggin Cc: Hillf Danton Cc: Michal Hocko Cc: Oleksiy Avramchenko Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- lib/test_vmalloc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c index 01e9543de5664..e14993bc84d2d 100644 --- a/lib/test_vmalloc.c +++ b/lib/test_vmalloc.c @@ -35,6 +35,9 @@ __param(int, test_repeat_count, 1, __param(int, test_loop_count, 1000000, "Set test loop counter"); +__param(int, nr_pages, 0, + "Set number of pages for fix_size_alloc_test(default: 1)"); + __param(int, run_test_mask, INT_MAX, "Set tests specified in the mask.\n\n" "\t\tid: 1, name: fix_size_alloc_test\n" @@ -262,7 +265,7 @@ static int fix_size_alloc_test(void) int i; for (i = 0; i < test_loop_count; i++) { - ptr = vmalloc(3 * PAGE_SIZE); + ptr = vmalloc((nr_pages > 0 ? nr_pages:1) * PAGE_SIZE); if (!ptr) return -1; From 4f475fd4571794f1fd5fe4ffaad7a0fd411b2c7e Mon Sep 17 00:00:00 2001 From: Chen Wandun Date: Tue, 24 Aug 2021 09:59:24 +1000 Subject: [PATCH 158/365] mm/vmalloc: fix wrong behavior in vread commit f608788cd2d6 ("mm/vmalloc: use rb_tree instead of list for vread() lookups") use rb_tree instread of list to speed up lookup, but function __find_vmap_area is try to find a vmap_area that include target address, if target address is smaller than the leftmost node in vmap_area_root, it will return NULL, then vread will read nothing. This behavior is different from the primitive semantics. The correct way is find the first vmap_are that bigger than target addr, that is what function find_vmap_area_exceed_addr does. Link: https://lkml.kernel.org/r/20210714015959.3204871-1-chenwandun@huawei.com Fixes: f608788cd2d6 ("mm/vmalloc: use rb_tree instead of list for vread() lookups") Signed-off-by: Chen Wandun Reported-by: Hulk Robot Cc: Serapheim Dimitropoulos Cc: Uladzislau Rezki (Sony) Cc: Kefeng Wang Cc: Wei Yongjun Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/vmalloc.c | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 5dcb65dd9bf31..3824dc16ce1cb 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -787,6 +787,28 @@ unsigned long vmalloc_nr_pages(void) return atomic_long_read(&nr_vmalloc_pages); } +static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr) +{ + struct vmap_area *va = NULL; + struct rb_node *n = vmap_area_root.rb_node; + + while (n) { + struct vmap_area *tmp; + + tmp = rb_entry(n, struct vmap_area, rb_node); + if (tmp->va_end > addr) { + va = tmp; + if (tmp->va_start <= addr) + break; + + n = n->rb_left; + } else + n = n->rb_right; + } + + return va; +} + static struct vmap_area *__find_vmap_area(unsigned long addr) { struct rb_node *n = vmap_area_root.rb_node; @@ -3287,9 +3309,14 @@ long vread(char *buf, char *addr, unsigned long count) count = -(unsigned long) addr; spin_lock(&vmap_area_lock); - va = __find_vmap_area((unsigned long)addr); + va = find_vmap_area_exceed_addr((unsigned long)addr); if (!va) goto finished; + + /* no intersects with alive vmap_area */ + if ((unsigned long)addr + count <= va->va_start) + goto finished; + list_for_each_entry_from(va, &vmap_area_list, list) { if (!count) break; From 67be95c44a9c6334f70303fde017e172bc8d968b Mon Sep 17 00:00:00 2001 From: Woody Lin Date: Tue, 24 Aug 2021 09:59:24 +1000 Subject: [PATCH 159/365] mm/kasan: move kasan.fault to mm/kasan/report.c Move the boot parameter 'kasan.fault' from hw_tags.c to report.c, so it can support all KASAN modes - generic, and both tag-based. Link: https://lkml.kernel.org/r/20210713010536.3161822-1-woodylin@google.com Signed-off-by: Woody Lin Reviewed-by: Marco Elver Reviewed-by: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- Documentation/dev-tools/kasan.rst | 13 ++++++---- mm/kasan/hw_tags.c | 43 ------------------------------- mm/kasan/kasan.h | 1 - mm/kasan/report.c | 29 ++++++++++++++++++--- 4 files changed, 34 insertions(+), 52 deletions(-) diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index 83ec4a556c199..21dc03bc10a45 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -181,9 +181,16 @@ By default, KASAN prints a bug report only for the first invalid memory access. With ``kasan_multi_shot``, KASAN prints a report on every invalid access. This effectively disables ``panic_on_warn`` for KASAN reports. +Alternatively, independent of ``panic_on_warn`` the ``kasan.fault=`` boot +parameter can be used to control panic and reporting behaviour: + +- ``kasan.fault=report`` or ``=panic`` controls whether to only print a KASAN + report or also panic the kernel (default: ``report``). The panic happens even + if ``kasan_multi_shot`` is enabled. + Hardware tag-based KASAN mode (see the section about various modes below) is intended for use in production as a security mitigation. Therefore, it supports -boot parameters that allow disabling KASAN or controlling its features. +additional boot parameters that allow disabling KASAN or controlling features: - ``kasan=off`` or ``=on`` controls whether KASAN is enabled (default: ``on``). @@ -199,10 +206,6 @@ boot parameters that allow disabling KASAN or controlling its features. - ``kasan.stacktrace=off`` or ``=on`` disables or enables alloc and free stack traces collection (default: ``on``). -- ``kasan.fault=report`` or ``=panic`` controls whether to only print a KASAN - report or also panic the kernel (default: ``report``). The panic happens even - if ``kasan_multi_shot`` is enabled. - Implementation details ---------------------- diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 4ea8c368b5b8b..51903639e55fa 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -37,16 +37,9 @@ enum kasan_arg_stacktrace { KASAN_ARG_STACKTRACE_ON, }; -enum kasan_arg_fault { - KASAN_ARG_FAULT_DEFAULT, - KASAN_ARG_FAULT_REPORT, - KASAN_ARG_FAULT_PANIC, -}; - static enum kasan_arg kasan_arg __ro_after_init; static enum kasan_arg_mode kasan_arg_mode __ro_after_init; static enum kasan_arg_stacktrace kasan_arg_stacktrace __ro_after_init; -static enum kasan_arg_fault kasan_arg_fault __ro_after_init; /* Whether KASAN is enabled at all. */ DEFINE_STATIC_KEY_FALSE(kasan_flag_enabled); @@ -59,9 +52,6 @@ EXPORT_SYMBOL_GPL(kasan_flag_async); /* Whether to collect alloc/free stack traces. */ DEFINE_STATIC_KEY_FALSE(kasan_flag_stacktrace); -/* Whether to panic or print a report and disable tag checking on fault. */ -bool kasan_flag_panic __ro_after_init; - /* kasan=off/on */ static int __init early_kasan_flag(char *arg) { @@ -113,23 +103,6 @@ static int __init early_kasan_flag_stacktrace(char *arg) } early_param("kasan.stacktrace", early_kasan_flag_stacktrace); -/* kasan.fault=report/panic */ -static int __init early_kasan_fault(char *arg) -{ - if (!arg) - return -EINVAL; - - if (!strcmp(arg, "report")) - kasan_arg_fault = KASAN_ARG_FAULT_REPORT; - else if (!strcmp(arg, "panic")) - kasan_arg_fault = KASAN_ARG_FAULT_PANIC; - else - return -EINVAL; - - return 0; -} -early_param("kasan.fault", early_kasan_fault); - /* kasan_init_hw_tags_cpu() is called for each CPU. */ void kasan_init_hw_tags_cpu(void) { @@ -197,22 +170,6 @@ void __init kasan_init_hw_tags(void) break; } - switch (kasan_arg_fault) { - case KASAN_ARG_FAULT_DEFAULT: - /* - * Default to no panic on report. - * Do nothing, kasan_flag_panic keeps its default value. - */ - break; - case KASAN_ARG_FAULT_REPORT: - /* Do nothing, kasan_flag_panic keeps its default value. */ - break; - case KASAN_ARG_FAULT_PANIC: - /* Enable panic on report. */ - kasan_flag_panic = true; - break; - } - pr_info("KernelAddressSanitizer initialized\n"); } diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index d739cdd1621ab..fa02c88b6948f 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -37,7 +37,6 @@ static inline bool kasan_async_mode_enabled(void) #endif -extern bool kasan_flag_panic __ro_after_init; extern bool kasan_flag_async __ro_after_init; #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 8fff1825b22cd..884a950c70265 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -39,6 +39,31 @@ static unsigned long kasan_flags; #define KASAN_BIT_REPORTED 0 #define KASAN_BIT_MULTI_SHOT 1 +enum kasan_arg_fault { + KASAN_ARG_FAULT_DEFAULT, + KASAN_ARG_FAULT_REPORT, + KASAN_ARG_FAULT_PANIC, +}; + +static enum kasan_arg_fault kasan_arg_fault __ro_after_init = KASAN_ARG_FAULT_DEFAULT; + +/* kasan.fault=report/panic */ +static int __init early_kasan_fault(char *arg) +{ + if (!arg) + return -EINVAL; + + if (!strcmp(arg, "report")) + kasan_arg_fault = KASAN_ARG_FAULT_REPORT; + else if (!strcmp(arg, "panic")) + kasan_arg_fault = KASAN_ARG_FAULT_PANIC; + else + return -EINVAL; + + return 0; +} +early_param("kasan.fault", early_kasan_fault); + bool kasan_save_enable_multi_shot(void) { return test_and_set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); @@ -102,10 +127,8 @@ static void end_report(unsigned long *flags, unsigned long addr) panic_on_warn = 0; panic("panic_on_warn set ...\n"); } -#ifdef CONFIG_KASAN_HW_TAGS - if (kasan_flag_panic) + if (kasan_arg_fault == KASAN_ARG_FAULT_PANIC) panic("kasan.fault=panic set ...\n"); -#endif kasan_enable_current(); } From 9f386bf717678790619ef35cdf268d4dc71521a2 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 24 Aug 2021 09:59:24 +1000 Subject: [PATCH 160/365] kasan: test: rework kmalloc_oob_right Patch series "kasan: test: avoid crashing the kernel with HW_TAGS", v2. KASAN tests do out-of-bounds and use-after-free accesses. Running the tests works fine for the GENERIC mode, as it uses qurantine and redzones. But the HW_TAGS mode uses neither, and running the tests might crash the kernel. Rework the tests to avoid corrupting kernel memory. This patch (of 8): Rework kmalloc_oob_right() to do these bad access checks: 1. An unaligned access one byte past the requested kmalloc size (can only be detected by KASAN_GENERIC). 2. An aligned access into the first out-of-bounds granule that falls within the aligned kmalloc object. 3. Out-of-bounds access past the aligned kmalloc object. Test #3 deliberately uses a read access to avoid corrupting memory. Otherwise, this test might lead to crashes with the HW_TAGS mode, as it neither uses quarantine nor redzones. Link: https://lkml.kernel.org/r/cover.1628779805.git.andreyknvl@gmail.com Link: https://lkml.kernel.org/r/474aa8b7b538c6737a4c6d0090350af2e1776bef.1628779805.git.andreyknvl@gmail.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Alexander Potapenko Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- lib/test_kasan.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 8f7b0b2f6e11d..1bc3cdd2957f1 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -122,12 +122,28 @@ static void kasan_test_exit(struct kunit *test) static void kmalloc_oob_right(struct kunit *test) { char *ptr; - size_t size = 123; + size_t size = 128 - KASAN_GRANULE_SIZE - 5; ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); - KUNIT_EXPECT_KASAN_FAIL(test, ptr[size + OOB_TAG_OFF] = 'x'); + /* + * An unaligned access past the requested kmalloc size. + * Only generic KASAN can precisely detect these. + */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC)) + KUNIT_EXPECT_KASAN_FAIL(test, ptr[size] = 'x'); + + /* + * An aligned access into the first out-of-bounds granule that falls + * within the aligned kmalloc object. + */ + KUNIT_EXPECT_KASAN_FAIL(test, ptr[size + 5] = 'y'); + + /* Out-of-bounds access past the aligned kmalloc object. */ + KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = + ptr[size + KASAN_GRANULE_SIZE + 5]); + kfree(ptr); } From 1b20e9ea1aa1cb365bc9da5e80ca2c5067865179 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 24 Aug 2021 09:59:24 +1000 Subject: [PATCH 161/365] kasan: test: avoid writing invalid memory Multiple KASAN tests do writes past the allocated objects or writes to freed memory. Turn these writes into reads to avoid corrupting memory. Otherwise, these tests might lead to crashes with the HW_TAGS mode, as it neither uses quarantine nor redzones. Link: https://lkml.kernel.org/r/c3cd2a383e757e27dd9131635fc7d09a48a49cf9.1628779805.git.andreyknvl@gmail.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- lib/test_kasan.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 1bc3cdd2957f1..c82a82eb53932 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -167,7 +167,7 @@ static void kmalloc_node_oob_right(struct kunit *test) ptr = kmalloc_node(size, GFP_KERNEL, 0); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); - KUNIT_EXPECT_KASAN_FAIL(test, ptr[size] = 0); + KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = ptr[size]); kfree(ptr); } @@ -203,7 +203,7 @@ static void kmalloc_pagealloc_uaf(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); kfree(ptr); - KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = 0); + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[0]); } static void kmalloc_pagealloc_invalid_free(struct kunit *test) @@ -237,7 +237,7 @@ static void pagealloc_oob_right(struct kunit *test) ptr = page_address(pages); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); - KUNIT_EXPECT_KASAN_FAIL(test, ptr[size] = 0); + KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = ptr[size]); free_pages((unsigned long)ptr, order); } @@ -252,7 +252,7 @@ static void pagealloc_uaf(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); free_pages((unsigned long)ptr, order); - KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = 0); + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[0]); } static void kmalloc_large_oob_right(struct kunit *test) @@ -514,7 +514,7 @@ static void kmalloc_uaf(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); kfree(ptr); - KUNIT_EXPECT_KASAN_FAIL(test, *(ptr + 8) = 'x'); + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[8]); } static void kmalloc_uaf_memset(struct kunit *test) @@ -553,7 +553,7 @@ static void kmalloc_uaf2(struct kunit *test) goto again; } - KUNIT_EXPECT_KASAN_FAIL(test, ptr1[40] = 'x'); + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr1)[40]); KUNIT_EXPECT_PTR_NE(test, ptr1, ptr2); kfree(ptr2); @@ -700,7 +700,7 @@ static void ksize_unpoisons_memory(struct kunit *test) ptr[size] = 'x'; /* This one must. */ - KUNIT_EXPECT_KASAN_FAIL(test, ptr[real_size] = 'y'); + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[real_size]); kfree(ptr); } From 8a18c37a4ced6040f62f6845c534ea6ba012108c Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 24 Aug 2021 09:59:24 +1000 Subject: [PATCH 162/365] kasan: test: avoid corrupting memory via memset kmalloc_oob_memset_*() tests do writes past the allocated objects. As the result, they corrupt memory, which might lead to crashes with the HW_TAGS mode, as it neither uses quarantine nor redzones. Adjust the tests to only write memory within the aligned kmalloc objects. Also add a comment mentioning that memset tests are designed to touch both valid and invalid memory. Link: https://lkml.kernel.org/r/64fd457668a16e7b58d094f14a165f9d5170c5a9.1628779805.git.andreyknvl@gmail.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- lib/test_kasan.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index c82a82eb53932..db73bc9e3fa25 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -428,64 +428,70 @@ static void kmalloc_uaf_16(struct kunit *test) kfree(ptr1); } +/* + * Note: in the memset tests below, the written range touches both valid and + * invalid memory. This makes sure that the instrumentation does not only check + * the starting address but the whole range. + */ + static void kmalloc_oob_memset_2(struct kunit *test) { char *ptr; - size_t size = 8; + size_t size = 128 - KASAN_GRANULE_SIZE; ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); - KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + 7 + OOB_TAG_OFF, 0, 2)); + KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 1, 0, 2)); kfree(ptr); } static void kmalloc_oob_memset_4(struct kunit *test) { char *ptr; - size_t size = 8; + size_t size = 128 - KASAN_GRANULE_SIZE; ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); - KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + 5 + OOB_TAG_OFF, 0, 4)); + KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 3, 0, 4)); kfree(ptr); } - static void kmalloc_oob_memset_8(struct kunit *test) { char *ptr; - size_t size = 8; + size_t size = 128 - KASAN_GRANULE_SIZE; ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); - KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + 1 + OOB_TAG_OFF, 0, 8)); + KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 7, 0, 8)); kfree(ptr); } static void kmalloc_oob_memset_16(struct kunit *test) { char *ptr; - size_t size = 16; + size_t size = 128 - KASAN_GRANULE_SIZE; ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); - KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + 1 + OOB_TAG_OFF, 0, 16)); + KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 15, 0, 16)); kfree(ptr); } static void kmalloc_oob_in_memset(struct kunit *test) { char *ptr; - size_t size = 666; + size_t size = 128 - KASAN_GRANULE_SIZE; ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); - KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr, 0, size + 5 + OOB_TAG_OFF)); + KUNIT_EXPECT_KASAN_FAIL(test, + memset(ptr, 0, size + KASAN_GRANULE_SIZE)); kfree(ptr); } From 99a89f3309ae8943e3877d34cfc2b7b48774bc78 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 24 Aug 2021 09:59:25 +1000 Subject: [PATCH 163/365] kasan: test: disable kmalloc_memmove_invalid_size for HW_TAGS The HW_TAGS mode doesn't check memmove for negative size. As a result, the kmalloc_memmove_invalid_size test corrupts memory, which can result in a crash. Disable this test with HW_TAGS KASAN. Link: https://lkml.kernel.org/r/088733a06ac21eba29aa85b6f769d2abd74f9638.1628779805.git.andreyknvl@gmail.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- lib/test_kasan.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index db73bc9e3fa25..1f533a7346d91 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -501,11 +501,17 @@ static void kmalloc_memmove_invalid_size(struct kunit *test) size_t size = 64; volatile size_t invalid_size = -2; + /* + * Hardware tag-based mode doesn't check memmove for negative size. + * As a result, this test introduces a side-effect memory corruption, + * which can result in a crash. + */ + KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_HW_TAGS); + ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); memset((char *)ptr, 0, 64); - KUNIT_EXPECT_KASAN_FAIL(test, memmove((char *)ptr, (char *)ptr + 4, invalid_size)); kfree(ptr); From 97420c52bed1e66bf674fb9c336c78c4565a1d14 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 24 Aug 2021 09:59:25 +1000 Subject: [PATCH 164/365] kasan: test: only do kmalloc_uaf_memset for generic mode kmalloc_uaf_memset() writes to freed memory, which is only safe with the GENERIC mode (as it uses quarantine). For other modes, this test corrupts kernel memory, which might result in a crash. Only enable kmalloc_uaf_memset() for the GENERIC mode. Link: https://lkml.kernel.org/r/2e1c87b607b1292556cde3cab2764f108542b60c.1628779805.git.andreyknvl@gmail.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- lib/test_kasan.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 1f533a7346d91..1dcba6dbfc973 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -534,6 +534,12 @@ static void kmalloc_uaf_memset(struct kunit *test) char *ptr; size_t size = 33; + /* + * Only generic KASAN uses quarantine, which is required to avoid a + * kernel memory corruption this test causes. + */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC); + ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); From c1f4d699c75a9f53195e813e6e38db4ced1f1514 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 24 Aug 2021 09:59:25 +1000 Subject: [PATCH 165/365] kasan: test: clean up ksize_uaf Some KASAN tests use global variables to store function returns values so that the compiler doesn't optimize away these functions. ksize_uaf() doesn't call any functions, so it doesn't need to use kasan_int_result. Use volatile accesses instead, to be consistent with other similar tests. Link: https://lkml.kernel.org/r/a1fc34faca4650f4a6e4dfb3f8d8d82c82eb953a.1628779805.git.andreyknvl@gmail.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- lib/test_kasan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 1dcba6dbfc973..30f2cde96e813 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -737,8 +737,8 @@ static void ksize_uaf(struct kunit *test) kfree(ptr); KUNIT_EXPECT_KASAN_FAIL(test, ksize(ptr)); - KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = *ptr); - KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = *(ptr + size)); + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[0]); + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[size]); } static void kasan_stack_oob(struct kunit *test) From d65154586ce5dfea425b306f1bb99c67e2f2c659 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 24 Aug 2021 09:59:25 +1000 Subject: [PATCH 166/365] kasan: test: avoid corrupting memory in copy_user_test copy_user_test() does writes past the allocated object. As the result, it corrupts kernel memory, which might lead to crashes with the HW_TAGS mode, as it neither uses quarantine nor redzones. (Technically, this test can't yet be enabled with the HW_TAGS mode, but this will be implemented in the future.) Adjust the test to only write memory within the aligned kmalloc object. Link: https://lkml.kernel.org/r/19bf3a5112ee65b7db88dc731643b657b816c5e8.1628779805.git.andreyknvl@gmail.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- lib/test_kasan_module.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/lib/test_kasan_module.c b/lib/test_kasan_module.c index f1017f345d6cc..fa73b9df0be40 100644 --- a/lib/test_kasan_module.c +++ b/lib/test_kasan_module.c @@ -15,13 +15,11 @@ #include "../mm/kasan/kasan.h" -#define OOB_TAG_OFF (IS_ENABLED(CONFIG_KASAN_GENERIC) ? 0 : KASAN_GRANULE_SIZE) - static noinline void __init copy_user_test(void) { char *kmem; char __user *usermem; - size_t size = 10; + size_t size = 128 - KASAN_GRANULE_SIZE; int __maybe_unused unused; kmem = kmalloc(size, GFP_KERNEL); @@ -38,25 +36,25 @@ static noinline void __init copy_user_test(void) } pr_info("out-of-bounds in copy_from_user()\n"); - unused = copy_from_user(kmem, usermem, size + 1 + OOB_TAG_OFF); + unused = copy_from_user(kmem, usermem, size + 1); pr_info("out-of-bounds in copy_to_user()\n"); - unused = copy_to_user(usermem, kmem, size + 1 + OOB_TAG_OFF); + unused = copy_to_user(usermem, kmem, size + 1); pr_info("out-of-bounds in __copy_from_user()\n"); - unused = __copy_from_user(kmem, usermem, size + 1 + OOB_TAG_OFF); + unused = __copy_from_user(kmem, usermem, size + 1); pr_info("out-of-bounds in __copy_to_user()\n"); - unused = __copy_to_user(usermem, kmem, size + 1 + OOB_TAG_OFF); + unused = __copy_to_user(usermem, kmem, size + 1); pr_info("out-of-bounds in __copy_from_user_inatomic()\n"); - unused = __copy_from_user_inatomic(kmem, usermem, size + 1 + OOB_TAG_OFF); + unused = __copy_from_user_inatomic(kmem, usermem, size + 1); pr_info("out-of-bounds in __copy_to_user_inatomic()\n"); - unused = __copy_to_user_inatomic(usermem, kmem, size + 1 + OOB_TAG_OFF); + unused = __copy_to_user_inatomic(usermem, kmem, size + 1); pr_info("out-of-bounds in strncpy_from_user()\n"); - unused = strncpy_from_user(kmem, usermem, size + 1 + OOB_TAG_OFF); + unused = strncpy_from_user(kmem, usermem, size + 1); vm_munmap((unsigned long)usermem, PAGE_SIZE); kfree(kmem); From c14bdc7bee4f6e093e32a9ee34a20cd01338957a Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 24 Aug 2021 09:59:25 +1000 Subject: [PATCH 167/365] kasan: test: avoid corrupting memory in kasan_rcu_uaf kasan_rcu_uaf() writes to freed memory via kasan_rcu_reclaim(), which is only safe with the GENERIC mode (as it uses quarantine). For other modes, this test corrupts kernel memory, which might result in a crash. Turn the write into a read. Link: https://lkml.kernel.org/r/b6f2c3bf712d2457c783fa59498225b66a634f62.1628779805.git.andreyknvl@gmail.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- lib/test_kasan_module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/test_kasan_module.c b/lib/test_kasan_module.c index fa73b9df0be40..7ebf433edef3b 100644 --- a/lib/test_kasan_module.c +++ b/lib/test_kasan_module.c @@ -71,7 +71,7 @@ static noinline void __init kasan_rcu_reclaim(struct rcu_head *rp) struct kasan_rcu_info, rcu); kfree(fp); - fp->i = 1; + ((volatile struct kasan_rcu_info *)fp)->i; } static noinline void __init kasan_rcu_uaf(void) From 2bb812f987f33cfc6982b23b8494cd37bcfd96fb Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 24 Aug 2021 09:59:26 +1000 Subject: [PATCH 168/365] mm/page_alloc: always initialize memory map for the holes Patch series "mm: ensure consistency of memory map poisoning". Currently memory map allocation for FLATMEM case does not poison the struct pages regardless of CONFIG_PAGE_POISON setting. This happens because allocation of the memory map for FLATMEM and SPARSMEM use different memblock functions and those that are used for SPARSMEM case (namely memblock_alloc_try_nid_raw() and memblock_alloc_exact_nid_raw()) implicitly poison the allocated memory. Another side effect of this implicit poisoning is that early setup code that uses the same functions to allocate memory burns cycles for the memory poisoning even if it was not intended. These patches introduce memmap_alloc() wrapper that ensure that the memory map allocation is consistent for different memory models. This patch (of 4): Currently memory map for the holes is initialized only when SPARSEMEM memory model is used. Yet, even with FLATMEM there could be holes in the physical memory layout that have memory map entries. For instance, the memory reserved using e820 API on i386 or "reserved-memory" nodes in device tree would not appear in memblock.memory and hence the struct pages for such holes will be skipped during memory map initialization. These struct pages will be zeroed because the memory map for FLATMEM systems is allocated with memblock_alloc_node() that clears the allocated memory. While zeroed struct pages do not cause immediate problems, the correct behaviour is to initialize every page using __init_single_page(). Besides, enabling page poison for FLATMEM case will trigger PF_POISONED_CHECK() unless the memory map is properly initialized. Make sure init_unavailable_range() is called for both SPARSEMEM and FLATMEM so that struct pages representing memory holes would appear as PG_Reserved with any memory layout. Link: https://lkml.kernel.org/r/20210714123739.16493-1-rppt@kernel.org Link: https://lkml.kernel.org/r/20210714123739.16493-2-rppt@kernel.org Signed-off-by: Mike Rapoport Cc: Michal Simek Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_alloc.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 60a867a549bdc..bd2efd6287ce8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6642,7 +6642,6 @@ static void __meminit zone_init_free_lists(struct zone *zone) } } -#if !defined(CONFIG_FLATMEM) /* * Only struct pages that correspond to ranges defined by memblock.memory * are zeroed and initialized by going through __init_single_page() during @@ -6687,13 +6686,6 @@ static void __init init_unavailable_range(unsigned long spfn, pr_info("On node %d, zone %s: %lld pages in unavailable ranges", node, zone_names[zone], pgcnt); } -#else -static inline void init_unavailable_range(unsigned long spfn, - unsigned long epfn, - int zone, int node) -{ -} -#endif static void __init memmap_init_zone_range(struct zone *zone, unsigned long start_pfn, From 551ed2e6f5d05691db7ee6a759e5b2d04c6a2938 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 24 Aug 2021 09:59:26 +1000 Subject: [PATCH 169/365] mm-page_alloc-always-initialize-memory-map-for-the-holes-fix It appears that petalogix-ml605 memory starts at 0x50000, but microblaze's pfn_valid does not reject pfns < ARCH_PFN_OFFSET. Link: https://lkml.kernel.org/r/YQWW3RCE4eWBuMu/@kernel.org Tested-by: Guenter Roeck Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/microblaze/include/asm/page.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/microblaze/include/asm/page.h b/arch/microblaze/include/asm/page.h index ce550978f4fc2..4b8b2fa78fc5f 100644 --- a/arch/microblaze/include/asm/page.h +++ b/arch/microblaze/include/asm/page.h @@ -112,8 +112,7 @@ extern int page_is_ram(unsigned long pfn); # define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT) # define ARCH_PFN_OFFSET (memory_start >> PAGE_SHIFT) -# define pfn_valid(pfn) ((pfn) < (max_mapnr + ARCH_PFN_OFFSET)) - +# define pfn_valid(pfn) ((pfn) >= ARCH_PFN_OFFSET && (pfn) < (max_mapnr + ARCH_PFN_OFFSET)) # endif /* __ASSEMBLY__ */ #define virt_addr_valid(vaddr) (pfn_valid(virt_to_pfn(vaddr))) From 28c6e7e5a91e3809052c7b41aa2b9e128da2d714 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 24 Aug 2021 09:59:26 +1000 Subject: [PATCH 170/365] microblaze: simplify pte_alloc_one_kernel() The microblaze's implementation of pte_alloc_one_kernel() used memblock_alloc_try_nid_raw() along with clear_page() to allocated a zeroed page during early setup. Replace calls of these functions with a call to memblock_alloc_try_nid() that already returns zeroed page and respects the same allocation limits as memblock_alloc_try_nid_raw(). While on it drop early_get_page() wrapper that was only used in pte_alloc_one_kernel(). Link: https://lkml.kernel.org/r/20210714123739.16493-3-rppt@kernel.org Signed-off-by: Mike Rapoport Cc: Michal Simek Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/microblaze/include/asm/pgtable.h | 2 -- arch/microblaze/mm/init.c | 12 ------------ arch/microblaze/mm/pgtable.c | 17 ++++++++--------- 3 files changed, 8 insertions(+), 23 deletions(-) diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h index 71cd547655d9d..c136a01e467eb 100644 --- a/arch/microblaze/include/asm/pgtable.h +++ b/arch/microblaze/include/asm/pgtable.h @@ -443,8 +443,6 @@ extern int mem_init_done; asmlinkage void __init mmu_init(void); -void __init *early_get_page(void); - #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index ab55c70380a5c..952f35b335b26 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -265,18 +265,6 @@ asmlinkage void __init mmu_init(void) dma_contiguous_reserve(memory_start + lowmem_size - 1); } -/* This is only called until mem_init is done. */ -void __init *early_get_page(void) -{ - /* - * Mem start + kernel_tlb -> here is limit - * because of mem mapping from head.S - */ - return memblock_alloc_try_nid_raw(PAGE_SIZE, PAGE_SIZE, - MEMBLOCK_LOW_LIMIT, memory_start + kernel_tlb, - NUMA_NO_NODE); -} - void * __ref zalloc_maybe_bootmem(size_t size, gfp_t mask) { void *p; diff --git a/arch/microblaze/mm/pgtable.c b/arch/microblaze/mm/pgtable.c index 38ccb909bc9d9..c1833b159d3be 100644 --- a/arch/microblaze/mm/pgtable.c +++ b/arch/microblaze/mm/pgtable.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -242,15 +243,13 @@ unsigned long iopa(unsigned long addr) __ref pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { - pte_t *pte; - if (mem_init_done) { - pte = (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); - } else { - pte = (pte_t *)early_get_page(); - if (pte) - clear_page(pte); - } - return pte; + if (mem_init_done) + return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + else + return memblock_alloc_try_nid(PAGE_SIZE, PAGE_SIZE, + MEMBLOCK_LOW_LIMIT, + memory_start + kernel_tlb, + NUMA_NO_NODE); } void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags) From 30a333c706c4a180eb0c085aecb566fe9bc7a3de Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 24 Aug 2021 09:59:26 +1000 Subject: [PATCH 171/365] mm: introduce memmap_alloc() to unify memory map allocation There are several places that allocate memory for the memory map: alloc_node_mem_map() for FLATMEM, sparse_buffer_init() and __populate_section_memmap() for SPARSEMEM. The memory allocated in the FLATMEM case is zeroed and it is never poisoned, regardless of CONFIG_PAGE_POISON setting. The memory allocated in the SPARSEMEM cases is not zeroed and it is implicitly poisoned inside memblock if CONFIG_PAGE_POISON is set. Introduce memmap_alloc() wrapper for memblock allocators that will be used for both FLATMEM and SPARSEMEM cases and will makei memory map zeroing and poisoning consistent for different memory models. Link: https://lkml.kernel.org/r/20210714123739.16493-4-rppt@kernel.org Signed-off-by: Mike Rapoport Cc: Michal Simek Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/internal.h | 4 ++++ mm/page_alloc.c | 24 ++++++++++++++++++++++-- mm/sparse.c | 6 ++---- 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 31ff935b2547d..57e28261a3b17 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -211,6 +211,10 @@ extern void zone_pcp_reset(struct zone *zone); extern void zone_pcp_disable(struct zone *zone); extern void zone_pcp_enable(struct zone *zone); +extern void *memmap_alloc(phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, + int nid, bool exact_nid); + #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bd2efd6287ce8..71ad97c96075c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6748,6 +6748,26 @@ static void __init memmap_init(void) init_unavailable_range(hole_pfn, end_pfn, zone_id, nid); } +void __init *memmap_alloc(phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, int nid, bool exact_nid) +{ + void *ptr; + + if (exact_nid) + ptr = memblock_alloc_exact_nid_raw(size, align, min_addr, + MEMBLOCK_ALLOC_ACCESSIBLE, + nid); + else + ptr = memblock_alloc_try_nid_raw(size, align, min_addr, + MEMBLOCK_ALLOC_ACCESSIBLE, + nid); + + if (ptr && size > 0) + page_init_poison(ptr, size); + + return ptr; +} + static int zone_batchsize(struct zone *zone) { #ifdef CONFIG_MMU @@ -7519,8 +7539,8 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) end = pgdat_end_pfn(pgdat); end = ALIGN(end, MAX_ORDER_NR_PAGES); size = (end - start) * sizeof(struct page); - map = memblock_alloc_node(size, SMP_CACHE_BYTES, - pgdat->node_id); + map = memmap_alloc(size, SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT, + pgdat->node_id, false); if (!map) panic("Failed to allocate %ld bytes for node %d memory map\n", size, pgdat->node_id); diff --git a/mm/sparse.c b/mm/sparse.c index fd29b3abffa02..120bc8ea5293e 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -436,8 +436,7 @@ struct page __init *__populate_section_memmap(unsigned long pfn, if (map) return map; - map = memblock_alloc_try_nid_raw(size, size, addr, - MEMBLOCK_ALLOC_ACCESSIBLE, nid); + map = memmap_alloc(size, size, addr, nid, false); if (!map) panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa\n", __func__, size, PAGE_SIZE, nid, &addr); @@ -464,8 +463,7 @@ static void __init sparse_buffer_init(unsigned long size, int nid) * and we want it to be properly aligned to the section size - this is * especially the case for VMEMMAP which maps memmap to PMDs */ - sparsemap_buf = memblock_alloc_exact_nid_raw(size, section_map_size(), - addr, MEMBLOCK_ALLOC_ACCESSIBLE, nid); + sparsemap_buf = memmap_alloc(size, section_map_size(), addr, nid, true); sparsemap_buf_end = sparsemap_buf + size; } From 17931e564ba984880713460ab1dcc79233e6c17f Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 24 Aug 2021 09:59:26 +1000 Subject: [PATCH 172/365] memblock: stop poisoning raw allocations Functions memblock_alloc_exact_nid_raw() and memblock_alloc_try_nid_raw() are intended for early memory allocation without overhead of zeroing the allocated memory. Since these functions were used to allocate the memory map, they have ended up with addition of a call to page_init_poison() that poisoned the allocated memory when CONFIG_PAGE_POISON was set. Since the memory map is allocated using a dedicated memmep_alloc() function that takes care of the poisoning, remove page poisoning from the memblock_alloc_*_raw() functions. Link: https://lkml.kernel.org/r/20210714123739.16493-5-rppt@kernel.org Signed-off-by: Mike Rapoport Cc: Michal Simek Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memblock.c | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/mm/memblock.c b/mm/memblock.c index de7b553baa500..a69449bffc8d2 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1491,18 +1491,12 @@ void * __init memblock_alloc_exact_nid_raw( phys_addr_t min_addr, phys_addr_t max_addr, int nid) { - void *ptr; - memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n", __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr, (void *)_RET_IP_); - ptr = memblock_alloc_internal(size, align, - min_addr, max_addr, nid, true); - if (ptr && size > 0) - page_init_poison(ptr, size); - - return ptr; + return memblock_alloc_internal(size, align, min_addr, max_addr, nid, + true); } /** @@ -1529,18 +1523,12 @@ void * __init memblock_alloc_try_nid_raw( phys_addr_t min_addr, phys_addr_t max_addr, int nid) { - void *ptr; - memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n", __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr, (void *)_RET_IP_); - ptr = memblock_alloc_internal(size, align, - min_addr, max_addr, nid, false); - if (ptr && size > 0) - page_init_poison(ptr, size); - - return ptr; + return memblock_alloc_internal(size, align, min_addr, max_addr, nid, + false); } /** From 923ddb55aaf6e89b4445a81225e7cb1ee983d6ec Mon Sep 17 00:00:00 2001 From: Nico Pache Date: Tue, 24 Aug 2021 09:59:27 +1000 Subject: [PATCH 173/365] mm/page_alloc.c: fix 'zone_id' may be used uninitialized in this function warning When compiling with -Werror, cc1 will warn that 'zone_id' may be used uninitialized in this function warning. Initialize the zone_id as 0. Its safe to assume that if the code reaches this point it has at least one numa node with memory, so no need for an assertion before init_unavilable_range. Link: https://lkml.kernel.org/r/20210716210336.1114114-1-npache@redhat.com Fixes: 122e093c1734 ("mm/page_alloc: fix memory map initialization for descending nodes") Signed-off-by: Nico Pache Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 71ad97c96075c..04021e37120e6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6715,7 +6715,7 @@ static void __init memmap_init(void) { unsigned long start_pfn, end_pfn; unsigned long hole_pfn = 0; - int i, j, zone_id, nid; + int i, j, zone_id = 0, nid; for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { struct pglist_data *node = NODE_DATA(nid); From c7106f38b261bebf338878cfd3e05275e5e59e94 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 24 Aug 2021 09:59:27 +1000 Subject: [PATCH 174/365] mm/page_alloc: make alloc_node_mem_map() __init rather than __ref alloc_node_mem_map() is never only called from free_area_init_node() that is an __init function. Make the actual alloc_node_mem_map() also __init and its stub version static inline. Link: https://lkml.kernel.org/r/20210716064124.31865-1-rppt@kernel.org Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 04021e37120e6..05079b4ae7bc5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7515,7 +7515,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat) } #ifdef CONFIG_FLATMEM -static void __ref alloc_node_mem_map(struct pglist_data *pgdat) +static void __init alloc_node_mem_map(struct pglist_data *pgdat) { unsigned long __maybe_unused start = 0; unsigned long __maybe_unused offset = 0; @@ -7561,7 +7561,7 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) #endif } #else -static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { } +static inline void alloc_node_mem_map(struct pglist_data *pgdat) { } #endif /* CONFIG_FLATMEM */ #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT From 81b4f3bb4acbfb0c87d936ddeb749c4ecebef6cb Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Tue, 24 Aug 2021 09:59:27 +1000 Subject: [PATCH 175/365] mm/page_alloc.c: use in_task() Obsoleted in_intrrupt() include task context with disabled BH, it's better to use in_task() instead. Link: https://lkml.kernel.org/r/877caa99-1994-5545-92d2-d0bb2e394182@virtuozzo.com Signed-off-by: Vasily Averin Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 05079b4ae7bc5..eaa936efad7e4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4211,7 +4211,7 @@ static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) if (tsk_is_oom_victim(current) || (current->flags & (PF_MEMALLOC | PF_EXITING))) filter &= ~SHOW_MEM_FILTER_NODES; - if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) + if (!in_task() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) filter &= ~SHOW_MEM_FILTER_NODES; show_mem(filter, nodemask); @@ -4697,7 +4697,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask) * comment for __cpuset_node_allowed(). */ alloc_flags &= ~ALLOC_CPUSET; - } else if (unlikely(rt_task(current)) && !in_interrupt()) + } else if (unlikely(rt_task(current)) && in_task()) alloc_flags |= ALLOC_HARDER; alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags); @@ -5157,7 +5157,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, * When we are in the interrupt context, it is irrelevant * to the current task context. It means that any node ok. */ - if (!in_interrupt() && !ac->nodemask) + if (in_task() && !ac->nodemask) ac->nodemask = &cpuset_current_mems_allowed; else *alloc_flags |= ALLOC_CPUSET; From 0cda9b0410bebc97d36de9a3119a12868718ef28 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 24 Aug 2021 09:59:27 +1000 Subject: [PATCH 176/365] mm/hwpoison: remove unneeded variable unmap_success Patch series "Cleanups and fixup for hwpoison" This series contains cleanups to remove unneeded variable, fix some obsolete comments and so on. Also we fix potential pte_unmap_unlock on wrong pte. More details can be found in the respective changelogs. This patch (of 4): unmap_success is used to indicate whether page is successfully unmapped but it's irrelated with ZONE_DEVICE page and unmap_success is always true here. Remove this unneeded one. Link: https://lkml.kernel.org/r/20210814105131.48814-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20210814105131.48814-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memory-failure.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 470400cc75136..9793c78c37774 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1518,7 +1518,6 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, struct dev_pagemap *pgmap) { struct page *page = pfn_to_page(pfn); - const bool unmap_success = true; unsigned long size = 0; struct to_kill *tk; LIST_HEAD(tokill); @@ -1590,7 +1589,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, start = (page->index << PAGE_SHIFT) & ~(size - 1); unmap_mapping_range(page->mapping, start, size, 0); } - kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags); + kill_procs(&tokill, flags & MF_MUST_KILL, false, pfn, flags); rc = 0; unlock: dax_unlock_page(page, cookie); From 01da0ad257ab7c166716edda22deeee2a3830c2e Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 24 Aug 2021 09:59:27 +1000 Subject: [PATCH 177/365] mm/hwpoison: fix potential pte_unmap_unlock pte error If the first pte is equal to poisoned_pfn, i.e. check_hwpoisoned_entry() return 1, the wrong ptep - 1 would be passed to pte_unmap_unlock(). Link: https://lkml.kernel.org/r/20210814105131.48814-3-linmiaohe@huawei.com Fixes: ad9c59c24095 ("mm,hwpoison: send SIGBUS with error virutal address") Signed-off-by: Miaohe Lin Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memory-failure.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 9793c78c37774..224bd0be223c5 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -632,7 +632,7 @@ static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr, { struct hwp_walk *hwp = (struct hwp_walk *)walk->private; int ret = 0; - pte_t *ptep; + pte_t *ptep, *mapped_pte; spinlock_t *ptl; ptl = pmd_trans_huge_lock(pmdp, walk->vma); @@ -645,14 +645,15 @@ static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr, if (pmd_trans_unstable(pmdp)) goto out; - ptep = pte_offset_map_lock(walk->vma->vm_mm, pmdp, addr, &ptl); + mapped_pte = ptep = pte_offset_map_lock(walk->vma->vm_mm, pmdp, + addr, &ptl); for (; addr != end; ptep++, addr += PAGE_SIZE) { ret = check_hwpoisoned_entry(*ptep, addr, PAGE_SHIFT, hwp->pfn, &hwp->tk); if (ret == 1) break; } - pte_unmap_unlock(ptep - 1, ptl); + pte_unmap_unlock(mapped_pte, ptl); out: cond_resched(); return ret; From f139fae2fee54ecf6893687519840fa30733e88d Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 24 Aug 2021 09:59:27 +1000 Subject: [PATCH 178/365] mm/hwpoison: change argument struct page **hpagep to *hpage It's unnecessary to pass in a struct page **hpagep because it's never modified. Changing to use *hpage to simplify the code. Link: https://lkml.kernel.org/r/20210814105131.48814-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memory-failure.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 224bd0be223c5..102caf78aae8f 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1271,14 +1271,13 @@ static int get_hwpoison_page(struct page *p, unsigned long flags) * the pages and send SIGBUS to the processes if the data was dirty. */ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, - int flags, struct page **hpagep) + int flags, struct page *hpage) { enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC; struct address_space *mapping; LIST_HEAD(tokill); bool unmap_success; int kill = 1, forcekill; - struct page *hpage = *hpagep; bool mlocked = PageMlocked(hpage); /* @@ -1503,7 +1502,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) goto out; } - if (!hwpoison_user_mappings(p, pfn, flags, &head)) { + if (!hwpoison_user_mappings(p, pfn, flags, head)) { action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); res = -EBUSY; goto out; @@ -1783,7 +1782,7 @@ int memory_failure(unsigned long pfn, int flags) * Now take care of user space mappings. * Abort on fail: __delete_from_page_cache() assumes unmapped page. */ - if (!hwpoison_user_mappings(p, pfn, flags, &p)) { + if (!hwpoison_user_mappings(p, pfn, flags, p)) { action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); res = -EBUSY; goto unlock_page; From a73176262c2def2f7bfdc11a2e333efceb9d236c Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 24 Aug 2021 09:59:28 +1000 Subject: [PATCH 179/365] mm/hwpoison: fix some obsolete comments Since commit cb731d6c62bb ("vmscan: per memory cgroup slab shrinkers"), shrink_node_slabs is renamed to drop_slab_node. And doit argument is changed to forcekill since commit 6751ed65dc66 ("x86/mce: Fix siginfo_t->si_addr value for non-recoverable memory faults"). Link: https://lkml.kernel.org/r/20210814105131.48814-5-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memory-failure.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 102caf78aae8f..f83a2af0af180 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -296,7 +296,7 @@ void shake_page(struct page *p, int access) } /* - * Only call shrink_node_slabs here (which would also shrink + * Only call drop_slab_node here (which would also shrink * other caches) if access is not potentially fatal. */ if (access) @@ -391,8 +391,8 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, /* * Kill the processes that have been collected earlier. * - * Only do anything when DOIT is set, otherwise just free the list - * (this is used for clean pages which do not need killing) + * Only do anything when FORCEKILL is set, otherwise just free the + * list (this is used for clean pages which do not need killing) * Also when FAIL is set do a force kill because something went * wrong earlier. */ From 181f3a058cb4363e255c79c553f51f35ea70d178 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 24 Aug 2021 09:59:28 +1000 Subject: [PATCH 180/365] mm: hwpoison: don't drop slab caches for offlining non-LRU page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the current implementation of soft offline, if non-LRU page is met, all the slab caches will be dropped to free the page then offline. But if the page is not slab page all the effort is wasted in vain. Even though it is a slab page, it is not guaranteed the page could be freed at all. However the side effect and cost is quite high. It does not only drop the slab caches, but also may drop a significant amount of page caches which are associated with inode caches. It could make the most workingset gone in order to just offline a page. And the offline is not guaranteed to succeed at all, actually I really doubt the success rate for real life workload. Furthermore the worse consequence is the system may be locked up and unusable since the page cache release may incur huge amount of works queued for memcg release. Actually we ran into such unpleasant case in our production environment. Firstly, the workqueue of memory_failure_work_func is locked up as below: BUG: workqueue lockup - pool cpus=1 node=0 flags=0x0 nice=0 stuck for 53s! Showing busy workqueues and worker pools: workqueue events: flags=0x0   pwq 2: cpus=1 node=0 flags=0x0 nice=0 active=14/256 refcnt=15     in-flight: 409271:memory_failure_work_func     pending: kfree_rcu_work, kfree_rcu_monitor, kfree_rcu_work, rht_deferred_worker, rht_deferred_worker, rht_deferred_worker, rht_deferred_worker, kfree_rcu_work, kfree_rcu_work, kfree_rcu_work, kfree_rcu_work, drain_local_stock, kfree_rcu_work workqueue mm_percpu_wq: flags=0x8   pwq 2: cpus=1 node=0 flags=0x0 nice=0 active=1/256 refcnt=2     pending: vmstat_update workqueue cgroup_destroy: flags=0x0 pwq 2: cpus=1 node=0 flags=0x0 nice=0 active=1/1 refcnt=12072 pending: css_release_work_fn There were over 12K css_release_work_fn queued, and this caused a few lockups due to the contention of worker pool lock with IRQ disabled, for example: NMI watchdog: Watchdog detected hard LOCKUP on cpu 1 Modules linked in: amd64_edac_mod edac_mce_amd crct10dif_pclmul crc32_pclmul ghash_clmulni_intel xt_DSCP iptable_mangle kvm_amd bpfilter vfat fat acpi_ipmi i2c_piix4 usb_storage ipmi_si k10temp i2c_core ipmi_devintf ipmi_msghandler acpi_cpufreq sch_fq_codel xfs libcrc32c crc32c_intel mlx5_core mlxfw nvme xhci_pci ptp nvme_core pps_core xhci_hcd CPU: 1 PID: 205500 Comm: kworker/1:0 Tainted: G L 5.10.32-t1.el7.twitter.x86_64 #1 Hardware name: TYAN F5AMT /z /S8026GM2NRE-CGN, BIOS V8.030 03/30/2021 Workqueue: events memory_failure_work_func RIP: 0010:queued_spin_lock_slowpath+0x41/0x1a0 Code: 41 f0 0f ba 2f 08 0f 92 c0 0f b6 c0 c1 e0 08 89 c2 8b 07 30 e4 09 d0 a9 00 01 ff ff 75 1b 85 c0 74 0e 8b 07 84 c0 74 08 f3 90 <8b> 07 84 c0 75 f8 b8 01 00 00 00 66 89 07 c3 f6 c4 01 75 04 c6 47 RSP: 0018:ffff9b2ac278f900 EFLAGS: 00000002 RAX: 0000000000480101 RBX: ffff8ce98ce71800 RCX: 0000000000000084 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8ce98ce6a140 RBP: 00000000000284c8 R08: ffffd7248dcb6808 R09: 0000000000000000 R10: 0000000000000003 R11: ffff9b2ac278f9b0 R12: 0000000000000001 R13: ffff8cb44dab9c00 R14: ffffffffbd1ce6a0 R15: ffff8cacaa37f068 FS: 0000000000000000(0000) GS:ffff8ce98ce40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fcf6e8cb000 CR3: 0000000a0c60a000 CR4: 0000000000350ee0 Call Trace: __queue_work+0xd6/0x3c0 queue_work_on+0x1c/0x30 uncharge_batch+0x10e/0x110 mem_cgroup_uncharge_list+0x6d/0x80 release_pages+0x37f/0x3f0 __pagevec_release+0x1c/0x50 __invalidate_mapping_pages+0x348/0x380 ? xfs_alloc_buftarg+0xa4/0x120 [xfs] inode_lru_isolate+0x10a/0x160 ? iput+0x1d0/0x1d0 __list_lru_walk_one+0x7b/0x170 ? iput+0x1d0/0x1d0 list_lru_walk_one+0x4a/0x60 prune_icache_sb+0x37/0x50 super_cache_scan+0x123/0x1a0 do_shrink_slab+0x10c/0x2c0 shrink_slab+0x1f1/0x290 drop_slab_node+0x4d/0x70 soft_offline_page+0x1ac/0x5b0 ? dev_mce_log+0xee/0x110 ? notifier_call_chain+0x39/0x90 memory_failure_work_func+0x6a/0x90 process_one_work+0x19e/0x340 ? process_one_work+0x340/0x340 worker_thread+0x30/0x360 ? process_one_work+0x340/0x340 kthread+0x116/0x130 The lockup made the machine is quite unusable. And it also made the most workingset gone, the reclaimabled slab caches were reduced from 12G to 300MB, the page caches were decreased from 17G to 4G. But the most disappointing thing is all the effort doesn't make the page offline, it just returns: soft_offline: 0x1469f2: unknown non LRU page type 5ffff0000000000 () It seems the aggressive behavior for non-LRU page didn't pay back, so it doesn't make too much sense to keep it considering the terrible side effect. Link: https://lkml.kernel.org/r/20210819054116.266126-1-shy828301@gmail.com Signed-off-by: Yang Shi Reported-by: David Mackey Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/mm.h | 2 +- mm/hwpoison-inject.c | 2 +- mm/memory-failure.c | 18 ++++++++---------- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 35bbac32b6f69..11c38550627c8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3110,7 +3110,7 @@ extern void memory_failure_queue_kick(int cpu); extern int unpoison_memory(unsigned long pfn); extern int sysctl_memory_failure_early_kill; extern int sysctl_memory_failure_recovery; -extern void shake_page(struct page *p, int access); +extern void shake_page(struct page *p); extern atomic_long_t num_poisoned_pages __read_mostly; extern int soft_offline_page(unsigned long pfn, int flags); diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 1ae1ebc2b9b16..aff4d27ec2352 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -30,7 +30,7 @@ static int hwpoison_inject(void *data, u64 val) if (!hwpoison_filter_enable) goto inject; - shake_page(hpage, 0); + shake_page(hpage); /* * This implies unable to support non-LRU pages. */ diff --git a/mm/memory-failure.c b/mm/memory-failure.c index f83a2af0af180..5decacb86b9fc 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -282,9 +282,9 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags) /* * Unknown page type encountered. Try to check whether it can turn PageLRU by - * lru_add_drain_all, or a free page by reclaiming slabs when possible. + * lru_add_drain_all. */ -void shake_page(struct page *p, int access) +void shake_page(struct page *p) { if (PageHuge(p)) return; @@ -296,11 +296,9 @@ void shake_page(struct page *p, int access) } /* - * Only call drop_slab_node here (which would also shrink - * other caches) if access is not potentially fatal. + * TODO: Could shrink slab caches here if a lightweight range-based + * shrinker will be available. */ - if (access) - drop_slab_node(page_to_nid(p)); } EXPORT_SYMBOL_GPL(shake_page); @@ -1205,7 +1203,7 @@ static int get_any_page(struct page *p, unsigned long flags) * page, retry. */ if (pass++ < 3) { - shake_page(p, 1); + shake_page(p); goto try_again; } ret = -EIO; @@ -1222,7 +1220,7 @@ static int get_any_page(struct page *p, unsigned long flags) */ if (pass++ < 3) { put_page(p); - shake_page(p, 1); + shake_page(p); count_increased = false; goto try_again; } @@ -1369,7 +1367,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, * shake_page() again to ensure that it's flushed. */ if (mlocked) - shake_page(hpage, 0); + shake_page(hpage); /* * Now that the dirty bit has been propagated to the @@ -1723,7 +1721,7 @@ int memory_failure(unsigned long pfn, int flags) * The check (unnecessarily) ignores LRU pages being isolated and * walked by the page reclaim code, however that's not a big loss. */ - shake_page(p, 0); + shake_page(p); lock_page(p); From 6556acab3567a37b4c8b2e0527791817931e6194 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 24 Aug 2021 09:59:28 +1000 Subject: [PATCH 181/365] doc: hwpoison: correct the support for hugepage The hwpoison support for huge page, both hugetlb and THP, has been in kernel for a while, the statement in document is obsolete, correct it. Link: https://lkml.kernel.org/r/20210819054116.266126-2-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Naoya Horiguchi Cc: Oscar Salvador Cc: David Hildenbrand Cc: David Mackey Cc: Jonathan Corbet Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- Documentation/vm/hwpoison.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/Documentation/vm/hwpoison.rst b/Documentation/vm/hwpoison.rst index a5c884293dac4..89b5f7a520771 100644 --- a/Documentation/vm/hwpoison.rst +++ b/Documentation/vm/hwpoison.rst @@ -180,7 +180,6 @@ Limitations =========== - Not all page types are supported and never will. Most kernel internal objects cannot be recovered, only LRU pages for now. -- Right now hugepage support is missing. --- Andi Kleen, Oct 2009 From bdd57b87a748f564ba16f517049d1c8b981726cd Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 24 Aug 2021 09:59:28 +1000 Subject: [PATCH 182/365] mm: hwpoison: dump page for unhandlable page Currently just very simple message is shown for unhandlable page, e.g. non-LRU page, like: soft_offline: 0x1469f2: unknown non LRU page type 5ffff0000000000 () It is not very helpful for further debug, calling dump_page() could show more useful information. Calling dump_page() in get_any_page() in order to not duplicate the call in a couple of different places. It may be called with pcp disabled and holding memory hotplug lock, it should be not a big deal since hwpoison handler is not called very often. Link: https://lkml.kernel.org/r/20210819054116.266126-3-shy828301@gmail.com Signed-off-by: Yang Shi Suggested-by: Matthew Wilcox Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: David Hildenbrand Cc: David Mackey Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memory-failure.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 5decacb86b9fc..d1e6f4d948e59 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1228,6 +1228,9 @@ static int get_any_page(struct page *p, unsigned long flags) ret = -EIO; } out: + if (ret == -EIO) + dump_page(p, "hwpoison: unhandlable page"); + return ret; } From 66adf03f7c684efb79f559d1c33b2f2533167d2f Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 24 Aug 2021 09:59:28 +1000 Subject: [PATCH 183/365] hugetlb: simplify prep_compound_gigantic_page ref count racing code Code in prep_compound_gigantic_page waits for a rcu grace period if it notices a temporarily inflated ref count on a tail page. This was due to the identified potential race with speculative page cache references which could only last for a rcu grace period. This is overly complicated as this situation is VERY unlikely to ever happen. Instead, just quickly return an error. Also, only print a warning in prep_compound_gigantic_page instead of multiple callers. Link: https://lkml.kernel.org/r/20210809184832.18342-2-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Cc: David Hildenbrand Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/hugetlb.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8ea35ba6699f2..3658398c835d0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1657,16 +1657,14 @@ static bool prep_compound_gigantic_page(struct page *page, unsigned int order) * cache adding could take a ref on a 'to be' tail page. * We need to respect any increased ref count, and only set * the ref count to zero if count is currently 1. If count - * is not 1, we call synchronize_rcu in the hope that a rcu - * grace period will cause ref count to drop and then retry. - * If count is still inflated on retry we return an error and - * must discard the pages. + * is not 1, we return an error. An error return indicates + * the set of pages can not be converted to a gigantic page. + * The caller who allocated the pages should then discard the + * pages using the appropriate free interface. */ if (!page_ref_freeze(p, 1)) { - pr_info("HugeTLB unexpected inflated ref count on freshly allocated page\n"); - synchronize_rcu(); - if (!page_ref_freeze(p, 1)) - goto out_error; + pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n"); + goto out_error; } set_page_count(p, 0); set_compound_head(p, page); @@ -1830,7 +1828,6 @@ static struct page *alloc_fresh_huge_page(struct hstate *h, retry = true; goto retry; } - pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n"); return NULL; } } @@ -2828,8 +2825,8 @@ static void __init gather_bootmem_prealloc(void) prep_new_huge_page(h, page, page_to_nid(page)); put_page(page); /* add to the hugepage allocator */ } else { + /* VERY unlikely inflated ref count on a tail page */ free_gigantic_page(page, huge_page_order(h)); - pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n"); } /* From 4f8d3fe8a0b7bb035df44b046c276d4810df7826 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 24 Aug 2021 09:59:29 +1000 Subject: [PATCH 184/365] hugetlb: drop ref count earlier after page allocation When discussing the possibility of inflated page ref counts, Muuchun Song pointed out this potential issue [1]. It is true that any code could potentially take a reference on a compound page after allocation and before it is converted to and put into use as a hugetlb page. Specifically, this could be done by any users of get_page_unless_zero. There are three areas of concern within hugetlb code. 1) When adding pages to the pool. In this case, new pages are allocated added to the pool by calling put_page to invoke the hugetlb destructor (free_huge_page). If there is an inflated ref count on the page, it will not be immediately added to the free list. It will only be added to the free list when the temporary ref count is dropped. This is deemed acceptable and will not be addressed. 2) A page is allocated for immediate use normally as a surplus page or migration target. In this case, the user of the page will also hold a reference. There is no issue as this is just like normal page ref counting. 3) A page is allocated and MUST be added to the free list to satisfy a reservation. One such example is gather_surplus_pages as pointed out by Muchun in [1]. More specifically, this case covers callers of enqueue_huge_page where the page reference count must be zero. This patch covers this third case. Three routines call enqueue_huge_page when the page reference count could potentially be inflated. They are: gather_surplus_pages, alloc_and_dissolve_huge_page and add_hugetlb_page. add_hugetlb_page is called on error paths when a huge page can not be freed due to the inability to allocate vmemmap pages. In this case, the temporairly inflated ref count is not an issue. When the ref is dropped the appropriate action will be taken. Instead of VM_BUG_ON if the ref count does not drop to zero, simply return. In gather_surplus_pages and alloc_and_dissolve_huge_page the caller expects a page (or pages) to be put on the free lists. In this case we must ensure there are no temporary ref counts. We do this by calling put_page_testzero() earlier and not using pages without a zero ref count. The temporary page flag (HPageTemporary) is used in such cases so that as soon as the inflated ref count is dropped the page will be freed. [1] https://lore.kernel.org/linux-mm/CAMZfGtVMn3daKrJwZMaVOGOaJU+B4dS--x_oPmGQMD=c=QNGEg@mail.gmail.com/ Link: https://lkml.kernel.org/r/20210809184832.18342-3-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Cc: David Hildenbrand Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/hugetlb.c | 100 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 78 insertions(+), 22 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3658398c835d0..8f97321396cc8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1072,6 +1072,8 @@ static void enqueue_huge_page(struct hstate *h, struct page *page) int nid = page_to_nid(page); lockdep_assert_held(&hugetlb_lock); + VM_BUG_ON_PAGE(page_count(page), page); + list_move(&page->lru, &h->hugepage_freelists[nid]); h->free_huge_pages++; h->free_huge_pages_node[nid]++; @@ -1399,11 +1401,20 @@ static void add_hugetlb_page(struct hstate *h, struct page *page, SetHPageVmemmapOptimized(page); /* - * This page is now managed by the hugetlb allocator and has - * no users -- drop the last reference. + * This page is about to be managed by the hugetlb allocator and + * should have no users. Drop our reference, and check for others + * just in case. */ zeroed = put_page_testzero(page); - VM_BUG_ON_PAGE(!zeroed, page); + if (!zeroed) + /* + * It is VERY unlikely soneone else has taken a ref on + * the page. In this case, we simply return as the + * hugetlb destructor (free_huge_page) will be called + * when this other ref is dropped. + */ + return; + arch_clear_hugepage_flags(page); enqueue_huge_page(h, page); } @@ -2017,9 +2028,10 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) * Allocates a fresh surplus page from the page allocator. */ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, - int nid, nodemask_t *nmask) + int nid, nodemask_t *nmask, bool zero_ref) { struct page *page = NULL; + bool retry = false; if (hstate_is_gigantic(h)) return NULL; @@ -2029,6 +2041,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, goto out_unlock; spin_unlock_irq(&hugetlb_lock); +retry: page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL); if (!page) return NULL; @@ -2046,11 +2059,35 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, spin_unlock_irq(&hugetlb_lock); put_page(page); return NULL; - } else { - h->surplus_huge_pages++; - h->surplus_huge_pages_node[page_to_nid(page)]++; } + if (zero_ref) { + /* + * Caller requires a page with zero ref count. + * We will drop ref count here. If someone else is holding + * a ref, the page will be freed when they drop it. Abuse + * temporary page flag to accomplish this. + */ + SetHPageTemporary(page); + if (!put_page_testzero(page)) { + /* + * Unexpected inflated ref count on freshly allocated + * huge. Retry once. + */ + pr_info("HugeTLB unexpected inflated ref count on freshly allocated page\n"); + spin_unlock_irq(&hugetlb_lock); + if (retry) + return NULL; + + retry = true; + goto retry; + } + ClearHPageTemporary(page); + } + + h->surplus_huge_pages++; + h->surplus_huge_pages_node[page_to_nid(page)]++; + out_unlock: spin_unlock_irq(&hugetlb_lock); @@ -2092,7 +2129,7 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, nodemask_t *nodemask; nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask); - page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask); + page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask, false); mpol_cond_put(mpol); return page; @@ -2164,7 +2201,7 @@ static int gather_surplus_pages(struct hstate *h, long delta) spin_unlock_irq(&hugetlb_lock); for (i = 0; i < needed; i++) { page = alloc_surplus_huge_page(h, htlb_alloc_mask(h), - NUMA_NO_NODE, NULL); + NUMA_NO_NODE, NULL, true); if (!page) { alloc_ok = false; break; @@ -2205,24 +2242,20 @@ static int gather_surplus_pages(struct hstate *h, long delta) /* Free the needed pages to the hugetlb pool */ list_for_each_entry_safe(page, tmp, &surplus_list, lru) { - int zeroed; - if ((--needed) < 0) break; - /* - * This page is now managed by the hugetlb allocator and has - * no users -- drop the buddy allocator's reference. - */ - zeroed = put_page_testzero(page); - VM_BUG_ON_PAGE(!zeroed, page); + /* Add the page to the hugetlb allocator */ enqueue_huge_page(h, page); } free: spin_unlock_irq(&hugetlb_lock); - /* Free unnecessary surplus pages to the buddy allocator */ + /* + * Free unnecessary surplus pages to the buddy allocator. + * Pages have no ref count, call free_huge_page directly. + */ list_for_each_entry_safe(page, tmp, &surplus_list, lru) - put_page(page); + free_huge_page(page); spin_lock_irq(&hugetlb_lock); return ret; @@ -2531,6 +2564,7 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page, { gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; int nid = page_to_nid(old_page); + bool alloc_retry = false; struct page *new_page; int ret = 0; @@ -2541,9 +2575,30 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page, * the pool. This simplifies and let us do most of the processing * under the lock. */ +alloc_retry: new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL); if (!new_page) return -ENOMEM; + /* + * If all goes well, this page will be directly added to the free + * list in the pool. For this the ref count needs to be zero. + * Attempt to drop now, and retry once if needed. It is VERY + * unlikely there is another ref on the page. + * + * If someone else has a reference to the page, it will be freed + * when they drop their ref. Abuse temporary page flag to accomplish + * this. Retry once if there is an inflated ref count. + */ + SetHPageTemporary(new_page); + if (!put_page_testzero(new_page)) { + if (alloc_retry) + return -EBUSY; + + alloc_retry = true; + goto alloc_retry; + } + ClearHPageTemporary(new_page); + __prep_new_huge_page(h, new_page); retry: @@ -2583,11 +2638,10 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page, remove_hugetlb_page(h, old_page, false); /* - * Reference count trick is needed because allocator gives us - * referenced page but the pool requires pages with 0 refcount. + * Ref count on new page is already zero as it was dropped + * earlier. It can be directly added to the pool free list. */ __prep_account_new_huge_page(h, nid); - page_ref_dec(new_page); enqueue_huge_page(h, new_page); /* @@ -2601,6 +2655,8 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page, free_new: spin_unlock_irq(&hugetlb_lock); + /* Page has a zero ref count, but needs a ref to be freed */ + set_page_refcounted(new_page); update_and_free_page(h, new_page, false); return ret; From 225ee545af17ed09a70ceadd88cd375ece51034d Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 24 Aug 2021 09:59:29 +1000 Subject: [PATCH 185/365] hugetlb: before freeing hugetlb page set dtor to appropriate value When removing a hugetlb page from the pool the ref count is set to one (as the free page has no ref count) and compound page destructor is set to NULL_COMPOUND_DTOR. Since a subsequent call to free the hugetlb page will call __free_pages for non-gigantic pages and free_gigantic_page for gigantic pages the destructor is not used. However, consider the following race with code taking a speculative reference on the page: Thread 0 Thread 1 -------- -------- remove_hugetlb_page set_page_refcounted(page); set_compound_page_dtor(page, NULL_COMPOUND_DTOR); get_page_unless_zero(page) __update_and_free_page __free_pages(page, huge_page_order(h)); /* Note that __free_pages() will simply drop the reference to the page. */ put_page(page) __put_compound_page() destroy_compound_page NULL_COMPOUND_DTOR BUG: kernel NULL pointer dereference, address: 0000000000000000 To address this race, set the dtor to the normal compound page dtor for non-gigantic pages. The dtor for gigantic pages does not matter as gigantic pages are changed from a compound page to 'just a group of pages' before freeing. Hence, the destructor is not used. Link: https://lkml.kernel.org/r/20210809184832.18342-4-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Michal Hocko Cc: Oscar Salvador Cc: David Hildenbrand Cc: Matthew Wilcox Cc: Naoya Horiguchi Cc: Mina Almasry Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/hugetlb.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8f97321396cc8..dd1c1e7d970b7 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1370,8 +1370,28 @@ static void remove_hugetlb_page(struct hstate *h, struct page *page, h->surplus_huge_pages_node[nid]--; } + /* + * Very subtle + * + * For non-gigantic pages set the destructor to the normal compound + * page dtor. This is needed in case someone takes an additional + * temporary ref to the page, and freeing is delayed until they drop + * their reference. + * + * For gigantic pages set the destructor to the null dtor. This + * destructor will never be called. Before freeing the gigantic + * page destroy_compound_gigantic_page will turn the compound page + * into a simple group of pages. After this the destructor does not + * apply. + * + * This handles the case where more than one ref is held when and + * after update_and_free_page is called. + */ set_page_refcounted(page); - set_compound_page_dtor(page, NULL_COMPOUND_DTOR); + if (hstate_is_gigantic(h)) + set_compound_page_dtor(page, NULL_COMPOUND_DTOR); + else + set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); h->nr_huge_pages--; h->nr_huge_pages_node[nid]--; From b487f4612476bc5102dd8301e0bc80691a37b9a0 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Tue, 24 Aug 2021 09:59:29 +1000 Subject: [PATCH 186/365] userfaultfd: change mmap_changing to atomic Patch series "userfaultfd: minor bug fixes". Three unrelated bug fixes. The first two addresses possible issues (not too theoretical ones), but I did not encounter them in practice. The third patch addresses a test bug that causes the test to fail on my system. It has been sent before as part of a bigger RFC. This patch (of 3): mmap_changing is currently a boolean variable, which is set and cleared without any lock that protects against concurrent modifications. mmap_chanign is supposed to mark whether userfaultfd page-faults handling should be retried since mappings are undergoing a change. However, concurrent calls, for instance to madvise(MADV_DONTNEED), might cause mmap_changing to be false, although the remove event was still not read (hence acknowledged) by the user. Change mmap_changing to atomic_t and increase/decrease appropriately. Add a debug assertion to see whether mmap_changing is negative. Link: https://lkml.kernel.org/r/20210808020724.1022515-1-namit@vmware.com Link: https://lkml.kernel.org/r/20210808020724.1022515-2-namit@vmware.com Fixes: df2cc96e77011 ("userfaultfd: prevent non-cooperative events vs mcopy_atomic races") Signed-off-by: Nadav Amit Cc: Mike Rapoport Cc: Peter Xu Cc: Axel Rasmussen Cc: Alexander Viro Cc: Andrea Arcangeli Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/userfaultfd.c | 25 +++++++++++++------------ include/linux/userfaultfd_k.h | 8 ++++---- mm/userfaultfd.c | 15 ++++++++------- 3 files changed, 25 insertions(+), 23 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 5c2d806e6ae53..29a3016f16c9c 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -74,7 +74,7 @@ struct userfaultfd_ctx { /* released */ bool released; /* memory mappings are changing because of non-cooperative event */ - bool mmap_changing; + atomic_t mmap_changing; /* mm with one ore more vmas attached to this userfaultfd_ctx */ struct mm_struct *mm; }; @@ -623,7 +623,8 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, * already released. */ out: - WRITE_ONCE(ctx->mmap_changing, false); + atomic_dec(&ctx->mmap_changing); + VM_BUG_ON(atomic_read(&ctx->mmap_changing) < 0); userfaultfd_ctx_put(ctx); } @@ -669,12 +670,12 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) ctx->state = UFFD_STATE_RUNNING; ctx->features = octx->features; ctx->released = false; - ctx->mmap_changing = false; + atomic_set(&ctx->mmap_changing, 0); ctx->mm = vma->vm_mm; mmgrab(ctx->mm); userfaultfd_ctx_get(octx); - WRITE_ONCE(octx->mmap_changing, true); + atomic_inc(&octx->mmap_changing); fctx->orig = octx; fctx->new = ctx; list_add_tail(&fctx->list, fcs); @@ -721,7 +722,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma, if (ctx->features & UFFD_FEATURE_EVENT_REMAP) { vm_ctx->ctx = ctx; userfaultfd_ctx_get(ctx); - WRITE_ONCE(ctx->mmap_changing, true); + atomic_inc(&ctx->mmap_changing); } else { /* Drop uffd context if remap feature not enabled */ vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; @@ -766,7 +767,7 @@ bool userfaultfd_remove(struct vm_area_struct *vma, return true; userfaultfd_ctx_get(ctx); - WRITE_ONCE(ctx->mmap_changing, true); + atomic_inc(&ctx->mmap_changing); mmap_read_unlock(mm); msg_init(&ewq.msg); @@ -810,7 +811,7 @@ int userfaultfd_unmap_prep(struct vm_area_struct *vma, return -ENOMEM; userfaultfd_ctx_get(ctx); - WRITE_ONCE(ctx->mmap_changing, true); + atomic_inc(&ctx->mmap_changing); unmap_ctx->ctx = ctx; unmap_ctx->start = start; unmap_ctx->end = end; @@ -1700,7 +1701,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, user_uffdio_copy = (struct uffdio_copy __user *) arg; ret = -EAGAIN; - if (READ_ONCE(ctx->mmap_changing)) + if (atomic_read(&ctx->mmap_changing)) goto out; ret = -EFAULT; @@ -1757,7 +1758,7 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg; ret = -EAGAIN; - if (READ_ONCE(ctx->mmap_changing)) + if (atomic_read(&ctx->mmap_changing)) goto out; ret = -EFAULT; @@ -1807,7 +1808,7 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx, struct userfaultfd_wake_range range; bool mode_wp, mode_dontwake; - if (READ_ONCE(ctx->mmap_changing)) + if (atomic_read(&ctx->mmap_changing)) return -EAGAIN; user_uffdio_wp = (struct uffdio_writeprotect __user *) arg; @@ -1855,7 +1856,7 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) user_uffdio_continue = (struct uffdio_continue __user *)arg; ret = -EAGAIN; - if (READ_ONCE(ctx->mmap_changing)) + if (atomic_read(&ctx->mmap_changing)) goto out; ret = -EFAULT; @@ -2087,7 +2088,7 @@ SYSCALL_DEFINE1(userfaultfd, int, flags) ctx->features = 0; ctx->state = UFFD_STATE_WAIT_API; ctx->released = false; - ctx->mmap_changing = false; + atomic_set(&ctx->mmap_changing, 0); ctx->mm = current->mm; /* prevent the mm struct to be freed */ mmgrab(ctx->mm); diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 331d2ccf0bccb..33cea484d1ad7 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -60,16 +60,16 @@ extern int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, - bool *mmap_changing, __u64 mode); + atomic_t *mmap_changing, __u64 mode); extern ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long len, - bool *mmap_changing); + atomic_t *mmap_changing); extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start, - unsigned long len, bool *mmap_changing); + unsigned long len, atomic_t *mmap_changing); extern int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, unsigned long len, - bool enable_wp, bool *mmap_changing); + bool enable_wp, atomic_t *mmap_changing); /* mm helpers */ static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 0e2132834bc7d..7a90084155343 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -483,7 +483,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, unsigned long src_start, unsigned long len, enum mcopy_atomic_mode mcopy_mode, - bool *mmap_changing, + atomic_t *mmap_changing, __u64 mode) { struct vm_area_struct *dst_vma; @@ -517,7 +517,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, * request the user to retry later */ err = -EAGAIN; - if (mmap_changing && READ_ONCE(*mmap_changing)) + if (mmap_changing && atomic_read(mmap_changing)) goto out_unlock; /* @@ -650,28 +650,29 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, - bool *mmap_changing, __u64 mode) + atomic_t *mmap_changing, __u64 mode) { return __mcopy_atomic(dst_mm, dst_start, src_start, len, MCOPY_ATOMIC_NORMAL, mmap_changing, mode); } ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start, - unsigned long len, bool *mmap_changing) + unsigned long len, atomic_t *mmap_changing) { return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE, mmap_changing, 0); } ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start, - unsigned long len, bool *mmap_changing) + unsigned long len, atomic_t *mmap_changing) { return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE, mmap_changing, 0); } int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, - unsigned long len, bool enable_wp, bool *mmap_changing) + unsigned long len, bool enable_wp, + atomic_t *mmap_changing) { struct vm_area_struct *dst_vma; pgprot_t newprot; @@ -694,7 +695,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, * request the user to retry later */ err = -EAGAIN; - if (mmap_changing && READ_ONCE(*mmap_changing)) + if (mmap_changing && atomic_read(mmap_changing)) goto out_unlock; err = -ENOENT; From e3952476cd5037433b69dc9966e933d69e23f046 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Tue, 24 Aug 2021 09:59:29 +1000 Subject: [PATCH 187/365] userfaultfd: prevent concurrent API initialization userfaultfd assumes that the enabled features are set once and never changed after UFFDIO_API ioctl succeeded. However, currently, UFFDIO_API can be called concurrently from two different threads, succeed on both threads and leave userfaultfd's features in non-deterministic state. Theoretically, other uffd operations (ioctl's and page-faults) can be dispatched while adversely affected by such changes of features. Moreover, the writes to ctx->state and ctx->features are not ordered, which can - theoretically, again - let userfaultfd_ioctl() think that userfaultfd API completed, while the features are still not initialized. To avoid races, it is arguably best to get rid of ctx->state. Since there are only 2 states, record the API initialization in ctx->features as the uppermost bit and remove ctx->state. Link: https://lkml.kernel.org/r/20210808020724.1022515-3-namit@vmware.com Fixes: 9cd75c3cd4c3d ("userfaultfd: non-cooperative: add ability to report non-PF events from uffd descriptor") Signed-off-by: Nadav Amit Cc: Alexander Viro Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Jens Axboe Cc: Mike Rapoport Cc: Peter Xu Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/userfaultfd.c | 91 +++++++++++++++++++++++------------------------- 1 file changed, 44 insertions(+), 47 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 29a3016f16c9c..003f0d31743eb 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -33,11 +33,6 @@ int sysctl_unprivileged_userfaultfd __read_mostly; static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly; -enum userfaultfd_state { - UFFD_STATE_WAIT_API, - UFFD_STATE_RUNNING, -}; - /* * Start with fault_pending_wqh and fault_wqh so they're more likely * to be in the same cacheline. @@ -69,8 +64,6 @@ struct userfaultfd_ctx { unsigned int flags; /* features requested from the userspace */ unsigned int features; - /* state machine */ - enum userfaultfd_state state; /* released */ bool released; /* memory mappings are changing because of non-cooperative event */ @@ -104,6 +97,14 @@ struct userfaultfd_wake_range { unsigned long len; }; +/* internal indication that UFFD_API ioctl was successfully executed */ +#define UFFD_FEATURE_INITIALIZED (1u << 31) + +static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx) +{ + return ctx->features & UFFD_FEATURE_INITIALIZED; +} + static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode, int wake_flags, void *key) { @@ -667,7 +668,6 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) refcount_set(&ctx->refcount, 1); ctx->flags = octx->flags; - ctx->state = UFFD_STATE_RUNNING; ctx->features = octx->features; ctx->released = false; atomic_set(&ctx->mmap_changing, 0); @@ -944,38 +944,33 @@ static __poll_t userfaultfd_poll(struct file *file, poll_table *wait) poll_wait(file, &ctx->fd_wqh, wait); - switch (ctx->state) { - case UFFD_STATE_WAIT_API: + if (!userfaultfd_is_initialized(ctx)) return EPOLLERR; - case UFFD_STATE_RUNNING: - /* - * poll() never guarantees that read won't block. - * userfaults can be waken before they're read(). - */ - if (unlikely(!(file->f_flags & O_NONBLOCK))) - return EPOLLERR; - /* - * lockless access to see if there are pending faults - * __pollwait last action is the add_wait_queue but - * the spin_unlock would allow the waitqueue_active to - * pass above the actual list_add inside - * add_wait_queue critical section. So use a full - * memory barrier to serialize the list_add write of - * add_wait_queue() with the waitqueue_active read - * below. - */ - ret = 0; - smp_mb(); - if (waitqueue_active(&ctx->fault_pending_wqh)) - ret = EPOLLIN; - else if (waitqueue_active(&ctx->event_wqh)) - ret = EPOLLIN; - return ret; - default: - WARN_ON_ONCE(1); + /* + * poll() never guarantees that read won't block. + * userfaults can be waken before they're read(). + */ + if (unlikely(!(file->f_flags & O_NONBLOCK))) return EPOLLERR; - } + /* + * lockless access to see if there are pending faults + * __pollwait last action is the add_wait_queue but + * the spin_unlock would allow the waitqueue_active to + * pass above the actual list_add inside + * add_wait_queue critical section. So use a full + * memory barrier to serialize the list_add write of + * add_wait_queue() with the waitqueue_active read + * below. + */ + ret = 0; + smp_mb(); + if (waitqueue_active(&ctx->fault_pending_wqh)) + ret = EPOLLIN; + else if (waitqueue_active(&ctx->event_wqh)) + ret = EPOLLIN; + + return ret; } static const struct file_operations userfaultfd_fops; @@ -1170,7 +1165,7 @@ static ssize_t userfaultfd_read(struct file *file, char __user *buf, int no_wait = file->f_flags & O_NONBLOCK; struct inode *inode = file_inode(file); - if (ctx->state == UFFD_STATE_WAIT_API) + if (!userfaultfd_is_initialized(ctx)) return -EINVAL; for (;;) { @@ -1909,9 +1904,10 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) static inline unsigned int uffd_ctx_features(__u64 user_features) { /* - * For the current set of features the bits just coincide + * For the current set of features the bits just coincide. Set + * UFFD_FEATURE_INITIALIZED to mark the features as enabled. */ - return (unsigned int)user_features; + return (unsigned int)user_features | UFFD_FEATURE_INITIALIZED; } /* @@ -1924,12 +1920,10 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, { struct uffdio_api uffdio_api; void __user *buf = (void __user *)arg; + unsigned int ctx_features; int ret; __u64 features; - ret = -EINVAL; - if (ctx->state != UFFD_STATE_WAIT_API) - goto out; ret = -EFAULT; if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api))) goto out; @@ -1953,9 +1947,13 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, ret = -EFAULT; if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) goto out; - ctx->state = UFFD_STATE_RUNNING; + /* only enable the requested features for this uffd context */ - ctx->features = uffd_ctx_features(features); + ctx_features = uffd_ctx_features(features); + ret = -EINVAL; + if (cmpxchg(&ctx->features, 0, ctx_features) != 0) + goto err_out; + ret = 0; out: return ret; @@ -1972,7 +1970,7 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd, int ret = -EINVAL; struct userfaultfd_ctx *ctx = file->private_data; - if (cmd != UFFDIO_API && ctx->state == UFFD_STATE_WAIT_API) + if (cmd != UFFDIO_API && !userfaultfd_is_initialized(ctx)) return -EINVAL; switch(cmd) { @@ -2086,7 +2084,6 @@ SYSCALL_DEFINE1(userfaultfd, int, flags) refcount_set(&ctx->refcount, 1); ctx->flags = flags; ctx->features = 0; - ctx->state = UFFD_STATE_WAIT_API; ctx->released = false; atomic_set(&ctx->mmap_changing, 0); ctx->mm = current->mm; From 47e8a2a69c3a546bb762663455f69cef4da31548 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Tue, 24 Aug 2021 09:59:29 +1000 Subject: [PATCH 188/365] selftests/vm/userfaultfd: wake after copy failure When userfaultfd copy-ioctl fails since the PTE already exists, an -EEXIST error is returned and the faulting thread is not woken. The current userfaultfd test does not wake the faulting thread in such case. The assumption is presumably that another thread set the PTE through copy/wp ioctl and would wake the faulting thread or that alternatively the fault handler would realize there is no need to "must_wait" and continue. This is not necessarily true. There is an assumption that the "must_wait" tests in handle_userfault() are sufficient to provide definitive answer whether the offending PTE is populated or not. However, userfaultfd_must_wait() test is lockless. Consequently, concurrent calls to ptep_modify_prot_start(), for instance, can clear the PTE and can cause userfaultfd_must_wait() to wrongly assume it is not populated and a wait is needed. There are therefore 3 options: (1) Change the tests to wake on copy failure. (2) Wake faulting thread unconditionally on zero/copy ioctls before returning -EEXIST. (3) Change the userfaultfd_must_wait() to hold locks. This patch took the first approach, but the others are valid solutions with different tradeoffs. Link: https://lkml.kernel.org/r/20210808020724.1022515-4-namit@vmware.com Signed-off-by: Nadav Amit Cc: Jens Axboe Cc: Andrea Arcangeli Cc: Peter Xu Cc: Alexander Viro Cc: Axel Rasmussen Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- tools/testing/selftests/vm/userfaultfd.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 2ea438e6b8b1f..10ab56c2484ae 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -566,6 +566,18 @@ static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy, } } +static void wake_range(int ufd, unsigned long addr, unsigned long len) +{ + struct uffdio_range uffdio_wake; + + uffdio_wake.start = addr; + uffdio_wake.len = len; + + if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake)) + fprintf(stderr, "error waking %lu\n", + addr), exit(1); +} + static int __copy_page(int ufd, unsigned long offset, bool retry) { struct uffdio_copy uffdio_copy; @@ -585,6 +597,7 @@ static int __copy_page(int ufd, unsigned long offset, bool retry) if (uffdio_copy.copy != -EEXIST) err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy); + wake_range(ufd, uffdio_copy.dst, page_size); } else if (uffdio_copy.copy != page_size) { err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy); } else { From 928b9872c473891cb40527f0d20b1c0e3062abfe Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 24 Aug 2021 09:59:30 +1000 Subject: [PATCH 189/365] mm/numa: automatically generate node migration order Patch series "Migrate Pages in lieu of discard", v11. We're starting to see systems with more and more kinds of memory such as Intel's implementation of persistent memory. Let's say you have a system with some DRAM and some persistent memory. Today, once DRAM fills up, reclaim will start and some of the DRAM contents will be thrown out. Allocations will, at some point, start falling over to the slower persistent memory. That has two nasty properties. First, the newer allocations can end up in the slower persistent memory. Second, reclaimed data in DRAM are just discarded even if there are gobs of space in persistent memory that could be used. This patchset implements a solution to these problems. At the end of the reclaim process in shrink_page_list() just before the last page refcount is dropped, the page is migrated to persistent memory instead of being dropped. While I've talked about a DRAM/PMEM pairing, this approach would function in any environment where memory tiers exist. This is not perfect. It "strands" pages in slower memory and never brings them back to fast DRAM. Huang Ying has follow-on work which repurposes NUMA balancing to promote hot pages back to DRAM. This is also all based on an upstream mechanism that allows persistent memory to be onlined and used as if it were volatile: http://lkml.kernel.org/r/20190124231441.37A4A305@viggo.jf.intel.com With that, the DRAM and PMEM in each socket will be represented as 2 separate NUMA nodes, with the CPUs sit in the DRAM node. So the general inter-NUMA demotion mechanism introduced in the patchset can migrate the cold DRAM pages to the PMEM node. We have tested the patchset with the postgresql and pgbench. On a 2-socket server machine with DRAM and PMEM, the kernel with the patchset can improve the score of pgbench up to 22.1% compared with that of the DRAM only + disk case. This comes from the reduced disk read throughput (which reduces up to 70.8%). == Open Issues == * Memory policies and cpusets that, for instance, restrict allocations to DRAM can be demoted to PMEM whenever they opt in to this new mechanism. A cgroup-level API to opt-in or opt-out of these migrations will likely be required as a follow-on. * Could be more aggressive about where anon LRU scanning occurs since it no longer necessarily involves I/O. get_scan_count() for instance says: "If we have no swap space, do not bother scanning anon pages" This patch (of 9): Prepare for the kernel to auto-migrate pages to other memory nodes with a node migration table. This allows creating single migration target for each NUMA node to enable the kernel to do NUMA page migrations instead of simply discarding colder pages. A node with no target is a "terminal node", so reclaim acts normally there. The migration target does not fundamentally _need_ to be a single node, but this implementation starts there to limit complexity. When memory fills up on a node, memory contents can be automatically migrated to another node. The biggest problems are knowing when to migrate and to where the migration should be targeted. The most straightforward way to generate the "to where" list would be to follow the page allocator fallback lists. Those lists already tell us if memory is full where to look next. It would also be logical to move memory in that order. But, the allocator fallback lists have a fatal flaw: most nodes appear in all the lists. This would potentially lead to migration cycles (A->B, B->A, A->B, ...). Instead of using the allocator fallback lists directly, keep a separate node migration ordering. But, reuse the same data used to generate page allocator fallback in the first place: find_next_best_node(). This means that the firmware data used to populate node distances essentially dictates the ordering for now. It should also be architecture-neutral since all NUMA architectures have a working find_next_best_node(). RCU is used to allow lock-less read of node_demotion[] and prevent demotion cycles been observed. If multiple reads of node_demotion[] are performed, a single rcu_read_lock() must be held over all reads to ensure no cycles are observed. Details are as follows. === What does RCU provide? === Imagine a simple loop which walks down the demotion path looking for the last node: terminal_node = start_node; while (node_demotion[terminal_node] != NUMA_NO_NODE) { terminal_node = node_demotion[terminal_node]; } The initial values are: node_demotion[0] = 1; node_demotion[1] = NUMA_NO_NODE; and are updated to: node_demotion[0] = NUMA_NO_NODE; node_demotion[1] = 0; What guarantees that the cycle is not observed: node_demotion[0] = 1; node_demotion[1] = 0; and would loop forever? With RCU, a rcu_read_lock/unlock() can be placed around the loop. Since the write side does a synchronize_rcu(), the loop that observed the old contents is known to be complete before the synchronize_rcu() has completed. RCU, combined with disable_all_migrate_targets(), ensures that the old migration state is not visible by the time __set_migration_target_nodes() is called. === What does READ_ONCE() provide? === READ_ONCE() forbids the compiler from merging or reordering successive reads of node_demotion[]. This ensures that any updates are *eventually* observed. Consider the above loop again. The compiler could theoretically read the entirety of node_demotion[] into local storage (registers) and never go back to memory, and *permanently* observe bad values for node_demotion[]. Note: RCU does not provide any universal compiler-ordering guarantees: https://lore.kernel.org/lkml/20150921204327.GH4029@linux.vnet.ibm.com/ This code is unused for now. It will be called later in the series. Link: https://lkml.kernel.org/r/20210721063926.3024591-1-ying.huang@intel.com Link: https://lkml.kernel.org/r/20210715055145.195411-1-ying.huang@intel.com Link: https://lkml.kernel.org/r/20210715055145.195411-2-ying.huang@intel.com Signed-off-by: Dave Hansen Signed-off-by: "Huang, Ying" Reviewed-by: Yang Shi Reviewed-by: Zi Yan Reviewed-by: Oscar Salvador Cc: Michal Hocko Cc: Wei Xu Cc: David Rientjes Cc: Dan Williams Cc: David Hildenbrand Cc: Greg Thelen Cc: Keith Busch Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/internal.h | 5 ++ mm/migrate.c | 216 ++++++++++++++++++++++++++++++++++++++++++++++++ mm/page_alloc.c | 2 +- 3 files changed, 222 insertions(+), 1 deletion(-) diff --git a/mm/internal.h b/mm/internal.h index 57e28261a3b17..cf3cb933eba3f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -543,12 +543,17 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, #ifdef CONFIG_NUMA extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int); +extern int find_next_best_node(int node, nodemask_t *used_node_mask); #else static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask, unsigned int order) { return NODE_RECLAIM_NOSCAN; } +static inline int find_next_best_node(int node, nodemask_t *used_node_mask) +{ + return NUMA_NO_NODE; +} #endif extern int hwpoison_filter(struct page *p); diff --git a/mm/migrate.c b/mm/migrate.c index 7e240437e7d9a..57aeb9b491da6 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1099,6 +1099,80 @@ static int __unmap_and_move(struct page *page, struct page *newpage, return rc; } + +/* + * node_demotion[] example: + * + * Consider a system with two sockets. Each socket has + * three classes of memory attached: fast, medium and slow. + * Each memory class is placed in its own NUMA node. The + * CPUs are placed in the node with the "fast" memory. The + * 6 NUMA nodes (0-5) might be split among the sockets like + * this: + * + * Socket A: 0, 1, 2 + * Socket B: 3, 4, 5 + * + * When Node 0 fills up, its memory should be migrated to + * Node 1. When Node 1 fills up, it should be migrated to + * Node 2. The migration path start on the nodes with the + * processors (since allocations default to this node) and + * fast memory, progress through medium and end with the + * slow memory: + * + * 0 -> 1 -> 2 -> stop + * 3 -> 4 -> 5 -> stop + * + * This is represented in the node_demotion[] like this: + * + * { 1, // Node 0 migrates to 1 + * 2, // Node 1 migrates to 2 + * -1, // Node 2 does not migrate + * 4, // Node 3 migrates to 4 + * 5, // Node 4 migrates to 5 + * -1} // Node 5 does not migrate + */ + +/* + * Writes to this array occur without locking. Cycles are + * not allowed: Node X demotes to Y which demotes to X... + * + * If multiple reads are performed, a single rcu_read_lock() + * must be held over all reads to ensure that no cycles are + * observed. + */ +static int node_demotion[MAX_NUMNODES] __read_mostly = + {[0 ... MAX_NUMNODES - 1] = NUMA_NO_NODE}; + +/** + * next_demotion_node() - Get the next node in the demotion path + * @node: The starting node to lookup the next node + * + * @returns: node id for next memory node in the demotion path hierarchy + * from @node; NUMA_NO_NODE if @node is terminal. This does not keep + * @node online or guarantee that it *continues* to be the next demotion + * target. + */ +int next_demotion_node(int node) +{ + int target; + + /* + * node_demotion[] is updated without excluding this + * function from running. RCU doesn't provide any + * compiler barriers, so the READ_ONCE() is required + * to avoid compiler reordering or read merging. + * + * Make sure to use RCU over entire code blocks if + * node_demotion[] reads need to be consistent. + */ + rcu_read_lock(); + target = READ_ONCE(node_demotion[node]); + rcu_read_unlock(); + + return target; +} + /* * Obtain the lock on page, remove all ptes and migrate the page * to the newly allocated page in newpage. @@ -2982,3 +3056,145 @@ void migrate_vma_finalize(struct migrate_vma *migrate) } EXPORT_SYMBOL(migrate_vma_finalize); #endif /* CONFIG_DEVICE_PRIVATE */ + +/* Disable reclaim-based migration. */ +static void __disable_all_migrate_targets(void) +{ + int node; + + for_each_online_node(node) + node_demotion[node] = NUMA_NO_NODE; +} + +static void disable_all_migrate_targets(void) +{ + __disable_all_migrate_targets(); + + /* + * Ensure that the "disable" is visible across the system. + * Readers will see either a combination of before+disable + * state or disable+after. They will never see before and + * after state together. + * + * The before+after state together might have cycles and + * could cause readers to do things like loop until this + * function finishes. This ensures they can only see a + * single "bad" read and would, for instance, only loop + * once. + */ + synchronize_rcu(); +} + +/* + * Find an automatic demotion target for 'node'. + * Failing here is OK. It might just indicate + * being at the end of a chain. + */ +static int establish_migrate_target(int node, nodemask_t *used) +{ + int migration_target; + + /* + * Can not set a migration target on a + * node with it already set. + * + * No need for READ_ONCE() here since this + * in the write path for node_demotion[]. + * This should be the only thread writing. + */ + if (node_demotion[node] != NUMA_NO_NODE) + return NUMA_NO_NODE; + + migration_target = find_next_best_node(node, used); + if (migration_target == NUMA_NO_NODE) + return NUMA_NO_NODE; + + node_demotion[node] = migration_target; + + return migration_target; +} + +/* + * When memory fills up on a node, memory contents can be + * automatically migrated to another node instead of + * discarded at reclaim. + * + * Establish a "migration path" which will start at nodes + * with CPUs and will follow the priorities used to build the + * page allocator zonelists. + * + * The difference here is that cycles must be avoided. If + * node0 migrates to node1, then neither node1, nor anything + * node1 migrates to can migrate to node0. + * + * This function can run simultaneously with readers of + * node_demotion[]. However, it can not run simultaneously + * with itself. Exclusion is provided by memory hotplug events + * being single-threaded. + */ +static void __set_migration_target_nodes(void) +{ + nodemask_t next_pass = NODE_MASK_NONE; + nodemask_t this_pass = NODE_MASK_NONE; + nodemask_t used_targets = NODE_MASK_NONE; + int node; + + /* + * Avoid any oddities like cycles that could occur + * from changes in the topology. This will leave + * a momentary gap when migration is disabled. + */ + disable_all_migrate_targets(); + + /* + * Allocations go close to CPUs, first. Assume that + * the migration path starts at the nodes with CPUs. + */ + next_pass = node_states[N_CPU]; +again: + this_pass = next_pass; + next_pass = NODE_MASK_NONE; + /* + * To avoid cycles in the migration "graph", ensure + * that migration sources are not future targets by + * setting them in 'used_targets'. Do this only + * once per pass so that multiple source nodes can + * share a target node. + * + * 'used_targets' will become unavailable in future + * passes. This limits some opportunities for + * multiple source nodes to share a destination. + */ + nodes_or(used_targets, used_targets, this_pass); + for_each_node_mask(node, this_pass) { + int target_node = establish_migrate_target(node, &used_targets); + + if (target_node == NUMA_NO_NODE) + continue; + + /* + * Visit targets from this pass in the next pass. + * Eventually, every node will have been part of + * a pass, and will become set in 'used_targets'. + */ + node_set(target_node, next_pass); + } + /* + * 'next_pass' contains nodes which became migration + * targets in this pass. Make additional passes until + * no more migrations targets are available. + */ + if (!nodes_empty(next_pass)) + goto again; +} + +/* + * For callers that do not hold get_online_mems() already. + */ +__maybe_unused // <- temporay to prevent warnings during bisects +static void set_migration_target_nodes(void) +{ + get_online_mems(); + __set_migration_target_nodes(); + put_online_mems(); +} diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eaa936efad7e4..cafdca874e0d7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6157,7 +6157,7 @@ static int node_load[MAX_NUMNODES]; * * Return: node id of the found node or %NUMA_NO_NODE if no node is found. */ -static int find_next_best_node(int node, nodemask_t *used_node_mask) +int find_next_best_node(int node, nodemask_t *used_node_mask) { int n, val; int min_val = INT_MAX; From d4e8d5a2d6da8d5c246a499cda07f5d696f27fa2 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 24 Aug 2021 09:59:30 +1000 Subject: [PATCH 190/365] mm/migrate: update node demotion order on hotplug events Reclaim-based migration is attempting to optimize data placement in memory based on the system topology. If the system changes, so must the migration ordering. The implementation is conceptually simple and entirely unoptimized. On any memory or CPU hotplug events, assume that a node was added or removed and recalculate all migration targets. This ensures that the node_demotion[] array is always ready to be used in case the new reclaim mode is enabled. This recalculation is far from optimal, most glaringly that it does not even attempt to figure out the hotplug event would have some *actual* effect on the demotion order. But, given the expected paucity of hotplug events, this should be fine. Link: https://lkml.kernel.org/r/20210721063926.3024591-2-ying.huang@intel.com Link: https://lkml.kernel.org/r/20210715055145.195411-3-ying.huang@intel.com Signed-off-by: Dave Hansen Signed-off-by: "Huang, Ying" Reviewed-by: Yang Shi Reviewed-by: Zi Yan Cc: Michal Hocko Cc: Wei Xu Cc: Oscar Salvador Cc: David Rientjes Cc: Dan Williams Cc: David Hildenbrand Cc: Greg Thelen Cc: Keith Busch Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/migrate.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 89 insertions(+), 1 deletion(-) diff --git a/mm/migrate.c b/mm/migrate.c index 57aeb9b491da6..0c12af203b684 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -49,6 +49,7 @@ #include #include #include +#include #include @@ -3057,6 +3058,7 @@ void migrate_vma_finalize(struct migrate_vma *migrate) EXPORT_SYMBOL(migrate_vma_finalize); #endif /* CONFIG_DEVICE_PRIVATE */ +#if defined(CONFIG_MEMORY_HOTPLUG) /* Disable reclaim-based migration. */ static void __disable_all_migrate_targets(void) { @@ -3191,10 +3193,96 @@ static void __set_migration_target_nodes(void) /* * For callers that do not hold get_online_mems() already. */ -__maybe_unused // <- temporay to prevent warnings during bisects static void set_migration_target_nodes(void) { get_online_mems(); __set_migration_target_nodes(); put_online_mems(); } + +/* + * React to hotplug events that might affect the migration targets + * like events that online or offline NUMA nodes. + * + * The ordering is also currently dependent on which nodes have + * CPUs. That means we need CPU on/offline notification too. + */ +static int migration_online_cpu(unsigned int cpu) +{ + set_migration_target_nodes(); + return 0; +} + +static int migration_offline_cpu(unsigned int cpu) +{ + set_migration_target_nodes(); + return 0; +} + +/* + * This leaves migrate-on-reclaim transiently disabled between + * the MEM_GOING_OFFLINE and MEM_OFFLINE events. This runs + * whether reclaim-based migration is enabled or not, which + * ensures that the user can turn reclaim-based migration at + * any time without needing to recalculate migration targets. + * + * These callbacks already hold get_online_mems(). That is why + * __set_migration_target_nodes() can be used as opposed to + * set_migration_target_nodes(). + */ +static int __meminit migrate_on_reclaim_callback(struct notifier_block *self, + unsigned long action, void *arg) +{ + switch (action) { + case MEM_GOING_OFFLINE: + /* + * Make sure there are not transient states where + * an offline node is a migration target. This + * will leave migration disabled until the offline + * completes and the MEM_OFFLINE case below runs. + */ + disable_all_migrate_targets(); + break; + case MEM_OFFLINE: + case MEM_ONLINE: + /* + * Recalculate the target nodes once the node + * reaches its final state (online or offline). + */ + __set_migration_target_nodes(); + break; + case MEM_CANCEL_OFFLINE: + /* + * MEM_GOING_OFFLINE disabled all the migration + * targets. Reenable them. + */ + __set_migration_target_nodes(); + break; + case MEM_GOING_ONLINE: + case MEM_CANCEL_ONLINE: + break; + } + + return notifier_from_errno(0); +} + +static int __init migrate_on_reclaim_init(void) +{ + int ret; + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "migrate on reclaim", + migration_online_cpu, + migration_offline_cpu); + /* + * In the unlikely case that this fails, the automatic + * migration targets may become suboptimal for nodes + * where N_CPU changes. With such a small impact in a + * rare case, do not bother trying to do anything special. + */ + WARN_ON(ret < 0); + + hotplug_memory_notifier(migrate_on_reclaim_callback, 100); + return 0; +} +late_initcall(migrate_on_reclaim_init); +#endif /* CONFIG_MEMORY_HOTPLUG */ From 1762e8ddd437fa575240f96d122d22521a1e14bd Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 24 Aug 2021 09:59:30 +1000 Subject: [PATCH 191/365] mm/migrate: enable returning precise migrate_pages() success count Under normal circumstances, migrate_pages() returns the number of pages migrated. In error conditions, it returns an error code. When returning an error code, there is no way to know how many pages were migrated or not migrated. Make migrate_pages() return how many pages are demoted successfully for all cases, including when encountering errors. Page reclaim behavior will depend on this in subsequent patches. Link: https://lkml.kernel.org/r/20210721063926.3024591-3-ying.huang@intel.com Link: https://lkml.kernel.org/r/20210715055145.195411-4-ying.huang@intel.com Signed-off-by: Yang Shi Signed-off-by: Dave Hansen Signed-off-by: "Huang, Ying" Suggested-by: Oscar Salvador [optional parameter] Reviewed-by: Yang Shi Reviewed-by: Zi Yan Cc: Michal Hocko Cc: Wei Xu Cc: Dan Williams Cc: David Hildenbrand Cc: David Rientjes Cc: Greg Thelen Cc: Keith Busch Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/migrate.h | 5 +++-- mm/compaction.c | 2 +- mm/gup.c | 2 +- mm/memory-failure.c | 2 +- mm/memory_hotplug.c | 2 +- mm/mempolicy.c | 4 ++-- mm/migrate.c | 11 ++++++++--- mm/page_alloc.c | 2 +- 8 files changed, 18 insertions(+), 12 deletions(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 23dadf7aeba89..8ab88d46318ea 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -41,7 +41,8 @@ extern int migrate_page(struct address_space *mapping, struct page *newpage, struct page *page, enum migrate_mode mode); extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, - unsigned long private, enum migrate_mode mode, int reason); + unsigned long private, enum migrate_mode mode, int reason, + unsigned int *ret_succeeded); extern struct page *alloc_migration_target(struct page *page, unsigned long private); extern int isolate_movable_page(struct page *page, isolate_mode_t mode); @@ -56,7 +57,7 @@ extern int migrate_page_move_mapping(struct address_space *mapping, static inline void putback_movable_pages(struct list_head *l) {} static inline int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, unsigned long private, enum migrate_mode mode, - int reason) + int reason, unsigned int *ret_succeeded) { return -ENOSYS; } static inline struct page *alloc_migration_target(struct page *page, unsigned long private) diff --git a/mm/compaction.c b/mm/compaction.c index 621508e0ecd5d..61fb64f47a069 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2398,7 +2398,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) err = migrate_pages(&cc->migratepages, compaction_alloc, compaction_free, (unsigned long)cc, cc->mode, - MR_COMPACTION); + MR_COMPACTION, NULL); trace_mm_compaction_migratepages(cc->nr_migratepages, err, &cc->migratepages); diff --git a/mm/gup.c b/mm/gup.c index 76ac930f1dd11..7a406d79bd2e6 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1772,7 +1772,7 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages, if (!list_empty(&movable_page_list)) { ret = migrate_pages(&movable_page_list, alloc_migration_target, NULL, (unsigned long)&mtc, MIGRATE_SYNC, - MR_LONGTERM_PIN); + MR_LONGTERM_PIN, NULL); if (ret && !list_empty(&movable_page_list)) putback_movable_pages(&movable_page_list); } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index d1e6f4d948e59..7436ae383f531 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2099,7 +2099,7 @@ static int __soft_offline_page(struct page *page) if (isolate_page(hpage, &pagelist)) { ret = migrate_pages(&pagelist, alloc_migration_target, NULL, - (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE); + (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE, NULL); if (!ret) { bool release = !huge; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 86c3af79e874e..4c527a80b6c99 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1469,7 +1469,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (nodes_empty(nmask)) node_set(mtc.nid, nmask); ret = migrate_pages(&source, alloc_migration_target, NULL, - (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG); + (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG, NULL); if (ret) { list_for_each_entry(page, &source, lru) { if (__ratelimit(&migrate_rs)) { diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e32360e902744..939eabcaf488f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1084,7 +1084,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, alloc_migration_target, NULL, - (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL); + (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL); if (err) putback_movable_pages(&pagelist); } @@ -1338,7 +1338,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (!list_empty(&pagelist)) { WARN_ON_ONCE(flags & MPOL_MF_LAZY); nr_failed = migrate_pages(&pagelist, new_page, NULL, - start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND); + start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL); if (nr_failed) putback_movable_pages(&pagelist); } diff --git a/mm/migrate.c b/mm/migrate.c index 0c12af203b684..ae923e9b88743 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1429,6 +1429,8 @@ static inline int try_split_thp(struct page *page, struct page **page2, * @mode: The migration mode that specifies the constraints for * page migration, if any. * @reason: The reason for page migration. + * @ret_succeeded: Set to the number of pages migrated successfully if + * the caller passes a non-NULL pointer. * * The function returns after 10 attempts or if no pages are movable any more * because the list has become empty or no retryable pages exist any more. @@ -1439,7 +1441,7 @@ static inline int try_split_thp(struct page *page, struct page **page2, */ int migrate_pages(struct list_head *from, new_page_t get_new_page, free_page_t put_new_page, unsigned long private, - enum migrate_mode mode, int reason) + enum migrate_mode mode, int reason, unsigned int *ret_succeeded) { int retry = 1; int thp_retry = 1; @@ -1594,6 +1596,9 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, if (!swapwrite) current->flags &= ~PF_SWAPWRITE; + if (ret_succeeded) + *ret_succeeded = nr_succeeded; + return rc; } @@ -1663,7 +1668,7 @@ static int do_move_pages_to_node(struct mm_struct *mm, }; err = migrate_pages(pagelist, alloc_migration_target, NULL, - (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL); + (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL); if (err) putback_movable_pages(pagelist); return err; @@ -2178,7 +2183,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, list_add(&page->lru, &migratepages); nr_remaining = migrate_pages(&migratepages, *new, NULL, node, - MIGRATE_ASYNC, MR_NUMA_MISPLACED); + MIGRATE_ASYNC, MR_NUMA_MISPLACED, NULL); if (nr_remaining) { if (!list_empty(&migratepages)) { list_del(&page->lru); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cafdca874e0d7..f95e1d2386a13 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8990,7 +8990,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, cc->nr_migratepages -= nr_reclaimed; ret = migrate_pages(&cc->migratepages, alloc_migration_target, - NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE); + NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE, NULL); /* * On -ENOMEM, migrate_pages() bails out right away. It is pointless From 084ba40193810e3c89f9c86b7179f72759b170f3 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 24 Aug 2021 09:59:30 +1000 Subject: [PATCH 192/365] mm/migrate: demote pages during reclaim This is mostly derived from a patch from Yang Shi: https://lore.kernel.org/linux-mm/1560468577-101178-10-git-send-email-yang.shi@linux.alibaba.com/ Add code to the reclaim path (shrink_page_list()) to "demote" data to another NUMA node instead of discarding the data. This always avoids the cost of I/O needed to read the page back in and sometimes avoids the writeout cost when the page is dirty. A second pass through shrink_page_list() will be made if any demotions fail. This essentially falls back to normal reclaim behavior in the case that demotions fail. Previous versions of this patch may have simply failed to reclaim pages which were eligible for demotion but were unable to be demoted in practice. For some cases, for example, MADV_PAGEOUT, the pages are always discarded instead of demoted to follow the kernel API definition. Because MADV_PAGEOUT is defined as freeing specified pages regardless in which tier they are. Note: This just adds the start of infrastructure for migration. It is actually disabled next to the FIXME in migrate_demote_page_ok(). Link: https://lkml.kernel.org/r/20210715055145.195411-5-ying.huang@intel.com Signed-off-by: Dave Hansen Signed-off-by: "Huang, Ying" Reviewed-by: Yang Shi Reviewed-by: Wei Xu Reviewed-by: Oscar Salvador Reviewed-by: Zi Yan Cc: Michal Hocko Cc: David Rientjes Cc: Dan Williams Cc: David Hildenbrand Cc: Greg Thelen Cc: Keith Busch Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/migrate.h | 9 ++++ include/trace/events/migrate.h | 3 +- mm/vmscan.c | 85 ++++++++++++++++++++++++++++++++++ 3 files changed, 96 insertions(+), 1 deletion(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 8ab88d46318ea..326250996b4e0 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -28,6 +28,7 @@ enum migrate_reason { MR_NUMA_MISPLACED, MR_CONTIG_RANGE, MR_LONGTERM_PIN, + MR_DEMOTION, MR_TYPES }; @@ -167,6 +168,14 @@ struct migrate_vma { int migrate_vma_setup(struct migrate_vma *args); void migrate_vma_pages(struct migrate_vma *migrate); void migrate_vma_finalize(struct migrate_vma *migrate); +int next_demotion_node(int node); + +#else /* CONFIG_MIGRATION disabled: */ + +static inline int next_demotion_node(int node) +{ + return NUMA_NO_NODE; +} #endif /* CONFIG_MIGRATION */ diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h index 9fb2a3bbcdfb9..779f3fad9ecd5 100644 --- a/include/trace/events/migrate.h +++ b/include/trace/events/migrate.h @@ -21,7 +21,8 @@ EM( MR_MEMPOLICY_MBIND, "mempolicy_mbind") \ EM( MR_NUMA_MISPLACED, "numa_misplaced") \ EM( MR_CONTIG_RANGE, "contig_range") \ - EMe(MR_LONGTERM_PIN, "longterm_pin") + EM( MR_LONGTERM_PIN, "longterm_pin") \ + EMe(MR_DEMOTION, "demotion") /* * First define the enums in the above macros to be exported to userspace diff --git a/mm/vmscan.c b/mm/vmscan.c index b51fdd75d717c..b7f0442557e6d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -121,6 +122,9 @@ struct scan_control { /* The file pages on the current node are dangerously low */ unsigned int file_is_tiny:1; + /* Always discard instead of demoting to lower tier memory */ + unsigned int no_demotion:1; + /* Allocation order */ s8 order; @@ -518,6 +522,17 @@ static long add_nr_deferred(long nr, struct shrinker *shrinker, return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]); } +static bool can_demote_anon_pages(int nid, struct scan_control *sc) +{ + if (sc->no_demotion) + return false; + if (next_demotion_node(nid) == NUMA_NO_NODE) + return false; + + // FIXME: actually enable this later in the series + return false; +} + /* * This misses isolated pages which are not accounted for to save counters. * As the data only determines if reclaim or compaction continues, it is @@ -1270,6 +1285,49 @@ static void page_check_dirty_writeback(struct page *page, mapping->a_ops->is_dirty_writeback(page, dirty, writeback); } +static struct page *alloc_demote_page(struct page *page, unsigned long node) +{ + struct migration_target_control mtc = { + /* + * Allocate from 'node', or fail quickly and quietly. + * When this happens, 'page' will likely just be discarded + * instead of migrated. + */ + .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | + __GFP_THISNODE | __GFP_NOWARN | + __GFP_NOMEMALLOC | GFP_NOWAIT, + .nid = node + }; + + return alloc_migration_target(page, (unsigned long)&mtc); +} + +/* + * Take pages on @demote_list and attempt to demote them to + * another node. Pages which are not demoted are left on + * @demote_pages. + */ +static unsigned int demote_page_list(struct list_head *demote_pages, + struct pglist_data *pgdat) +{ + int target_nid = next_demotion_node(pgdat->node_id); + unsigned int nr_succeeded; + int err; + + if (list_empty(demote_pages)) + return 0; + + if (target_nid == NUMA_NO_NODE) + return 0; + + /* Demotion ignores all cpuset and mempolicy settings */ + err = migrate_pages(demote_pages, alloc_demote_page, NULL, + target_nid, MIGRATE_ASYNC, MR_DEMOTION, + &nr_succeeded); + + return nr_succeeded; +} + /* * shrink_page_list() returns the number of reclaimed pages */ @@ -1281,12 +1339,16 @@ static unsigned int shrink_page_list(struct list_head *page_list, { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); + LIST_HEAD(demote_pages); unsigned int nr_reclaimed = 0; unsigned int pgactivate = 0; + bool do_demote_pass; memset(stat, 0, sizeof(*stat)); cond_resched(); + do_demote_pass = can_demote_anon_pages(pgdat->node_id, sc); +retry: while (!list_empty(page_list)) { struct address_space *mapping; struct page *page; @@ -1435,6 +1497,17 @@ static unsigned int shrink_page_list(struct list_head *page_list, ; /* try to reclaim the page below */ } + /* + * Before reclaiming the page, try to relocate + * its contents to another node. + */ + if (do_demote_pass && + (thp_migration_supported() || !PageTransHuge(page))) { + list_add(&page->lru, &demote_pages); + unlock_page(page); + continue; + } + /* * Anonymous process memory has backing store? * Try to allocate it some swap space here. @@ -1686,6 +1759,17 @@ static unsigned int shrink_page_list(struct list_head *page_list, list_add(&page->lru, &ret_pages); VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page); } + /* 'page_list' is always empty here */ + + /* Migrate pages selected for demotion */ + nr_reclaimed += demote_page_list(&demote_pages, pgdat); + /* Pages that could not be demoted are still in @demote_pages */ + if (!list_empty(&demote_pages)) { + /* Pages which failed to demoted go back on @page_list for retry: */ + list_splice_init(&demote_pages, page_list); + do_demote_pass = false; + goto retry; + } pgactivate = stat->nr_activate[0] + stat->nr_activate[1]; @@ -2333,6 +2417,7 @@ unsigned long reclaim_pages(struct list_head *page_list) .may_writepage = 1, .may_unmap = 1, .may_swap = 1, + .no_demotion = 1, }; noreclaim_flag = memalloc_noreclaim_save(); From 998b778e0f72ec87f752066da1060211cb282bbc Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 24 Aug 2021 09:59:30 +1000 Subject: [PATCH 193/365] mm-migrate-demote-pages-during-reclaim-v11 Rename can_demote_anon_pages() to can_demote() to reflect the fact that the function is for anon and file pages. Link: https://lkml.kernel.org/r/20210715055145.195411-5-ying.huang@intel.com Link: https://lkml.kernel.org/r/20210721063926.3024591-4-ying.huang@intel.com Signed-off-by: Dave Hansen Signed-off-by: "Huang, Ying" Reviewed-by: Yang Shi Reviewed-by: Wei Xu Reviewed-by: Oscar Salvador Cc: Michal Hocko Cc: Zi Yan Cc: David Rientjes Cc: Dan Williams Cc: David Hildenbrand Cc: Greg Thelen Cc: Keith Busch Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/vmscan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index b7f0442557e6d..f26ea4a16d018 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -522,7 +522,7 @@ static long add_nr_deferred(long nr, struct shrinker *shrinker, return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]); } -static bool can_demote_anon_pages(int nid, struct scan_control *sc) +static bool can_demote(int nid, struct scan_control *sc) { if (sc->no_demotion) return false; @@ -1346,7 +1346,7 @@ static unsigned int shrink_page_list(struct list_head *page_list, memset(stat, 0, sizeof(*stat)); cond_resched(); - do_demote_pass = can_demote_anon_pages(pgdat->node_id, sc); + do_demote_pass = can_demote(pgdat->node_id, sc); retry: while (!list_empty(page_list)) { From 0f5b9397241e6a1156e7734cc2efe2a1263b16c1 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 24 Aug 2021 09:59:31 +1000 Subject: [PATCH 194/365] mm/vmscan: add page demotion counter Account the number of demoted pages. Add pgdemote_kswapd and pgdemote_direct VM counters showed in /proc/vmstat. [ daveh: - __count_vm_events() a bit, and made them look at the THP size directly rather than getting data from migrate_pages() ] Link: https://lkml.kernel.org/r/20210721063926.3024591-5-ying.huang@intel.com Link: https://lkml.kernel.org/r/20210715055145.195411-6-ying.huang@intel.com Signed-off-by: Yang Shi Signed-off-by: Dave Hansen Signed-off-by: "Huang, Ying" Reviewed-by: Yang Shi Reviewed-by: Wei Xu Reviewed-by: Zi Yan Cc: Michal Hocko Cc: David Rientjes Cc: Dan Williams Cc: David Hildenbrand Cc: Oscar Salvador Cc: Greg Thelen Cc: Keith Busch Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/vm_event_item.h | 2 ++ mm/vmscan.c | 5 +++++ mm/vmstat.c | 2 ++ 3 files changed, 9 insertions(+) diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index ae0dd1948c2b0..a185cc75ff52a 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -33,6 +33,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, PGREUSE, PGSTEAL_KSWAPD, PGSTEAL_DIRECT, + PGDEMOTE_KSWAPD, + PGDEMOTE_DIRECT, PGSCAN_KSWAPD, PGSCAN_DIRECT, PGSCAN_DIRECT_THROTTLE, diff --git a/mm/vmscan.c b/mm/vmscan.c index f26ea4a16d018..0ba897029ccb1 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1325,6 +1325,11 @@ static unsigned int demote_page_list(struct list_head *demote_pages, target_nid, MIGRATE_ASYNC, MR_DEMOTION, &nr_succeeded); + if (current_is_kswapd()) + __count_vm_events(PGDEMOTE_KSWAPD, nr_succeeded); + else + __count_vm_events(PGDEMOTE_DIRECT, nr_succeeded); + return nr_succeeded; } diff --git a/mm/vmstat.c b/mm/vmstat.c index b0534e068166c..ec5a2e789dd2e 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1217,6 +1217,8 @@ const char * const vmstat_text[] = { "pgreuse", "pgsteal_kswapd", "pgsteal_direct", + "pgdemote_kswapd", + "pgdemote_direct", "pgscan_kswapd", "pgscan_direct", "pgscan_direct_throttle", From 76b593d7f22f6d613c58f8b1b231a029a75a6fe6 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 24 Aug 2021 09:59:31 +1000 Subject: [PATCH 195/365] mm/vmscan: add helper for querying ability to age anonymous pages Anonymous pages are kept on their own LRU(s). These lists could theoretically always be scanned and maintained. But, without swap, there is currently nothing the kernel can *do* with the results of a scanned, sorted LRU for anonymous pages. A check for '!total_swap_pages' currently serves as a valid check as to whether anonymous LRUs should be maintained. However, another method will be added shortly: page demotion. Abstract out the 'total_swap_pages' checks into a helper, give it a logically significant name, and check for the possibility of page demotion. Link: https://lkml.kernel.org/r/20210715055145.195411-7-ying.huang@intel.com Signed-off-by: Dave Hansen Signed-off-by: "Huang, Ying" Reviewed-by: Yang Shi Reviewed-by: Greg Thelen Reviewed-by: Zi Yan Cc: Michal Hocko Cc: Wei Xu Cc: Oscar Salvador Cc: David Rientjes Cc: Dan Williams Cc: David Hildenbrand Cc: Keith Busch Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/vmscan.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 0ba897029ccb1..5a64a01583cdb 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2741,6 +2741,21 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, } } +/* + * Anonymous LRU management is a waste if there is + * ultimately no way to reclaim the memory. + */ +static bool can_age_anon_pages(struct pglist_data *pgdat, + struct scan_control *sc) +{ + /* Aging the anon LRU is valuable if swap is present: */ + if (total_swap_pages > 0) + return true; + + /* Also valuable if anon pages can be demoted: */ + return can_demote_anon_pages(pgdat->node_id, sc); +} + static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { unsigned long nr[NR_LRU_LISTS]; @@ -2850,7 +2865,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. */ - if (total_swap_pages && inactive_is_low(lruvec, LRU_INACTIVE_ANON)) + if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) && + inactive_is_low(lruvec, LRU_INACTIVE_ANON)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); } @@ -3685,7 +3701,7 @@ static void age_active_anon(struct pglist_data *pgdat, struct mem_cgroup *memcg; struct lruvec *lruvec; - if (!total_swap_pages) + if (!can_age_anon_pages(pgdat, sc)) return; lruvec = mem_cgroup_lruvec(NULL, pgdat); From 629b606dfbd42f8bd77a7d9fdba19869f6660ea0 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 24 Aug 2021 09:59:31 +1000 Subject: [PATCH 196/365] mm-vmscan-add-helper-for-querying-ability-to-age-anonymous-pages-v11 Rename can_demote_anon_pages() to can_demote() to reflect the fact that the function is for anon and file pages. Link: https://lkml.kernel.org/r/20210715055145.195411-7-ying.huang@intel.com Link: https://lkml.kernel.org/r/20210721063926.3024591-6-ying.huang@intel.com Signed-off-by: Dave Hansen Signed-off-by: "Huang, Ying" Reviewed-by: Yang Shi Reviewed-by: Greg Thelen Cc: Michal Hocko Cc: Wei Xu Cc: Oscar Salvador Cc: Zi Yan Cc: David Rientjes Cc: Dan Williams Cc: David Hildenbrand Cc: Keith Busch Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 5a64a01583cdb..832226cca7821 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2753,7 +2753,7 @@ static bool can_age_anon_pages(struct pglist_data *pgdat, return true; /* Also valuable if anon pages can be demoted: */ - return can_demote_anon_pages(pgdat->node_id, sc); + return can_demote(pgdat->node_id, sc); } static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) From b3be79729a0d20143409b59fe020d5073374d582 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 24 Aug 2021 09:59:31 +1000 Subject: [PATCH 197/365] mm/vmscan: Consider anonymous pages without swap Reclaim anonymous pages if a migration path is available now that demotion provides a non-swap recourse for reclaiming anon pages. Note that this check is subtly different from the can_age_anon_pages() checks. This mechanism checks whether a specific page in a specific context can actually be reclaimed, given current swap space and cgroup limits. can_age_anon_pages() is a much simpler and more preliminary check which just says whether there is a possibility of future reclaim. Link: https://lkml.kernel.org/r/20210715055145.195411-8-ying.huang@intel.com Cc: Keith Busch Signed-off-by: Dave Hansen Signed-off-by: "Huang, Ying" Reviewed-by: Yang Shi Reviewed-by: Zi Yan Cc: Michal Hocko Cc: Wei Xu Cc: David Rientjes Cc: Dan Williams Cc: David Hildenbrand Cc: Greg Thelen Cc: Oscar Salvador Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/vmscan.c | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 832226cca7821..301868b33fc84 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -524,7 +524,7 @@ static long add_nr_deferred(long nr, struct shrinker *shrinker, static bool can_demote(int nid, struct scan_control *sc) { - if (sc->no_demotion) + if (sc && sc->no_demotion) return false; if (next_demotion_node(nid) == NUMA_NO_NODE) return false; @@ -533,6 +533,31 @@ static bool can_demote(int nid, struct scan_control *sc) return false; } +static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg, + int nid, + struct scan_control *sc) +{ + if (memcg == NULL) { + /* + * For non-memcg reclaim, is there + * space in any swap device? + */ + if (get_nr_swap_pages() > 0) + return true; + } else { + /* Is the memcg below its swap limit? */ + if (mem_cgroup_get_nr_swap_pages(memcg) > 0) + return true; + } + + /* + * The page can not be swapped. + * + * Can it be reclaimed from this node via demotion? + */ + return can_demote_anon_pages(nid, sc); +} + /* * This misses isolated pages which are not accounted for to save counters. * As the data only determines if reclaim or compaction continues, it is @@ -544,7 +569,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone) nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) + zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE); - if (get_nr_swap_pages() > 0) + if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL)) nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) + zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON); @@ -2548,6 +2573,7 @@ enum scan_balance { static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, unsigned long *nr) { + struct pglist_data *pgdat = lruvec_pgdat(lruvec); struct mem_cgroup *memcg = lruvec_memcg(lruvec); unsigned long anon_cost, file_cost, total_cost; int swappiness = mem_cgroup_swappiness(memcg); @@ -2558,7 +2584,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, enum lru_list lru; /* If we have no swap space, do not bother scanning anon pages. */ - if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) { + if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) { scan_balance = SCAN_FILE; goto out; } @@ -2936,7 +2962,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, */ pages_for_compaction = compact_gap(sc->order); inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE); - if (get_nr_swap_pages() > 0) + if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc)) inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON); return inactive_lru_pages > pages_for_compaction; From a7e1a51a06e9d3349e5f6ab255a42311cb4e85f9 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 24 Aug 2021 09:59:31 +1000 Subject: [PATCH 198/365] mm-vmscan-consider-anonymous-pages-without-swap-v11 Rename can_demote_anon_pages() to can_demote() to reflect the fact that the function is for anon and file pages. Link: https://lkml.kernel.org/r/20210715055145.195411-8-ying.huang@intel.com Link: https://lkml.kernel.org/r/20210721063926.3024591-7-ying.huang@intel.com Cc: Keith Busch Signed-off-by: Dave Hansen Signed-off-by: "Huang, Ying" Reviewed-by: Yang Shi Cc: Michal Hocko Cc: Zi Yan Cc: Wei Xu Cc: David Rientjes Cc: Dan Williams Cc: David Hildenbrand Cc: Greg Thelen Cc: Oscar Salvador Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 301868b33fc84..219e4f4280ec6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -555,7 +555,7 @@ static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg, * * Can it be reclaimed from this node via demotion? */ - return can_demote_anon_pages(nid, sc); + return can_demote(nid, sc); } /* From 24f5fe20c84601ecca2f504efa90ac7a6dab7132 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 24 Aug 2021 09:59:32 +1000 Subject: [PATCH 199/365] mm/vmscan: never demote for memcg reclaim Global reclaim aims to reduce the amount of memory used on a given node or set of nodes. Migrating pages to another node serves this purpose. memcg reclaim is different. Its goal is to reduce the total memory consumption of the entire memcg, across all nodes. Migration does not assist memcg reclaim because it just moves page contents between nodes rather than actually reducing memory consumption. Link: https://lkml.kernel.org/r/20210715055145.195411-9-ying.huang@intel.com Signed-off-by: Dave Hansen Signed-off-by: "Huang, Ying" Suggested-by: Yang Shi Reviewed-by: Yang Shi Reviewed-by: Zi Yan Cc: Michal Hocko Cc: Wei Xu Cc: Oscar Salvador Cc: David Rientjes Cc: Dan Williams Cc: David Hildenbrand Cc: Greg Thelen Cc: Keith Busch Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/vmscan.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 219e4f4280ec6..ce8a27ece0bed 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -524,8 +524,13 @@ static long add_nr_deferred(long nr, struct shrinker *shrinker, static bool can_demote(int nid, struct scan_control *sc) { - if (sc && sc->no_demotion) - return false; + if (sc) { + if (sc->no_demotion) + return false; + /* It is pointless to do demotion in memcg reclaim */ + if (cgroup_reclaim(sc)) + return false; + } if (next_demotion_node(nid) == NUMA_NO_NODE) return false; From 69c7a36a7d770a56e1e9e6f1a0407db10e69aeb5 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 24 Aug 2021 09:59:32 +1000 Subject: [PATCH 200/365] mm/migrate: add sysfs interface to enable reclaim migration Some method is obviously needed to enable reclaim-based migration. Just like traditional autonuma, there will be some workloads that will benefit like workloads with more "static" configurations where hot pages stay hot and cold pages stay cold. If pages come and go from the hot and cold sets, the benefits of this approach will be more limited. The benefits are truly workload-based and *not* hardware-based. We do not believe that there is a viable threshold where certain hardware configurations should have this mechanism enabled while others do not. To be conservative, earlier work defaulted to disable reclaim- based migration and did not include a mechanism to enable it. This proposes add a new sysfs file /sys/kernel/mm/numa/demotion_enabled as a method to enable it. We are open to any alternative that allows end users to enable this mechanism or disable it if workload harm is detected (just like traditional autonuma). Once this is enabled page demotion may move data to a NUMA node that does not fall into the cpuset of the allocating process. This could be construed to violate the guarantees of cpusets. However, since this is an opt-in mechanism, the assumption is that anyone enabling it is content to relax the guarantees. Link: https://lkml.kernel.org/r/20210721063926.3024591-9-ying.huang@intel.com Link: https://lkml.kernel.org/r/20210715055145.195411-10-ying.huang@intel.com Signed-off-by: Huang Ying Originally-by: Dave Hansen Cc: Michal Hocko Cc: Wei Xu Cc: Yang Shi Cc: Zi Yan Cc: David Rientjes Cc: Dan Williams Cc: David Hildenbrand Cc: Greg Thelen Cc: Keith Busch Cc: Oscar Salvador Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- .../ABI/testing/sysfs-kernel-mm-numa | 24 ++++++++ include/linux/mempolicy.h | 4 ++ mm/mempolicy.c | 61 +++++++++++++++++++ mm/vmscan.c | 5 +- 4 files changed, 92 insertions(+), 2 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-numa diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-numa b/Documentation/ABI/testing/sysfs-kernel-mm-numa new file mode 100644 index 0000000000000..77e559d4ed800 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-mm-numa @@ -0,0 +1,24 @@ +What: /sys/kernel/mm/numa/ +Date: June 2021 +Contact: Linux memory management mailing list +Description: Interface for NUMA + +What: /sys/kernel/mm/numa/demotion_enabled +Date: June 2021 +Contact: Linux memory management mailing list +Description: Enable/disable demoting pages during reclaim + + Page migration during reclaim is intended for systems + with tiered memory configurations. These systems have + multiple types of memory with varied performance + characteristics instead of plain NUMA systems where + the same kind of memory is found at varied distances. + Allowing page migration during reclaim enables these + systems to migrate pages from fast tiers to slow tiers + when the fast tier is under pressure. This migration + is performed before swap. It may move data to a NUMA + node that does not fall into the cpuset of the + allocating process which might be construed to violate + the guarantees of cpusets. This should not be enabled + on systems which need strict cpuset location + guarantees. diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 0aaf91b496e2f..4ca025e2a77ef 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -184,6 +184,8 @@ extern bool vma_migratable(struct vm_area_struct *vma); extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long); extern void mpol_put_task_policy(struct task_struct *); +extern bool numa_demotion_enabled; + #else struct mempolicy {}; @@ -292,5 +294,7 @@ static inline nodemask_t *policy_nodemask_current(gfp_t gfp) { return NULL; } + +#define numa_demotion_enabled false #endif /* CONFIG_NUMA */ #endif diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 939eabcaf488f..e675bfb856da7 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -3021,3 +3021,64 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) p += scnprintf(p, buffer + maxlen - p, ":%*pbl", nodemask_pr_args(&nodes)); } + +bool numa_demotion_enabled = false; + +#ifdef CONFIG_SYSFS +static ssize_t numa_demotion_enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%s\n", + numa_demotion_enabled? "true" : "false"); +} + +static ssize_t numa_demotion_enabled_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1)) + numa_demotion_enabled = true; + else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1)) + numa_demotion_enabled = false; + else + return -EINVAL; + + return count; +} + +static struct kobj_attribute numa_demotion_enabled_attr = + __ATTR(demotion_enabled, 0644, numa_demotion_enabled_show, + numa_demotion_enabled_store); + +static struct attribute *numa_attrs[] = { + &numa_demotion_enabled_attr.attr, + NULL, +}; + +static const struct attribute_group numa_attr_group = { + .attrs = numa_attrs, +}; + +static int __init numa_init_sysfs(void) +{ + int err; + struct kobject *numa_kobj; + + numa_kobj = kobject_create_and_add("numa", mm_kobj); + if (!numa_kobj) { + pr_err("failed to create numa kobject\n"); + return -ENOMEM; + } + err = sysfs_create_group(numa_kobj, &numa_attr_group); + if (err) { + pr_err("failed to register numa group\n"); + goto delete_obj; + } + return 0; + +delete_obj: + kobject_put(numa_kobj); + return err; +} +subsys_initcall(numa_init_sysfs); +#endif diff --git a/mm/vmscan.c b/mm/vmscan.c index ce8a27ece0bed..874efeeca4e4e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -524,6 +524,8 @@ static long add_nr_deferred(long nr, struct shrinker *shrinker, static bool can_demote(int nid, struct scan_control *sc) { + if (!numa_demotion_enabled) + return false; if (sc) { if (sc->no_demotion) return false; @@ -534,8 +536,7 @@ static bool can_demote(int nid, struct scan_control *sc) if (next_demotion_node(nid) == NUMA_NO_NODE) return false; - // FIXME: actually enable this later in the series - return false; + return true; } static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg, From a26af674f738d2cfda19e8df1e0b9b94acb22d52 Mon Sep 17 00:00:00 2001 From: Hui Su Date: Tue, 24 Aug 2021 09:59:32 +1000 Subject: [PATCH 201/365] mm/vmpressure: replace vmpressure_to_css() with vmpressure_to_memcg() We can get memcg directly form vmpr instead of vmpr->memcg->css->memcg, so add a new func helper vmpressure_to_memcg(). And no code will use vmpressure_to_css(), so delete it. Link: https://lkml.kernel.org/r/20210630112146.455103-1-suhui@zeku.com Signed-off-by: Hui Su Acked-by: Michal Hocko Acked-by: Chris Down Cc: Johannes Weiner Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/vmpressure.h | 2 +- mm/memcontrol.c | 4 ++-- mm/vmpressure.c | 3 +-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h index 6d28bc433c1cf..6a2f51ebbfd35 100644 --- a/include/linux/vmpressure.h +++ b/include/linux/vmpressure.h @@ -37,7 +37,7 @@ extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio); extern void vmpressure_init(struct vmpressure *vmpr); extern void vmpressure_cleanup(struct vmpressure *vmpr); extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg); -extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr); +extern struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr); extern int vmpressure_register_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd, const char *args); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 81fa83133c2c4..486454344ede7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -256,9 +256,9 @@ struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) return &memcg->vmpressure; } -struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) +struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr) { - return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; + return container_of(vmpr, struct mem_cgroup, vmpressure); } #ifdef CONFIG_MEMCG_KMEM diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 9b172561fded7..76518e4166dc9 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -74,8 +74,7 @@ static struct vmpressure *work_to_vmpressure(struct work_struct *work) static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr) { - struct cgroup_subsys_state *css = vmpressure_to_css(vmpr); - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *memcg = vmpressure_to_memcg(vmpr); memcg = parent_mem_cgroup(memcg); if (!memcg) From 3ba26f4ac58e71ed83331252a40ff1b8abda155c Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 24 Aug 2021 09:59:32 +1000 Subject: [PATCH 202/365] mm/vmscan: remove the PageDirty check after MADV_FREE pages are page_ref_freezed Patch series "Cleanups for vmscan", v2. This series contains cleanups to remove unneeded return value, misleading setting and so on. Also this remove the PageDirty check after MADV_FREE pages are page_ref_freezed. More details can be found in the respective changelogs. This patch (of 4): If the MADV_FREE pages are redirtied before they could be reclaimed, put the pages back to anonymous LRU list by setting SwapBacked flag and the pages will be reclaimed in normal swapout way. But as Yu Zhao pointed out, "The page has only one reference left, which is from the isolation. After the caller puts the page back on lru and drops the reference, the page will be freed anyway. It doesn't matter which lru it goes." So we don't bother checking PageDirty here. [Yu Zhao's comment is also quoted in the code.] Link: https://lkml.kernel.org/r/20210717065911.61497-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20210717065911.61497-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Yu Zhao Cc: Johannes Weiner Cc: Vlastimil Babka Cc: Michal Hocko Cc: Jens Axboe Cc: Joonsoo Kim Cc: Alex Shi Cc: Alistair Popple Cc: Matthew Wilcox Cc: Minchan Kim Cc: David Hildenbrand Cc: Shaohua Li Cc: Hillf Danton Cc: John Hubbard Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/vmscan.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 874efeeca4e4e..776f45692e685 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1739,11 +1739,14 @@ static unsigned int shrink_page_list(struct list_head *page_list, /* follow __remove_mapping for reference */ if (!page_ref_freeze(page, 1)) goto keep_locked; - if (PageDirty(page)) { - page_ref_unfreeze(page, 1); - goto keep_locked; - } - + /* + * The page has only one reference left, which is + * from the isolation. After the caller puts the + * page back on lru and drops the reference, the + * page will be freed anyway. It doesn't matter + * which lru it goes. So we don't bother checking + * PageDirty here. + */ count_vm_event(PGLAZYFREED); count_memcg_page_event(page, PGLAZYFREED); } else if (!mapping || !__remove_mapping(mapping, page, true, From 8b8096f6084637cabae3e47791a1f5454a6bbd6e Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 24 Aug 2021 09:59:32 +1000 Subject: [PATCH 203/365] mm/vmscan: remove misleading setting to sc->priority The priority field of sc is used to control how many pages we should scan at once while we always traverse the list to shrink the pages in these functions. So these settings are unneeded and misleading. Link: https://lkml.kernel.org/r/20210717065911.61497-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Alex Shi Cc: Alistair Popple Cc: David Hildenbrand Cc: Hillf Danton Cc: Jens Axboe Cc: Johannes Weiner Cc: John Hubbard Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Michal Hocko Cc: Minchan Kim Cc: Shaohua Li Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/vmscan.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 776f45692e685..00e468b4c1c1d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1827,7 +1827,6 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, { struct scan_control sc = { .gfp_mask = GFP_KERNEL, - .priority = DEF_PRIORITY, .may_unmap = 1, }; struct reclaim_stat stat; @@ -2452,7 +2451,6 @@ unsigned long reclaim_pages(struct list_head *page_list) unsigned int noreclaim_flag; struct scan_control sc = { .gfp_mask = GFP_KERNEL, - .priority = DEF_PRIORITY, .may_writepage = 1, .may_unmap = 1, .may_swap = 1, From 7514fe0c25a34edeb527d238365b49a6e6ea03c6 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 24 Aug 2021 09:59:33 +1000 Subject: [PATCH 204/365] mm/vmscan: remove unneeded return value of kswapd_run() The return value of kswapd_run() is unused now. Clean it up. Link: https://lkml.kernel.org/r/20210717065911.61497-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Michal Hocko Cc: Alex Shi Cc: Alistair Popple Cc: David Hildenbrand Cc: Hillf Danton Cc: Jens Axboe Cc: Johannes Weiner Cc: John Hubbard Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Minchan Kim Cc: Shaohua Li Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/swap.h | 2 +- mm/vmscan.c | 7 ++----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index f30d26b0f71db..ba52f3a3478e3 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -408,7 +408,7 @@ static inline bool node_reclaim_enabled(void) extern void check_move_unevictable_pages(struct pagevec *pvec); -extern int kswapd_run(int nid); +extern void kswapd_run(int nid); extern void kswapd_stop(int nid); #ifdef CONFIG_SWAP diff --git a/mm/vmscan.c b/mm/vmscan.c index 00e468b4c1c1d..2ce151d0d54ee 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4441,23 +4441,20 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) * This kswapd start function will be called by init and node-hot-add. * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added. */ -int kswapd_run(int nid) +void kswapd_run(int nid) { pg_data_t *pgdat = NODE_DATA(nid); - int ret = 0; if (pgdat->kswapd) - return 0; + return; pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); if (IS_ERR(pgdat->kswapd)) { /* failure at boot is fatal */ BUG_ON(system_state < SYSTEM_RUNNING); pr_err("Failed to start kswapd on node %d\n", nid); - ret = PTR_ERR(pgdat->kswapd); pgdat->kswapd = NULL; } - return ret; } /* From 0f518cf5d82fddf59bc76b852450943d547f42ce Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 24 Aug 2021 09:59:33 +1000 Subject: [PATCH 205/365] mm/vmscan: add 'else' to remove check_pending label We could add 'else' to remove the somewhat odd check_pending label to make code core succinct. Link: https://lkml.kernel.org/r/20210717065911.61497-5-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Michal Hocko Cc: Alex Shi Cc: Alistair Popple Cc: David Hildenbrand Cc: Hillf Danton Cc: Jens Axboe Cc: Johannes Weiner Cc: John Hubbard Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Minchan Kim Cc: Shaohua Li Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/vmscan.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 2ce151d0d54ee..46e336f0ec8be 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3585,18 +3585,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, * blocked waiting on the same lock. Instead, throttle for up to a * second before continuing. */ - if (!(gfp_mask & __GFP_FS)) { + if (!(gfp_mask & __GFP_FS)) wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, allow_direct_reclaim(pgdat), HZ); + else + /* Throttle until kswapd wakes the process */ + wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, + allow_direct_reclaim(pgdat)); - goto check_pending; - } - - /* Throttle until kswapd wakes the process */ - wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, - allow_direct_reclaim(pgdat)); - -check_pending: if (fatal_signal_pending(current)) return true; From 0b8efcb0bf057dc0203a022334d5934867a6c212 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 24 Aug 2021 09:59:33 +1000 Subject: [PATCH 206/365] mm, vmscan: guarantee drop_slab_node() termination drop_slab_node() is called as part of echo 2>/proc/sys/vm/drop_caches operation. It iterates over all memcgs and calls shrink_slab() which in turn iterates over all slab shrinkers. Freed objects are counted and as long as the total number of freed objects from all memcgs and shrinkers is higher than 10, drop_slab_node() loops for another full memcgs*shrinkers iteration. This arbitrary constant threshold of 10 can result in effectively an infinite loop on a system with large number of memcgs and/or parallel activity that allocates new objects. This has been reported previously by Chunxin Zang [1] and recently by our customer. The previous report [1] has resulted in commit 069c411de40a ("mm/vmscan: fix infinite loop in drop_slab_node") which added a check for signals allowing the user to terminate the command writing to drop_caches. At the time it was also considered to make the threshold grow with each iteration to guarantee termination, but such patch hasn't been formally proposed yet. This patch implements the dynamically growing threshold. At first iteration it's enough to free one object to continue, and this threshold effectively doubles with each iteration. Our customer's feedback was positive. There is always a risk that this change will result on some system in a previously terminating drop_caches operation to terminate sooner and free fewer objects. Ideally the semantics would guarantee freeing all freeable objects that existed at the moment of starting the operation, while not looping forever for newly allocated objects, but that's not feasible to track. In the less ideal solution based on thresholds, arguably the termination guarantee is more important than the exhaustiveness guarantee. If there are reports of large regression wrt being exhaustive, we can tune how fast the threshold grows. [1] https://lore.kernel.org/lkml/20200909152047.27905-1-zangchunxin@bytedance.com/T/#u Link: https://lkml.kernel.org/r/20210818152239.25502-1-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reported-by: Chunxin Zang Cc: Muchun Song Cc: Chris Down Cc: Michal Hocko Cc: Matthew Wilcox Cc: Vlastimil Babka Cc: Kefeng Wang Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/vmscan.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 46e336f0ec8be..c0a7e2a3651f8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -939,6 +939,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, void drop_slab_node(int nid) { unsigned long freed; + int shift = 0; do { struct mem_cgroup *memcg = NULL; @@ -951,7 +952,7 @@ void drop_slab_node(int nid) do { freed += shrink_slab(GFP_KERNEL, nid, memcg, 0); } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); - } while (freed > 10); + } while ((freed >> shift++) > 0); } void drop_slab(void) From a026b83fa9cef563a556b8904d2faa78178c25e2 Mon Sep 17 00:00:00 2001 From: Charan Teja Reddy Date: Tue, 24 Aug 2021 09:59:33 +1000 Subject: [PATCH 207/365] mm: compaction: optimize proactive compaction deferrals Vlastimil Babka figured out that when fragmentation score didn't go down across the proactive compaction i.e. when no progress is made, next wake up for proactive compaction is deferred for 1 << COMPACT_MAX_DEFER_SHIFT, i.e. 64 times, with each wakeup interval of HPAGE_FRAG_CHECK_INTERVAL_MSEC(=500). In each of this wakeup, it just decrement 'proactive_defer' counter and goes sleep i.e. it is getting woken to just decrement a counter. The same deferral time can also achieved by simply doing the HPAGE_FRAG_CHECK_INTERVAL_MSEC << COMPACT_MAX_DEFER_SHIFT thus unnecessary wakeup of kcompact thread is avoided thus also removes the need of 'proactive_defer' thread counter. Link: https://lore.kernel.org/linux-fsdevel/88abfdb6-2c13-b5a6-5b46-742d12d1c910@suse.cz/ Link: https://lkml.kernel.org/r/1626869599-25412-1-git-send-email-charante@codeaurora.org Signed-off-by: Charan Teja Reddy Acked-by: Vlastimil Babka Reviewed-by: Khalid Aziz Acked-by: David Rientjes Cc: Nitin Gupta Cc: Vinayak Menon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/compaction.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 61fb64f47a069..8b111bad89a18 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2885,7 +2885,8 @@ static int kcompactd(void *p) { pg_data_t *pgdat = (pg_data_t *)p; struct task_struct *tsk = current; - unsigned int proactive_defer = 0; + long default_timeout = msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC); + long timeout = default_timeout; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); @@ -2902,23 +2903,30 @@ static int kcompactd(void *p) trace_mm_compaction_kcompactd_sleep(pgdat->node_id); if (wait_event_freezable_timeout(pgdat->kcompactd_wait, - kcompactd_work_requested(pgdat), - msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC))) { + kcompactd_work_requested(pgdat), timeout)) { psi_memstall_enter(&pflags); kcompactd_do_work(pgdat); psi_memstall_leave(&pflags); + /* + * Reset the timeout value. The defer timeout by + * proactive compaction can effectively lost + * here but that is fine as the condition of the + * zone changed substantionally and carrying on + * with the previous defer is not useful. + */ + timeout = default_timeout; continue; } - /* kcompactd wait timeout */ + /* + * Start the proactive work with default timeout. Based + * on the fragmentation score, this timeout is updated. + */ + timeout = default_timeout; if (should_proactive_compact_node(pgdat)) { unsigned int prev_score, score; - if (proactive_defer) { - proactive_defer--; - continue; - } prev_score = fragmentation_score_node(pgdat); proactive_compact_node(pgdat); score = fragmentation_score_node(pgdat); @@ -2926,8 +2934,9 @@ static int kcompactd(void *p) * Defer proactive compaction if the fragmentation * score did not go down i.e. no progress made. */ - proactive_defer = score < prev_score ? - 0 : 1 << COMPACT_MAX_DEFER_SHIFT; + if (unlikely(score >= prev_score)) + timeout = + default_timeout << COMPACT_MAX_DEFER_SHIFT; } } From 7cbda6083ae15c77739e8dd50ca01ab95fc8e58f Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 24 Aug 2021 09:59:33 +1000 Subject: [PATCH 208/365] mm-compaction-optimize-proactive-compaction-deferrals-fix tweak comment Cc: Charan Teja Reddy Cc: David Rientjes Cc: Khalid Aziz Cc: Nitin Gupta Cc: Vinayak Menon Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/compaction.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 8b111bad89a18..4ee0d40d93f2b 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2909,11 +2909,11 @@ static int kcompactd(void *p) kcompactd_do_work(pgdat); psi_memstall_leave(&pflags); /* - * Reset the timeout value. The defer timeout by - * proactive compaction can effectively lost - * here but that is fine as the condition of the - * zone changed substantionally and carrying on - * with the previous defer is not useful. + * Reset the timeout value. The defer timeout from + * proactive compaction is lost here but that is fine + * as the condition of the zone changing substantionally + * then carrying on with the previous defer interval is + * not useful. */ timeout = default_timeout; continue; From ef5a1ba98bc828b178b479b4726347e6d60fc544 Mon Sep 17 00:00:00 2001 From: Charan Teja Reddy Date: Tue, 24 Aug 2021 09:59:34 +1000 Subject: [PATCH 209/365] mm: compaction: support triggering of proactive compaction by user The proactive compaction[1] gets triggered for every 500msec and run compaction on the node for COMPACTION_HPAGE_ORDER (usually order-9) pages based on the value set to sysctl.compaction_proactiveness. Triggering the compaction for every 500msec in search of COMPACTION_HPAGE_ORDER pages is not needed for all applications, especially on the embedded system usecases which may have few MB's of RAM. Enabling the proactive compaction in its state will endup in running almost always on such systems. Other side, proactive compaction can still be very much useful for getting a set of higher order pages in some controllable manner(controlled by using the sysctl.compaction_proactiveness). So, on systems where enabling the proactive compaction always may proove not required, can trigger the same from user space on write to its sysctl interface. As an example, say app launcher decide to launch the memory heavy application which can be launched fast if it gets more higher order pages thus launcher can prepare the system in advance by triggering the proactive compaction from userspace. This triggering of proactive compaction is done on a write to sysctl.compaction_proactiveness by user. [1]https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit?id=facdaa917c4d5a376d09d25865f5a863f906234a Link: https://lkml.kernel.org/r/1627653207-12317-1-git-send-email-charante@codeaurora.org Signed-off-by: Charan Teja Reddy Acked-by: Vlastimil Babka Acked-by: Rafael Aquini Cc: Mike Rapoport Cc: Luis Chamberlain Cc: Kees Cook Cc: Iurii Zaikin Cc: Dave Hansen Cc: Mel Gorman Cc: Nitin Gupta Cc: Jonathan Corbet Cc: Khalid Aziz Cc: David Rientjes Cc: Vinayak Menon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- Documentation/admin-guide/sysctl/vm.rst | 3 +- include/linux/compaction.h | 2 ++ include/linux/mmzone.h | 1 + kernel/sysctl.c | 2 +- mm/compaction.c | 38 +++++++++++++++++++++++-- 5 files changed, 42 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 003d5cc3751b3..b526cf6793a69 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -118,7 +118,8 @@ compaction_proactiveness This tunable takes a value in the range [0, 100] with a default value of 20. This tunable determines how aggressively compaction is done in the -background. Setting it to 0 disables proactive compaction. +background. On write of non zero value to this tunable will immediately +trigger the proactive compaction. Setting it to 0 disables proactive compaction. Note that compaction has a non-trivial system-wide impact as pages belonging to different processes are moved around, which could also lead diff --git a/include/linux/compaction.h b/include/linux/compaction.h index c24098c7accac..34bce35c808dd 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -84,6 +84,8 @@ static inline unsigned long compact_gap(unsigned int order) extern unsigned int sysctl_compaction_proactiveness; extern int sysctl_compaction_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos); +extern int compaction_proactiveness_sysctl_handler(struct ctl_table *table, + int write, void *buffer, size_t *length, loff_t *ppos); extern int sysctl_extfrag_threshold; extern int sysctl_compact_unevictable_allowed; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 59bad25ce78e0..1bd5f5955f9a7 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -846,6 +846,7 @@ typedef struct pglist_data { enum zone_type kcompactd_highest_zoneidx; wait_queue_head_t kcompactd_wait; struct task_struct *kcompactd; + bool proactive_compact_trigger; #endif /* * This is a per-node reserve of pages that are not available diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 272f4a272f8cf..297f0b3966bd3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2871,7 +2871,7 @@ static struct ctl_table vm_table[] = { .data = &sysctl_compaction_proactiveness, .maxlen = sizeof(sysctl_compaction_proactiveness), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = compaction_proactiveness_sysctl_handler, .extra1 = SYSCTL_ZERO, .extra2 = &one_hundred, }, diff --git a/mm/compaction.c b/mm/compaction.c index 4ee0d40d93f2b..fa9b2b598eabe 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2706,6 +2706,30 @@ static void compact_nodes(void) */ unsigned int __read_mostly sysctl_compaction_proactiveness = 20; +int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write, + void *buffer, size_t *length, loff_t *ppos) +{ + int rc, nid; + + rc = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (rc) + return rc; + + if (write && sysctl_compaction_proactiveness) { + for_each_online_node(nid) { + pg_data_t *pgdat = NODE_DATA(nid); + + if (pgdat->proactive_compact_trigger) + continue; + + pgdat->proactive_compact_trigger = true; + wake_up_interruptible(&pgdat->kcompactd_wait); + } + } + + return 0; +} + /* * This is the entry point for compacting all nodes via * /proc/sys/vm/compact_memory @@ -2750,7 +2774,8 @@ void compaction_unregister_node(struct node *node) static inline bool kcompactd_work_requested(pg_data_t *pgdat) { - return pgdat->kcompactd_max_order > 0 || kthread_should_stop(); + return pgdat->kcompactd_max_order > 0 || kthread_should_stop() || + pgdat->proactive_compact_trigger; } static bool kcompactd_node_suitable(pg_data_t *pgdat) @@ -2901,9 +2926,16 @@ static int kcompactd(void *p) while (!kthread_should_stop()) { unsigned long pflags; + /* + * Avoid the unnecessary wakeup for proactive compaction + * when it is disabled. + */ + if (!sysctl_compaction_proactiveness) + timeout = MAX_SCHEDULE_TIMEOUT; trace_mm_compaction_kcompactd_sleep(pgdat->node_id); if (wait_event_freezable_timeout(pgdat->kcompactd_wait, - kcompactd_work_requested(pgdat), timeout)) { + kcompactd_work_requested(pgdat), timeout) && + !pgdat->proactive_compact_trigger) { psi_memstall_enter(&pflags); kcompactd_do_work(pgdat); @@ -2938,6 +2970,8 @@ static int kcompactd(void *p) timeout = default_timeout << COMPACT_MAX_DEFER_SHIFT; } + if (unlikely(pgdat->proactive_compact_trigger)) + pgdat->proactive_compact_trigger = false; } return 0; From dd7bc410c396113fdaf733a176d1056f50254a1e Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 24 Aug 2021 09:59:34 +1000 Subject: [PATCH 210/365] mm-compaction-support-triggering-of-proactive-compaction-by-user-fix tweak vm.rst, per Mike Cc: Mike Rapoport Cc: Charan Teja Reddy Cc: Dave Hansen Cc: David Rientjes Cc: Iurii Zaikin Cc: Jonathan Corbet Cc: Kees Cook Cc: Khalid Aziz Cc: Luis Chamberlain Cc: Mel Gorman Cc: Nitin Gupta Cc: Vinayak Menon Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- Documentation/admin-guide/sysctl/vm.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index b526cf6793a69..5e795202111f2 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -118,7 +118,7 @@ compaction_proactiveness This tunable takes a value in the range [0, 100] with a default value of 20. This tunable determines how aggressively compaction is done in the -background. On write of non zero value to this tunable will immediately +background. Write of a non zero value to this tunable will immediately trigger the proactive compaction. Setting it to 0 disables proactive compaction. Note that compaction has a non-trivial system-wide impact as pages From 91a01033a5d29feb19b8f02bbdb237a11474184a Mon Sep 17 00:00:00 2001 From: Xiyu Yang Date: Tue, 24 Aug 2021 09:59:34 +1000 Subject: [PATCH 211/365] mm/mempolicy: convert from atomic_t to refcount_t on mempolicy->refcnt refcount_t type and corresponding API can protect refcounters from accidental underflow and overflow and further use-after-free situations. Link: https://lkml.kernel.org/r/1626683671-64407-1-git-send-email-xiyuyang19@fudan.edu.cn Signed-off-by: Xiyu Yang Signed-off-by: Xin Tan Acked-by: Ben Widawsky Reviewed-by: Muchun Song Cc: Feng Tang Cc: Mike Kravetz Cc: Muchun Song Cc: Yanfei Xu Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/mempolicy.h | 5 +++-- mm/mempolicy.c | 8 ++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 4ca025e2a77ef..0117e1ec7b1e1 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -6,6 +6,7 @@ #ifndef _LINUX_MEMPOLICY_H #define _LINUX_MEMPOLICY_H 1 +#include #include #include #include @@ -43,7 +44,7 @@ struct mm_struct; * to 1, representing the caller of mpol_dup(). */ struct mempolicy { - atomic_t refcnt; + refcount_t refcnt; unsigned short mode; /* See MPOL_* above */ unsigned short flags; /* See set_mempolicy() MPOL_F_* above */ nodemask_t nodes; /* interleave/bind/perfer */ @@ -94,7 +95,7 @@ static inline struct mempolicy *mpol_dup(struct mempolicy *pol) static inline void mpol_get(struct mempolicy *pol) { if (pol) - atomic_inc(&pol->refcnt); + refcount_inc(&pol->refcnt); } extern bool __mpol_equal(struct mempolicy *a, struct mempolicy *b); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e675bfb856da7..1dd82b8172349 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -298,7 +298,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!policy) return ERR_PTR(-ENOMEM); - atomic_set(&policy->refcnt, 1); + refcount_set(&policy->refcnt, 1); policy->mode = mode; policy->flags = flags; @@ -308,7 +308,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, /* Slow path of a mpol destructor. */ void __mpol_put(struct mempolicy *p) { - if (!atomic_dec_and_test(&p->refcnt)) + if (!refcount_dec_and_test(&p->refcnt)) return; kmem_cache_free(policy_cache, p); } @@ -2290,7 +2290,7 @@ struct mempolicy *__mpol_dup(struct mempolicy *old) nodemask_t mems = cpuset_mems_allowed(current); mpol_rebind_policy(new, &mems); } - atomic_set(&new->refcnt, 1); + refcount_set(&new->refcnt, 1); return new; } @@ -2581,7 +2581,7 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start, goto alloc_new; *mpol_new = *n->policy; - atomic_set(&mpol_new->refcnt, 1); + refcount_set(&mpol_new->refcnt, 1); sp_node_init(n_new, end, n->end, mpol_new); n->end = start; sp_insert(sp, n_new); From 74b9dc76d42a3fbf67556077ad343015bbff22e0 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 24 Aug 2021 09:59:34 +1000 Subject: [PATCH 212/365] mm-mempolicy-convert-from-atomic_t-to-refcount_t-on-mempolicy-refcnt-fix fix warnings mm/mempolicy.c:125:42: warning: missing braces around initializer [-Wmissing-braces] 125 | static struct mempolicy default_policy = { | ^ mm/mempolicy.c:125:42: warning: missing braces around initializer [-Wmissing-braces] mm/mempolicy.c: In function 'numa_policy_init': mm/mempolicy.c:2815:32: warning: missing braces around initializer [-Wmissing-braces] 2815 | preferred_node_policy[nid] = (struct mempolicy) { | ^ mm/mempolicy.c:2815:32: warning: missing braces around initializer [-Wmissing-braces] Cc: Xiyu Yang Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/mempolicy.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 1dd82b8172349..6e92d6b011526 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -120,7 +120,7 @@ enum zone_type policy_zone = 0; * run-time system-wide default policy => local allocation */ static struct mempolicy default_policy = { - .refcnt = ATOMIC_INIT(1), /* never free it */ + .refcnt = { ATOMIC_INIT(1), }, /* never free it */ .mode = MPOL_LOCAL, }; @@ -2775,7 +2775,7 @@ void __init numa_policy_init(void) for_each_node(nid) { preferred_node_policy[nid] = (struct mempolicy) { - .refcnt = ATOMIC_INIT(1), + .refcnt = { ATOMIC_INIT(1), }, .mode = MPOL_PREFERRED, .flags = MPOL_F_MOF | MPOL_F_MORON, .nodes = nodemask_of_node(nid), From be5ff1ab2d94ebe72e87ecfeae6ad44ddb2806c1 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 24 Aug 2021 09:59:34 +1000 Subject: [PATCH 213/365] mm/mempolicy: use readable NUMA_NO_NODE macro instead of magic numer The caller of mpol_misplaced() already use NUMA_NO_NODE to check whether current page node is misplaced, thus using NUMA_NO_NODE in mpol_misplaced() instead of magic number is more readable. Link: https://lkml.kernel.org/r/1b77c0ce21183fa86f4db250b115cf5e27396528.1627558356.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/mempolicy.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 6e92d6b011526..174e172a0bfd6 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2425,8 +2425,8 @@ static void sp_free(struct sp_node *n) * node id. Policy determination "mimics" alloc_page_vma(). * Called from fault path where we know the vma and faulting address. * - * Return: -1 if the page is in a node that is valid for this policy, or a - * suitable node ID to allocate a replacement page from. + * Return: NUMA_NO_NODE if the page is in a node that is valid for this + * policy, or a suitable node ID to allocate a replacement page from. */ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr) { @@ -2437,7 +2437,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long int thiscpu = raw_smp_processor_id(); int thisnid = cpu_to_node(thiscpu); int polnid = NUMA_NO_NODE; - int ret = -1; + int ret = NUMA_NO_NODE; pol = get_vma_policy(vma, addr); if (!(pol->flags & MPOL_F_MOF)) From cf5c2b18f1facc4dc36278d752bfb9c1f0fefc36 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 24 Aug 2021 09:59:35 +1000 Subject: [PATCH 214/365] mm/mempolicy: add MPOL_PREFERRED_MANY for multiple preferred nodes Patch series "Introduce multi-preference mempolicy", v7. This patch series introduces the concept of the MPOL_PREFERRED_MANY mempolicy. This mempolicy mode can be used with either the set_mempolicy(2) or mbind(2) interfaces. Like the MPOL_PREFERRED interface, it allows an application to set a preference for nodes which will fulfil memory allocation requests. Unlike the MPOL_PREFERRED mode, it takes a set of nodes. Like the MPOL_BIND interface, it works over a set of nodes. Unlike MPOL_BIND, it will not cause a SIGSEGV or invoke the OOM killer if those preferred nodes are not available. Along with these patches are patches for libnuma, numactl, numademo, and memhog. They still need some polish, but can be found here: https://gitlab.com/bwidawsk/numactl/-/tree/prefer-many It allows new usage: `numactl -P 0,3,4` The goal of the new mode is to enable some use-cases when using tiered memory usage models which I've lovingly named. 1a. The Hare - The interconnect is fast enough to meet bandwidth and latency requirements allowing preference to be given to all nodes with "fast" memory. 1b. The Indiscriminate Hare - An application knows it wants fast memory (or perhaps slow memory), but doesn't care which node it runs on. The application can prefer a set of nodes and then xpu bind to the local node (cpu, accelerator, etc). This reverses the nodes are chosen today where the kernel attempts to use local memory to the CPU whenever possible. This will attempt to use the local accelerator to the memory. 2. The Tortoise - The administrator (or the application itself) is aware it only needs slow memory, and so can prefer that. Much of this is almost achievable with the bind interface, but the bind interface suffers from an inability to fallback to another set of nodes if binding fails to all nodes in the nodemask. Like MPOL_BIND a nodemask is given. Inherently this removes ordering from the preference. > /* Set first two nodes as preferred in an 8 node system. */ > const unsigned long nodes = 0x3 > set_mempolicy(MPOL_PREFER_MANY, &nodes, 8); > /* Mimic interleave policy, but have fallback *. > const unsigned long nodes = 0xaa > set_mempolicy(MPOL_PREFER_MANY, &nodes, 8); Some internal discussion took place around the interface. There are two alternatives which we have discussed, plus one I stuck in: 1. Ordered list of nodes. Currently it's believed that the added complexity is nod needed for expected usecases. 2. A flag for bind to allow falling back to other nodes. This confuses the notion of binding and is less flexible than the current solution. 3. Create flags or new modes that helps with some ordering. This offers both a friendlier API as well as a solution for more customized usage. It's unknown if it's worth the complexity to support this. Here is sample code for how this might work: > // Prefer specific nodes for some something wacky > set_mempolicy(MPOL_PREFER_MANY, 0x17c, 1024); > > // Default > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_SOCKET, NULL, 0); > // which is the same as > set_mempolicy(MPOL_DEFAULT, NULL, 0); > > // The Hare > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE, NULL, 0); > > // The Tortoise > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE_REV, NULL, 0); > > // Prefer the fast memory of the first two sockets > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE, -1, 2); > This patch (of 5): The NUMA APIs currently allow passing in a "preferred node" as a single bit set in a nodemask. If more than one bit it set, bits after the first are ignored. This single node is generally OK for location-based NUMA where memory being allocated will eventually be operated on by a single CPU. However, in systems with multiple memory types, folks want to target a *type* of memory instead of a location. For instance, someone might want some high-bandwidth memory but do not care about the CPU next to which it is allocated. Or, they want a cheap, high capacity allocation and want to target all NUMA nodes which have persistent memory in volatile mode. In both of these cases, the application wants to target a *set* of nodes, but does not want strict MPOL_BIND behavior as that could lead to OOM killer or SIGSEGV. So add MPOL_PREFERRED_MANY policy to support the multiple preferred nodes requirement. This is not a pie-in-the-sky dream for an API. This was a response to a specific ask of more than one group at Intel. Specifically: 1. There are existing libraries that target memory types such as https://github.com/memkind/memkind. These are known to suffer from SIGSEGV's when memory is low on targeted memory "kinds" that span more than one node. The MCDRAM on a Xeon Phi in "Cluster on Die" mode is an example of this. 2. Volatile-use persistent memory users want to have a memory policy which is targeted at either "cheap and slow" (PMEM) or "expensive and fast" (DRAM). However, they do not want to experience allocation failures when the targeted type is unavailable. 3. Allocate-then-run. Generally, we let the process scheduler decide on which physical CPU to run a task. That location provides a default allocation policy, and memory availability is not generally considered when placing tasks. For situations where memory is valuable and constrained, some users want to allocate memory first, *then* allocate close compute resources to the allocation. This is the reverse of the normal (CPU) model. Accelerators such as GPUs that operate on core-mm-managed memory are interested in this model. A check is added in sanitize_mpol_flags() to not permit 'prefer_many' policy to be used for now, and will be removed in later patch after all implementations for 'prefer_many' are ready, as suggested by Michal Hocko. [mhocko@kernel.org: suggest to refine policy_node/policy_nodemask handling] Link: https://lkml.kernel.org/r/1627970362-61305-1-git-send-email-feng.tang@intel.com Link: https://lore.kernel.org/r/20200630212517.308045-4-ben.widawsky@intel.com Link: https://lkml.kernel.org/r/1627970362-61305-2-git-send-email-feng.tang@intel.com Co-developed-by: Ben Widawsky Signed-off-by: Ben Widawsky Signed-off-by: Dave Hansen Signed-off-by: Feng Tang Cc: Michal Hocko Acked-by: Michal Hocko Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Mike Kravetz Cc: Randy Dunlap Cc: Vlastimil Babka Cc: Andi Kleen Cc: Dan Williams Cc: Huang Ying b Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/uapi/linux/mempolicy.h | 1 + mm/mempolicy.c | 73 +++++++++++++++++++++++++++------- 2 files changed, 60 insertions(+), 14 deletions(-) diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 19a00bc7fe865..046d0ccba4cd7 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -22,6 +22,7 @@ enum { MPOL_BIND, MPOL_INTERLEAVE, MPOL_LOCAL, + MPOL_PREFERRED_MANY, MPOL_MAX, /* always last member of enum */ }; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 174e172a0bfd6..6614ac1dcf739 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -31,6 +31,9 @@ * but useful to set in a VMA when you have a non default * process policy. * + * preferred many Try a set of nodes first before normal fallback. This is + * similar to preferred without the special case. + * * default Allocate on the local node first, or when on a VMA * use the process policy. This is what Linux always did * in a NUMA aware kernel and still does by, ahem, default. @@ -207,6 +210,14 @@ static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes) return 0; } +static int mpol_new_preferred_many(struct mempolicy *pol, const nodemask_t *nodes) +{ + if (nodes_empty(*nodes)) + return -EINVAL; + pol->nodes = *nodes; + return 0; +} + static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) { if (nodes_empty(*nodes)) @@ -408,6 +419,10 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { [MPOL_LOCAL] = { .rebind = mpol_rebind_default, }, + [MPOL_PREFERRED_MANY] = { + .create = mpol_new_preferred_many, + .rebind = mpol_rebind_preferred, + }, }; static int migrate_page_add(struct page *page, struct list_head *pagelist, @@ -900,6 +915,7 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) case MPOL_BIND: case MPOL_INTERLEAVE: case MPOL_PREFERRED: + case MPOL_PREFERRED_MANY: *nodes = p->nodes; break; case MPOL_LOCAL: @@ -1446,7 +1462,13 @@ static inline int sanitize_mpol_flags(int *mode, unsigned short *flags) { *flags = *mode & MPOL_MODE_FLAGS; *mode &= ~MPOL_MODE_FLAGS; - if ((unsigned int)(*mode) >= MPOL_MAX) + + /* + * The check should be 'mode >= MPOL_MAX', but as 'prefer_many' + * is not fully implemented, don't permit it to be used for now, + * and the logic will be restored in following patch + */ + if ((unsigned int)(*mode) >= MPOL_PREFERRED_MANY) return -EINVAL; if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES)) return -EINVAL; @@ -1875,16 +1897,27 @@ static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone) */ nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) { + int mode = policy->mode; + /* Lower zones don't get a nodemask applied for MPOL_BIND */ - if (unlikely(policy->mode == MPOL_BIND) && - apply_policy_zone(policy, gfp_zone(gfp)) && - cpuset_nodemask_valid_mems_allowed(&policy->nodes)) + if (unlikely(mode == MPOL_BIND) && + apply_policy_zone(policy, gfp_zone(gfp)) && + cpuset_nodemask_valid_mems_allowed(&policy->nodes)) + return &policy->nodes; + + if (mode == MPOL_PREFERRED_MANY) return &policy->nodes; return NULL; } -/* Return the node id preferred by the given mempolicy, or the given id */ +/* + * Return the preferred node id for 'prefer' mempolicy, and return + * the given id for all other policies. + * + * policy_node() is always coupled with policy_nodemask(), which + * secures the nodemask limit for 'bind' and 'prefer-many' policy. + */ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd) { if (policy->mode == MPOL_PREFERRED) { @@ -1936,7 +1969,9 @@ unsigned int mempolicy_slab_node(void) case MPOL_INTERLEAVE: return interleave_nodes(policy); - case MPOL_BIND: { + case MPOL_BIND: + case MPOL_PREFERRED_MANY: + { struct zoneref *z; /* @@ -2008,12 +2043,12 @@ static inline unsigned interleave_nid(struct mempolicy *pol, * @addr: address in @vma for shared policy lookup and interleave policy * @gfp_flags: for requested zone * @mpol: pointer to mempolicy pointer for reference counted mempolicy - * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask + * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy * * Returns a nid suitable for a huge page allocation and a pointer * to the struct mempolicy for conditional unref after allocation. - * If the effective policy is 'BIND, returns a pointer to the mempolicy's - * @nodemask for filtering the zonelist. + * If the effective policy is 'bind' or 'prefer-many', returns a pointer + * to the mempolicy's @nodemask for filtering the zonelist. * * Must be protected by read_mems_allowed_begin() */ @@ -2021,16 +2056,18 @@ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask) { int nid; + int mode; *mpol = get_vma_policy(vma, addr); - *nodemask = NULL; /* assume !MPOL_BIND */ + *nodemask = NULL; + mode = (*mpol)->mode; - if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { + if (unlikely(mode == MPOL_INTERLEAVE)) { nid = interleave_nid(*mpol, vma, addr, huge_page_shift(hstate_vma(vma))); } else { nid = policy_node(gfp_flags, *mpol, numa_node_id()); - if ((*mpol)->mode == MPOL_BIND) + if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY) *nodemask = &(*mpol)->nodes; } return nid; @@ -2063,6 +2100,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) mempolicy = current->mempolicy; switch (mempolicy->mode) { case MPOL_PREFERRED: + case MPOL_PREFERRED_MANY: case MPOL_BIND: case MPOL_INTERLEAVE: *mask = mempolicy->nodes; @@ -2173,7 +2211,7 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, * node and don't fall back to other nodes, as the cost of * remote accesses would likely offset THP benefits. * - * If the policy is interleave, or does not allow the current + * If the policy is interleave or does not allow the current * node in its nodemask, we allocate the standard way. */ if (pol->mode == MPOL_PREFERRED) @@ -2311,6 +2349,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) case MPOL_BIND: case MPOL_INTERLEAVE: case MPOL_PREFERRED: + case MPOL_PREFERRED_MANY: return !!nodes_equal(a->nodes, b->nodes); case MPOL_LOCAL: return true; @@ -2451,6 +2490,8 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long break; case MPOL_PREFERRED: + if (node_isset(curnid, pol->nodes)) + goto out; polnid = first_node(pol->nodes); break; @@ -2465,9 +2506,10 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long break; goto out; } + fallthrough; + case MPOL_PREFERRED_MANY: /* - * allows binding to multiple nodes. * use current page if in policy nodemask, * else select nearest allowed node, if any. * If no allowed nodes, use current [!misplaced]. @@ -2829,6 +2871,7 @@ static const char * const policy_modes[] = [MPOL_BIND] = "bind", [MPOL_INTERLEAVE] = "interleave", [MPOL_LOCAL] = "local", + [MPOL_PREFERRED_MANY] = "prefer (many)", }; @@ -2907,6 +2950,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol) if (!nodelist) err = 0; goto out; + case MPOL_PREFERRED_MANY: case MPOL_BIND: /* * Insist on a nodelist @@ -2993,6 +3037,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) case MPOL_LOCAL: break; case MPOL_PREFERRED: + case MPOL_PREFERRED_MANY: case MPOL_BIND: case MPOL_INTERLEAVE: nodes = pol->nodes; From 932ba5a6e18aaaf185d2286657b0cdb21030300d Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Tue, 24 Aug 2021 09:59:35 +1000 Subject: [PATCH 215/365] mm/memplicy: add page allocation function for MPOL_PREFERRED_MANY policy The semantics of MPOL_PREFERRED_MANY is similar to MPOL_PREFERRED, that it will first try to allocate memory from the preferred node(s), and fallback to all nodes in system when first try fails. Add a dedicated function alloc_pages_preferred_many() for it just like for 'interleave' policy, which will be used by 2 general memoory allocation APIs: alloc_pages() and alloc_pages_vma() Link: https://lore.kernel.org/r/20200630212517.308045-9-ben.widawsky@intel.com Link: https://lkml.kernel.org/r/1627970362-61305-3-git-send-email-feng.tang@intel.com Suggested-by: Michal Hocko Originally-by: Ben Widawsky Co-developed-by: Ben Widawsky Signed-off-by: Ben Widawsky Signed-off-by: Feng Tang Acked-by: Michal Hocko Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Dan Williams Cc: Dave Hansen Cc: David Rientjes Cc: Huang Ying Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Randy Dunlap Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/mempolicy.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 6614ac1dcf739..2fa8a98a5e436 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2166,6 +2166,27 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, return page; } +static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, + int nid, struct mempolicy *pol) +{ + struct page *page; + gfp_t preferred_gfp; + + /* + * This is a two pass approach. The first pass will only try the + * preferred nodes but skip the direct reclaim and allow the + * allocation to fail, while the second pass will try all the + * nodes in system. + */ + preferred_gfp = gfp | __GFP_NOWARN; + preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); + page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes); + if (!page) + page = __alloc_pages(gfp, order, numa_node_id(), NULL); + + return page; +} + /** * alloc_pages_vma - Allocate a page for a VMA. * @gfp: GFP flags. @@ -2201,6 +2222,12 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, goto out; } + if (pol->mode == MPOL_PREFERRED_MANY) { + page = alloc_pages_preferred_many(gfp, order, node, pol); + mpol_cond_put(pol); + goto out; + } + if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) { int hpage_node = node; @@ -2278,6 +2305,9 @@ struct page *alloc_pages(gfp_t gfp, unsigned order) */ if (pol->mode == MPOL_INTERLEAVE) page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); + else if (pol->mode == MPOL_PREFERRED_MANY) + page = alloc_pages_preferred_many(gfp, order, + numa_node_id(), pol); else page = __alloc_pages(gfp, order, policy_node(gfp, pol, numa_node_id()), From 28f6f9cb4a3a2017f1792573a0df81fb0d93bf35 Mon Sep 17 00:00:00 2001 From: Ben Widawsky Date: Tue, 24 Aug 2021 09:59:35 +1000 Subject: [PATCH 216/365] mm/hugetlb: add support for mempolicy MPOL_PREFERRED_MANY Implement the missing huge page allocation functionality while obeying the preferred node semantics. This is similar to the implementation for general page allocation, as it uses a fallback mechanism to try multiple preferred nodes first, and then all other nodes. To avoid adding too many "#ifdef CONFIG_NUMA" check, add a helper function in mempolicy.h to check whether a mempolicy is MPOL_PREFERRED_MANY. [akpm@linux-foundation.org: fix compiling issue when merging with other hugetlb patch] [Thanks to 0day bot for catching the !CONFIG_NUMA compiling issue] [mhocko@suse.com: suggest to remove the #ifdef CONFIG_NUMA check] Link: https://lore.kernel.org/r/20200630212517.308045-12-ben.widawsky@intel.com Link: https://lkml.kernel.org/r/1627970362-61305-4-git-send-email-feng.tang@intel.com Link: https://lkml.kernel.org/r/20210809024430.GA46432@shbuild999.sh.intel.com Signed-off-by: Ben Widawsky Signed-off-by: Feng Tang Co-developed-by: Feng Tang Suggested-by: Michal Hocko Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/hugetlb.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index dd1c1e7d970b7..2b1732ad88f71 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1166,7 +1166,20 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, gfp_mask = htlb_alloc_mask(h); nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask); +#ifdef CONFIG_NUMA + if (mpol->mode == MPOL_PREFERRED_MANY) { + page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask); + if (page) + goto check_reserve; + /* Fallback to all nodes */ + nodemask = NULL; + } +#endif page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask); + +#ifdef CONFIG_NUMA +check_reserve: +#endif if (page && !avoid_reserve && vma_has_reserves(vma, chg)) { SetHPageRestoreReserve(page); h->resv_huge_pages--; @@ -2149,6 +2162,21 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, nodemask_t *nodemask; nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask); +#ifdef CONFIG_NUMA + if (mpol->mode == MPOL_PREFERRED_MANY) { + gfp_t gfp = gfp_mask | __GFP_NOWARN; + + gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); + page = alloc_surplus_huge_page(h, gfp, nid, nodemask, false); + if (page) { + mpol_cond_put(mpol); + return page; + } + + /* Fallback to all nodes */ + nodemask = NULL; + } +#endif page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask, false); mpol_cond_put(mpol); From 9a994fbdc0a516ffbef5a26edaaae7f9d31071a7 Mon Sep 17 00:00:00 2001 From: Ben Widawsky Date: Tue, 24 Aug 2021 09:59:35 +1000 Subject: [PATCH 217/365] mm-hugetlb-add-support-for-mempolicy-mpol_preferred_many-fix add helpers to avoid ifdefs Link: https://lore.kernel.org/r/20200630212517.308045-12-ben.widawsky@intel.com Link: https://lkml.kernel.org/r/1627970362-61305-4-git-send-email-feng.tang@intel.com Link: https://lkml.kernel.org/r/20210809024430.GA46432@shbuild999.sh.intel.com Signed-off-by: Feng Tang Signed-off-by: Ben Widawsky Suggested-by: Michal Hocko Co-developed-by: Feng Tang Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/mempolicy.h | 12 ++++++++++++ mm/hugetlb.c | 34 +++++++++++++--------------------- 2 files changed, 25 insertions(+), 21 deletions(-) diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 0117e1ec7b1e1..60d5e6c3340c1 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -187,6 +187,12 @@ extern void mpol_put_task_policy(struct task_struct *); extern bool numa_demotion_enabled; +static inline bool mpol_is_preferred_many(struct mempolicy *pol) +{ + return (pol->mode == MPOL_PREFERRED_MANY); +} + + #else struct mempolicy {}; @@ -297,5 +303,11 @@ static inline nodemask_t *policy_nodemask_current(gfp_t gfp) } #define numa_demotion_enabled false + +static inline bool mpol_is_preferred_many(struct mempolicy *pol) +{ + return false; +} + #endif /* CONFIG_NUMA */ #endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 2b1732ad88f71..371fec524ca0d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1145,7 +1145,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, unsigned long address, int avoid_reserve, long chg) { - struct page *page; + struct page *page = NULL; struct mempolicy *mpol; gfp_t gfp_mask; nodemask_t *nodemask; @@ -1166,20 +1166,17 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, gfp_mask = htlb_alloc_mask(h); nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask); -#ifdef CONFIG_NUMA - if (mpol->mode == MPOL_PREFERRED_MANY) { + + if (mpol_is_preferred_many(mpol)) { page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask); - if (page) - goto check_reserve; - /* Fallback to all nodes */ + + /* Fallback to all nodes if page==NULL */ nodemask = NULL; } -#endif - page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask); -#ifdef CONFIG_NUMA -check_reserve: -#endif + if (!page) + page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask); + if (page && !avoid_reserve && vma_has_reserves(vma, chg)) { SetHPageRestoreReserve(page); h->resv_huge_pages--; @@ -2162,24 +2159,19 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, nodemask_t *nodemask; nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask); -#ifdef CONFIG_NUMA - if (mpol->mode == MPOL_PREFERRED_MANY) { + if (mpol_is_preferred_many(mpol)) { gfp_t gfp = gfp_mask | __GFP_NOWARN; gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); page = alloc_surplus_huge_page(h, gfp, nid, nodemask, false); - if (page) { - mpol_cond_put(mpol); - return page; - } - /* Fallback to all nodes */ + /* Fallback to all nodes if page==NULL */ nodemask = NULL; } -#endif - page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask, false); - mpol_cond_put(mpol); + if (!page) + page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask, false); + mpol_cond_put(mpol); return page; } From b0eb7fcac8514917091fb0d6c3e87fab53ce5df4 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 24 Aug 2021 09:59:35 +1000 Subject: [PATCH 218/365] mm/hugetlb: Initialize page to NULL in alloc_buddy_huge_page_with_mpol() Clang warns: mm/hugetlb.c:2162:6: warning: variable 'page' is used uninitialized whenever 'if' condition is false [-Wsometimes-uninitialized] if (mpol_is_preferred_many(mpol)) { ^~~~~~~~~~~~~~~~~~~~~~~~~~~~ mm/hugetlb.c:2172:7: note: uninitialized use occurs here if (!page) ^~~~ mm/hugetlb.c:2162:2: note: remove the 'if' if its condition is always true if (mpol_is_preferred_many(mpol)) { ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ mm/hugetlb.c:2155:19: note: initialize the variable 'page' to silence this warning struct page *page; ^ = NULL 1 warning generated. Initialize page to NULL like in dequeue_huge_page_vma() so that page is not used uninitialized. Link: https://lkml.kernel.org/r/20210810200632.3812797-1-nathan@kernel.org Signed-off-by: Nathan Chancellor Cc: Mike Kravetz Cc: Feng Tang Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 371fec524ca0d..a9ddcff4e2969 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2152,7 +2152,7 @@ static struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { - struct page *page; + struct page *page = NULL; struct mempolicy *mpol; gfp_t gfp_mask = htlb_alloc_mask(h); int nid; From 57a365d6d147d78fc84f48605241523223bbc99d Mon Sep 17 00:00:00 2001 From: Ben Widawsky Date: Tue, 24 Aug 2021 09:59:36 +1000 Subject: [PATCH 219/365] mm/mempolicy: advertise new MPOL_PREFERRED_MANY Adds a new mode to the existing mempolicy modes, MPOL_PREFERRED_MANY. MPOL_PREFERRED_MANY will be adequately documented in the internal admin-guide with this patch. Eventually, the man pages for mbind(2), get_mempolicy(2), set_mempolicy(2) and numactl(8) will also have text about this mode. Those shall contain the canonical reference. NUMA systems continue to become more prevalent. New technologies like PMEM make finer grain control over memory access patterns increasingly desirable. MPOL_PREFERRED_MANY allows userspace to specify a set of nodes that will be tried first when performing allocations. If those allocations fail, all remaining nodes will be tried. It's a straight forward API which solves many of the presumptive needs of system administrators wanting to optimize workloads on such machines. The mode will work either per VMA, or per thread. [Michal Hocko: refine kernel doc for MPOL_PREFERRED_MANY] Link: https://lore.kernel.org/r/20200630212517.308045-13-ben.widawsky@intel.com Link: https://lkml.kernel.org/r/1627970362-61305-5-git-send-email-feng.tang@intel.com Signed-off-by: Ben Widawsky Signed-off-by: Feng Tang Acked-by: Michal Hocko Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Dan Williams Cc: Dave Hansen Cc: David Rientjes Cc: Huang Ying Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Randy Dunlap Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- .../admin-guide/mm/numa_memory_policy.rst | 15 +++++++++++---- mm/mempolicy.c | 7 +------ 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Documentation/admin-guide/mm/numa_memory_policy.rst index 067a90a1499c4..64fd0ba0d0570 100644 --- a/Documentation/admin-guide/mm/numa_memory_policy.rst +++ b/Documentation/admin-guide/mm/numa_memory_policy.rst @@ -245,6 +245,13 @@ MPOL_INTERLEAVED address range or file. During system boot up, the temporary interleaved system default policy works in this mode. +MPOL_PREFERRED_MANY + This mode specifices that the allocation should be preferrably + satisfied from the nodemask specified in the policy. If there is + a memory pressure on all nodes in the nodemask, the allocation + can fall back to all existing numa nodes. This is effectively + MPOL_PREFERRED allowed for a mask rather than a single node. + NUMA memory policy supports the following optional mode flags: MPOL_F_STATIC_NODES @@ -253,10 +260,10 @@ MPOL_F_STATIC_NODES nodes changes after the memory policy has been defined. Without this flag, any time a mempolicy is rebound because of a - change in the set of allowed nodes, the node (Preferred) or - nodemask (Bind, Interleave) is remapped to the new set of - allowed nodes. This may result in nodes being used that were - previously undesired. + change in the set of allowed nodes, the preferred nodemask (Preferred + Many), preferred node (Preferred) or nodemask (Bind, Interleave) is + remapped to the new set of allowed nodes. This may result in nodes + being used that were previously undesired. With this flag, if the user-specified nodes overlap with the nodes allowed by the task's cpuset, then the memory policy is diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 2fa8a98a5e436..cab44add7a4aa 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1463,12 +1463,7 @@ static inline int sanitize_mpol_flags(int *mode, unsigned short *flags) *flags = *mode & MPOL_MODE_FLAGS; *mode &= ~MPOL_MODE_FLAGS; - /* - * The check should be 'mode >= MPOL_MAX', but as 'prefer_many' - * is not fully implemented, don't permit it to be used for now, - * and the logic will be restored in following patch - */ - if ((unsigned int)(*mode) >= MPOL_PREFERRED_MANY) + if ((unsigned int)(*mode) >= MPOL_MAX) return -EINVAL; if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES)) return -EINVAL; From ff2f42cac2f805d3d0cce809d9146ee6160d5048 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Tue, 24 Aug 2021 09:59:36 +1000 Subject: [PATCH 220/365] mm/mempolicy: unify the create() func for bind/interleave/prefer-many policies As they all do the same thing: sanity check and save nodemask info, create one mpol_new_nodemask() to reduce redundancy. Link: https://lkml.kernel.org/r/1627970362-61305-6-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang Acked-by: Michal Hocko Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Ben Widawsky Cc: Dan Williams Cc: Dave Hansen Cc: David Rientjes Cc: Huang Ying Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Randy Dunlap Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/mempolicy.c | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index cab44add7a4aa..ee4dc43785b3e 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -192,7 +192,7 @@ static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, nodes_onto(*ret, tmp, *rel); } -static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes) +static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes) { if (nodes_empty(*nodes)) return -EINVAL; @@ -210,22 +210,6 @@ static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes) return 0; } -static int mpol_new_preferred_many(struct mempolicy *pol, const nodemask_t *nodes) -{ - if (nodes_empty(*nodes)) - return -EINVAL; - pol->nodes = *nodes; - return 0; -} - -static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) -{ - if (nodes_empty(*nodes)) - return -EINVAL; - pol->nodes = *nodes; - return 0; -} - /* * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if * any, for the new policy. mpol_new() has already validated the nodes @@ -405,7 +389,7 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { .rebind = mpol_rebind_default, }, [MPOL_INTERLEAVE] = { - .create = mpol_new_interleave, + .create = mpol_new_nodemask, .rebind = mpol_rebind_nodemask, }, [MPOL_PREFERRED] = { @@ -413,14 +397,14 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { .rebind = mpol_rebind_preferred, }, [MPOL_BIND] = { - .create = mpol_new_bind, + .create = mpol_new_nodemask, .rebind = mpol_rebind_nodemask, }, [MPOL_LOCAL] = { .rebind = mpol_rebind_default, }, [MPOL_PREFERRED_MANY] = { - .create = mpol_new_preferred_many, + .create = mpol_new_nodemask, .rebind = mpol_rebind_preferred, }, }; From bbc8c0f2b253b0e0767f3efcf9ca839bd70a9ea8 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Tue, 24 Aug 2021 09:59:36 +1000 Subject: [PATCH 221/365] mm/mempolicy.c: use in_task() in mempolicy_slab_node() Obsoleted in_intrrupt() include task context with disabled BH, it's better to use in_task() instead. Link: https://lkml.kernel.org/r/984ee771-4834-21da-801f-c15c18ddf4d1@virtuozzo.com Signed-off-by: Vasily Averin Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/mempolicy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index ee4dc43785b3e..d9e9002973148 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1934,7 +1934,7 @@ unsigned int mempolicy_slab_node(void) struct mempolicy *policy; int node = numa_mem_id(); - if (in_interrupt()) + if (!in_task()) return node; policy = current->mempolicy; From 461ef12c4375dbd898f21794f6087e0cd6353d35 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 24 Aug 2021 09:59:36 +1000 Subject: [PATCH 222/365] memblock: make memblock_find_in_range method private There are a lot of uses of memblock_find_in_range() along with memblock_reserve() from the times memblock allocation APIs did not exist. memblock_find_in_range() is the very core of memblock allocations, so any future changes to its internal behaviour would mandate updates of all the users outside memblock. Replace the calls to memblock_find_in_range() with an equivalent calls to memblock_phys_alloc() and memblock_phys_alloc_range() and make memblock_find_in_range() private method of memblock. This simplifies the callers, ensures that (unlikely) errors in memblock_reserve() are handled and improves maintainability of memblock_find_in_range(). Link: https://lkml.kernel.org/r/20210816122622.30279-1-rppt@kernel.org Signed-off-by: Mike Rapoport Reviewed-by: Catalin Marinas [arm64] Acked-by: Kirill A. Shutemov Acked-by: Rafael J. Wysocki [ACPI] Acked-by: Russell King (Oracle) Acked-by: Nick Kossifidis [riscv] Tested-by: Guenter Roeck Acked-by: Rob Herring Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/arm/kernel/setup.c | 20 +++++--------- arch/arm64/kvm/hyp/reserved_mem.c | 9 +++---- arch/arm64/mm/init.c | 36 ++++++++----------------- arch/mips/kernel/setup.c | 14 +++++----- arch/riscv/mm/init.c | 44 ++++++++++--------------------- arch/s390/kernel/setup.c | 10 ++++--- arch/x86/kernel/aperture_64.c | 5 ++-- arch/x86/mm/init.c | 23 ++++++++++------ arch/x86/mm/numa.c | 5 ++-- arch/x86/mm/numa_emulation.c | 5 ++-- arch/x86/realmode/init.c | 2 +- drivers/acpi/tables.c | 5 ++-- drivers/base/arch_numa.c | 5 +--- drivers/of/of_reserved_mem.c | 12 ++++++--- include/linux/memblock.h | 2 -- mm/memblock.c | 2 +- 16 files changed, 81 insertions(+), 118 deletions(-) diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index f97eb2371672f..284a80c0b6e10 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -1012,31 +1012,25 @@ static void __init reserve_crashkernel(void) unsigned long long lowmem_max = __pa(high_memory - 1) + 1; if (crash_max > lowmem_max) crash_max = lowmem_max; - crash_base = memblock_find_in_range(CRASH_ALIGN, crash_max, - crash_size, CRASH_ALIGN); + + crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN, + CRASH_ALIGN, crash_max); if (!crash_base) { pr_err("crashkernel reservation failed - No suitable area found.\n"); return; } } else { + unsigned long long crash_max = crash_base + crash_size; unsigned long long start; - start = memblock_find_in_range(crash_base, - crash_base + crash_size, - crash_size, SECTION_SIZE); - if (start != crash_base) { + start = memblock_phys_alloc_range(crash_size, SECTION_SIZE, + crash_base, crash_max); + if (!start) { pr_err("crashkernel reservation failed - memory is in use.\n"); return; } } - ret = memblock_reserve(crash_base, crash_size); - if (ret < 0) { - pr_warn("crashkernel reservation failed - memory is in use (0x%lx)\n", - (unsigned long)crash_base); - return; - } - pr_info("Reserving %ldMB of memory at %ldMB for crashkernel (System RAM: %ldMB)\n", (unsigned long)(crash_size >> 20), (unsigned long)(crash_base >> 20), diff --git a/arch/arm64/kvm/hyp/reserved_mem.c b/arch/arm64/kvm/hyp/reserved_mem.c index d654921dd09b5..578670e3f608a 100644 --- a/arch/arm64/kvm/hyp/reserved_mem.c +++ b/arch/arm64/kvm/hyp/reserved_mem.c @@ -92,12 +92,10 @@ void __init kvm_hyp_reserve(void) * this is unmapped from the host stage-2, and fallback to PAGE_SIZE. */ hyp_mem_size = hyp_mem_pages << PAGE_SHIFT; - hyp_mem_base = memblock_find_in_range(0, memblock_end_of_DRAM(), - ALIGN(hyp_mem_size, PMD_SIZE), - PMD_SIZE); + hyp_mem_base = memblock_phys_alloc(ALIGN(hyp_mem_size, PMD_SIZE), + PMD_SIZE); if (!hyp_mem_base) - hyp_mem_base = memblock_find_in_range(0, memblock_end_of_DRAM(), - hyp_mem_size, PAGE_SIZE); + hyp_mem_base = memblock_phys_alloc(hyp_mem_size, PAGE_SIZE); else hyp_mem_size = ALIGN(hyp_mem_size, PMD_SIZE); @@ -105,7 +103,6 @@ void __init kvm_hyp_reserve(void) kvm_err("Failed to reserve hyp memory\n"); return; } - memblock_reserve(hyp_mem_base, hyp_mem_size); kvm_info("Reserved %lld MiB at 0x%llx\n", hyp_mem_size >> 20, hyp_mem_base); diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 8490ed2917ff2..0bffd2d1854f0 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -74,6 +74,7 @@ phys_addr_t arm64_dma_phys_limit __ro_after_init; static void __init reserve_crashkernel(void) { unsigned long long crash_base, crash_size; + unsigned long long crash_max = arm64_dma_phys_limit; int ret; ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), @@ -84,33 +85,18 @@ static void __init reserve_crashkernel(void) crash_size = PAGE_ALIGN(crash_size); - if (crash_base == 0) { - /* Current arm64 boot protocol requires 2MB alignment */ - crash_base = memblock_find_in_range(0, arm64_dma_phys_limit, - crash_size, SZ_2M); - if (crash_base == 0) { - pr_warn("cannot allocate crashkernel (size:0x%llx)\n", - crash_size); - return; - } - } else { - /* User specifies base address explicitly. */ - if (!memblock_is_region_memory(crash_base, crash_size)) { - pr_warn("cannot reserve crashkernel: region is not memory\n"); - return; - } + /* User specifies base address explicitly. */ + if (crash_base) + crash_max = crash_base + crash_size; - if (memblock_is_region_reserved(crash_base, crash_size)) { - pr_warn("cannot reserve crashkernel: region overlaps reserved memory\n"); - return; - } - - if (!IS_ALIGNED(crash_base, SZ_2M)) { - pr_warn("cannot reserve crashkernel: base address is not 2MB aligned\n"); - return; - } + /* Current arm64 boot protocol requires 2MB alignment */ + crash_base = memblock_phys_alloc_range(crash_size, SZ_2M, + crash_base, crash_max); + if (!crash_base) { + pr_warn("cannot allocate crashkernel (size:0x%llx)\n", + crash_size); + return; } - memblock_reserve(crash_base, crash_size); pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n", crash_base, crash_base + crash_size, crash_size >> 20); diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index 23a140327a0ba..f979adfd4fc20 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -452,8 +452,9 @@ static void __init mips_parse_crashkernel(void) return; if (crash_base <= 0) { - crash_base = memblock_find_in_range(CRASH_ALIGN, CRASH_ADDR_MAX, - crash_size, CRASH_ALIGN); + crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN, + CRASH_ALIGN, + CRASH_ADDR_MAX); if (!crash_base) { pr_warn("crashkernel reservation failed - No suitable area found.\n"); return; @@ -461,8 +462,9 @@ static void __init mips_parse_crashkernel(void) } else { unsigned long long start; - start = memblock_find_in_range(crash_base, crash_base + crash_size, - crash_size, 1); + start = memblock_phys_alloc_range(crash_size, 1, + crash_base, + crash_base + crash_size); if (start != crash_base) { pr_warn("Invalid memory region reserved for crash kernel\n"); return; @@ -656,10 +658,6 @@ static void __init arch_mem_init(char **cmdline_p) mips_reserve_vmcore(); mips_parse_crashkernel(); -#ifdef CONFIG_KEXEC - if (crashk_res.start != crashk_res.end) - memblock_reserve(crashk_res.start, resource_size(&crashk_res)); -#endif device_tree_init(); /* diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 7cb4f391d106f..e6cac495a9e8d 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -819,38 +819,22 @@ static void __init reserve_crashkernel(void) crash_size = PAGE_ALIGN(crash_size); - if (crash_base == 0) { - /* - * Current riscv boot protocol requires 2MB alignment for - * RV64 and 4MB alignment for RV32 (hugepage size) - */ - crash_base = memblock_find_in_range(search_start, search_end, - crash_size, PMD_SIZE); - - if (crash_base == 0) { - pr_warn("crashkernel: couldn't allocate %lldKB\n", - crash_size >> 10); - return; - } - } else { - /* User specifies base address explicitly. */ - if (!memblock_is_region_memory(crash_base, crash_size)) { - pr_warn("crashkernel: requested region is not memory\n"); - return; - } - - if (memblock_is_region_reserved(crash_base, crash_size)) { - pr_warn("crashkernel: requested region is reserved\n"); - return; - } - + if (crash_base) { + search_start = crash_base; + search_end = crash_base + crash_size; + } - if (!IS_ALIGNED(crash_base, PMD_SIZE)) { - pr_warn("crashkernel: requested region is misaligned\n"); - return; - } + /* + * Current riscv boot protocol requires 2MB alignment for + * RV64 and 4MB alignment for RV32 (hugepage size) + */ + crash_base = memblock_phys_alloc_range(crash_size, PMD_SIZE, + search_start, search_end); + if (crash_base == 0) { + pr_warn("crashkernel: couldn't allocate %lldKB\n", + crash_size >> 10); + return; } - memblock_reserve(crash_base, crash_size); pr_info("crashkernel: reserved 0x%016llx - 0x%016llx (%lld MB)\n", crash_base, crash_base + crash_size, crash_size >> 20); diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index ff0f9e8389162..3d9efee0f43c9 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -626,8 +626,9 @@ static void __init reserve_crashkernel(void) return; } low = crash_base ?: low; - crash_base = memblock_find_in_range(low, high, crash_size, - KEXEC_CRASH_MEM_ALIGN); + crash_base = memblock_phys_alloc_range(crash_size, + KEXEC_CRASH_MEM_ALIGN, + low, high); } if (!crash_base) { @@ -636,14 +637,15 @@ static void __init reserve_crashkernel(void) return; } - if (register_memory_notifier(&kdump_mem_nb)) + if (register_memory_notifier(&kdump_mem_nb)) { + memblock_free(crash_base, crash_size); return; + } if (!OLDMEM_BASE && MACHINE_IS_VM) diag10_range(PFN_DOWN(crash_base), PFN_DOWN(crash_size)); crashk_res.start = crash_base; crashk_res.end = crash_base + crash_size - 1; - memblock_remove(crash_base, crash_size); pr_info("Reserving %lluMB of memory at %lluMB " "for crashkernel (System RAM: %luMB)\n", crash_size >> 20, crash_base >> 20, diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 294ed4392a0ec..10562885f5fc6 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -109,14 +109,13 @@ static u32 __init allocate_aperture(void) * memory. Unfortunately we cannot move it up because that would * make the IOMMU useless. */ - addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR, - aper_size, aper_size); + addr = memblock_phys_alloc_range(aper_size, aper_size, + GART_MIN_ADDR, GART_MAX_ADDR); if (!addr) { pr_err("Cannot allocate aperture memory hole [mem %#010lx-%#010lx] (%uKB)\n", addr, addr + aper_size - 1, aper_size >> 10); return 0; } - memblock_reserve(addr, aper_size); pr_info("Mapping aperture over RAM [mem %#010lx-%#010lx] (%uKB)\n", addr, addr + aper_size - 1, aper_size >> 10); register_nosave_region(addr >> PAGE_SHIFT, diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 75ef19aa89037..23a14d82e7838 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -127,14 +127,12 @@ __ref void *alloc_low_pages(unsigned int num) unsigned long ret = 0; if (min_pfn_mapped < max_pfn_mapped) { - ret = memblock_find_in_range( + ret = memblock_phys_alloc_range( + PAGE_SIZE * num, PAGE_SIZE, min_pfn_mapped << PAGE_SHIFT, - max_pfn_mapped << PAGE_SHIFT, - PAGE_SIZE * num , PAGE_SIZE); + max_pfn_mapped << PAGE_SHIFT); } - if (ret) - memblock_reserve(ret, PAGE_SIZE * num); - else if (can_use_brk_pgt) + if (!ret && can_use_brk_pgt) ret = __pa(extend_brk(PAGE_SIZE * num, PAGE_SIZE)); if (!ret) @@ -610,8 +608,17 @@ static void __init memory_map_top_down(unsigned long map_start, unsigned long addr; unsigned long mapped_ram_size = 0; - /* xen has big range in reserved near end of ram, skip it at first.*/ - addr = memblock_find_in_range(map_start, map_end, PMD_SIZE, PMD_SIZE); + /* + * Systems that have many reserved areas near top of the memory, + * e.g. QEMU with less than 1G RAM and EFI enabled, or Xen, will + * require lots of 4K mappings which may exhaust pgt_buf. + * Start with top-most PMD_SIZE range aligned at PMD_SIZE to ensure + * there is enough mapped memory that can be allocated from + * memblock. + */ + addr = memblock_phys_alloc_range(PMD_SIZE, PMD_SIZE, map_start, + map_end); + memblock_free(addr, PMD_SIZE); real_end = addr + PMD_SIZE; /* step_size need to be small so pgt_buf from BRK could cover it */ diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index e94da744386f3..a1b5c71099e61 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -376,15 +376,14 @@ static int __init numa_alloc_distance(void) cnt++; size = cnt * cnt * sizeof(numa_distance[0]); - phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), - size, PAGE_SIZE); + phys = memblock_phys_alloc_range(size, PAGE_SIZE, 0, + PFN_PHYS(max_pfn_mapped)); if (!phys) { pr_warn("Warning: can't allocate distance table!\n"); /* don't retry until explicitly reset */ numa_distance = (void *)1LU; return -ENOMEM; } - memblock_reserve(phys, size); numa_distance = __va(phys); numa_distance_cnt = cnt; diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index 87d77cc52f86e..737491b13728c 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -447,13 +447,12 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) if (numa_dist_cnt) { u64 phys; - phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), - phys_size, PAGE_SIZE); + phys = memblock_phys_alloc_range(phys_size, PAGE_SIZE, 0, + PFN_PHYS(max_pfn_mapped)); if (!phys) { pr_warn("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); goto no_emu; } - memblock_reserve(phys, phys_size); phys_dist = __va(phys); for (i = 0; i < numa_dist_cnt; i++) diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index 6534c92d0f83f..31b5856010cba 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -28,7 +28,7 @@ void __init reserve_real_mode(void) WARN_ON(slab_is_available()); /* Has to be under 1M so we can execute real-mode AP code. */ - mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE); + mem = memblock_phys_alloc_range(size, PAGE_SIZE, 0, 1<<20); if (!mem) pr_info("No sub-1M memory is available for the trampoline\n"); else diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c index a37a1532a5757..f9383736fa0fe 100644 --- a/drivers/acpi/tables.c +++ b/drivers/acpi/tables.c @@ -583,8 +583,8 @@ void __init acpi_table_upgrade(void) } acpi_tables_addr = - memblock_find_in_range(0, ACPI_TABLE_UPGRADE_MAX_PHYS, - all_tables_size, PAGE_SIZE); + memblock_phys_alloc_range(all_tables_size, PAGE_SIZE, + 0, ACPI_TABLE_UPGRADE_MAX_PHYS); if (!acpi_tables_addr) { WARN_ON(1); return; @@ -599,7 +599,6 @@ void __init acpi_table_upgrade(void) * Both memblock_reserve and e820__range_add (via arch_reserve_mem_area) * works fine. */ - memblock_reserve(acpi_tables_addr, all_tables_size); arch_reserve_mem_area(acpi_tables_addr, all_tables_size); /* diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c index 4cc4e117727d8..46c503486e962 100644 --- a/drivers/base/arch_numa.c +++ b/drivers/base/arch_numa.c @@ -279,13 +279,10 @@ static int __init numa_alloc_distance(void) int i, j; size = nr_node_ids * nr_node_ids * sizeof(numa_distance[0]); - phys = memblock_find_in_range(0, PFN_PHYS(max_pfn), - size, PAGE_SIZE); + phys = memblock_phys_alloc_range(size, PAGE_SIZE, 0, PFN_PHYS(max_pfn)); if (WARN_ON(!phys)) return -ENOMEM; - memblock_reserve(phys, size); - numa_distance = __va(phys); numa_distance_cnt = nr_node_ids; diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c index fd3964d242243..59c1390cdf423 100644 --- a/drivers/of/of_reserved_mem.c +++ b/drivers/of/of_reserved_mem.c @@ -33,18 +33,22 @@ static int __init early_init_dt_alloc_reserved_memory_arch(phys_addr_t size, phys_addr_t *res_base) { phys_addr_t base; + int err = 0; end = !end ? MEMBLOCK_ALLOC_ANYWHERE : end; align = !align ? SMP_CACHE_BYTES : align; - base = memblock_find_in_range(start, end, size, align); + base = memblock_phys_alloc_range(size, align, start, end); if (!base) return -ENOMEM; *res_base = base; - if (nomap) - return memblock_mark_nomap(base, size); + if (nomap) { + err = memblock_mark_nomap(base, size); + if (err) + memblock_free(base, size); + } - return memblock_reserve(base, size); + return err; } /* diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 4a53c3ca86bdc..b066024c62e3f 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -99,8 +99,6 @@ void memblock_discard(void); static inline void memblock_discard(void) {} #endif -phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end, - phys_addr_t size, phys_addr_t align); void memblock_allow_resize(void); int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid); int memblock_add(phys_addr_t base, phys_addr_t size); diff --git a/mm/memblock.c b/mm/memblock.c index a69449bffc8d2..e6b4654f9dfda 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -315,7 +315,7 @@ static phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, * Return: * Found address on success, 0 on failure. */ -phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, +static phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, phys_addr_t end, phys_addr_t size, phys_addr_t align) { From 107190fb1d3c9b3d76cf2a69f2eae509b2fa4268 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 24 Aug 2021 09:59:36 +1000 Subject: [PATCH 223/365] mm: introduce process_mrelease system call In modern systems it's not unusual to have a system component monitoring memory conditions of the system and tasked with keeping system memory pressure under control. One way to accomplish that is to kill non-essential processes to free up memory for more important ones. Examples of this are Facebook's OOM killer daemon called oomd and Android's low memory killer daemon called lmkd. For such system component it's important to be able to free memory quickly and efficiently. Unfortunately the time process takes to free up its memory after receiving a SIGKILL might vary based on the state of the process (uninterruptible sleep), size and OPP level of the core the process is running. A mechanism to free resources of the target process in a more predictable way would improve system's ability to control its memory pressure. Introduce process_mrelease system call that releases memory of a dying process from the context of the caller. This way the memory is freed in a more controllable way with CPU affinity and priority of the caller. The workload of freeing the memory will also be charged to the caller. The operation is allowed only on a dying process. After previous discussions [1, 2, 3] the decision was made [4] to introduce a dedicated system call to cover this use case. The API is as follows, int process_mrelease(int pidfd, unsigned int flags); DESCRIPTION The process_mrelease() system call is used to free the memory of an exiting process. The pidfd selects the process referred to by the PID file descriptor. (See pidfd_open(2) for further information) The flags argument is reserved for future use; currently, this argument must be specified as 0. RETURN VALUE On success, process_mrelease() returns 0. On error, -1 is returned and errno is set to indicate the error. ERRORS EBADF pidfd is not a valid PID file descriptor. EAGAIN Failed to release part of the address space. EINTR The call was interrupted by a signal; see signal(7). EINVAL flags is not 0. EINVAL The memory of the task cannot be released because the process is not exiting, the address space is shared with another live process or there is a core dump in progress. ENOSYS This system call is not supported, for example, without MMU support built into Linux. ESRCH The target process does not exist (i.e., it has terminated and been waited on). [1] https://lore.kernel.org/lkml/20190411014353.113252-3-surenb@google.com/ [2] https://lore.kernel.org/linux-api/20201113173448.1863419-1-surenb@google.com/ [3] https://lore.kernel.org/linux-api/20201124053943.1684874-3-surenb@google.com/ [4] https://lore.kernel.org/linux-api/20201223075712.GA4719@lst.de/ Link: https://lkml.kernel.org/r/20210809185259.405936-1-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Shakeel Butt Acked-by: David Hildenbrand Acked-by: Michal Hocko Acked-by: Christian Brauner Cc: David Rientjes Cc: Matthew Wilcox (Oracle) Cc: Johannes Weiner Cc: Roman Gushchin Cc: Rik van Riel Cc: Minchan Kim Cc: Christoph Hellwig Cc: Oleg Nesterov Cc: Jann Horn Cc: Geert Uytterhoeven Cc: Andy Lutomirski Cc: Christian Brauner Cc: Florian Weimer Cc: Jan Engelhardt Cc: Tim Murray Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/oom_kill.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c729a4c4a1ace..831340e7ad8b4 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -1141,3 +1142,72 @@ void pagefault_out_of_memory(void) out_of_memory(&oc); mutex_unlock(&oom_lock); } + +SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) +{ +#ifdef CONFIG_MMU + struct mm_struct *mm = NULL; + struct task_struct *task; + struct task_struct *p; + unsigned int f_flags; + bool reap = true; + struct pid *pid; + long ret = 0; + + if (flags) + return -EINVAL; + + pid = pidfd_get_pid(pidfd, &f_flags); + if (IS_ERR(pid)) + return PTR_ERR(pid); + + task = get_pid_task(pid, PIDTYPE_TGID); + if (!task) { + ret = -ESRCH; + goto put_pid; + } + + /* + * Make sure to choose a thread which still has a reference to mm + * during the group exit + */ + p = find_lock_task_mm(task); + if (!p) { + ret = -ESRCH; + goto put_task; + } + + mm = p->mm; + mmgrab(mm); + + /* If the work has been done already, just exit with success */ + if (test_bit(MMF_OOM_SKIP, &mm->flags)) + reap = false; + else if (!task_will_free_mem(p)) { + reap = false; + ret = -EINVAL; + } + task_unlock(p); + + if (!reap) + goto drop_mm; + + if (mmap_read_lock_killable(mm)) { + ret = -EINTR; + goto drop_mm; + } + if (!__oom_reap_task_mm(mm)) + ret = -EAGAIN; + mmap_read_unlock(mm); + +drop_mm: + mmdrop(mm); +put_task: + put_task_struct(task); +put_pid: + put_pid(pid); + return ret; +#else + return -ENOSYS; +#endif /* CONFIG_MMU */ +} From a056c4704738204db04bb7779182a17f3daf6bb1 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 24 Aug 2021 09:59:37 +1000 Subject: [PATCH 224/365] mm: wire up syscall process_mrelease Split off from prev patch in the series that implements the syscall. Link: https://lkml.kernel.org/r/20210809185259.405936-2-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Geert Uytterhoeven Cc: Andy Lutomirski Cc: Christian Brauner Cc: Christoph Hellwig Cc: David Hildenbrand Cc: David Rientjes Cc: Florian Weimer Cc: Jan Engelhardt Cc: Jann Horn Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Tim Murray Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/alpha/kernel/syscalls/syscall.tbl | 2 ++ arch/arm/tools/syscall.tbl | 2 ++ arch/arm64/include/asm/unistd.h | 2 +- arch/arm64/include/asm/unistd32.h | 2 ++ arch/ia64/kernel/syscalls/syscall.tbl | 2 ++ arch/m68k/kernel/syscalls/syscall.tbl | 2 ++ arch/microblaze/kernel/syscalls/syscall.tbl | 2 ++ arch/mips/kernel/syscalls/syscall_n32.tbl | 2 ++ arch/mips/kernel/syscalls/syscall_n64.tbl | 2 ++ arch/mips/kernel/syscalls/syscall_o32.tbl | 2 ++ arch/parisc/kernel/syscalls/syscall.tbl | 2 ++ arch/powerpc/kernel/syscalls/syscall.tbl | 2 ++ arch/s390/kernel/syscalls/syscall.tbl | 2 ++ arch/sh/kernel/syscalls/syscall.tbl | 2 ++ arch/sparc/kernel/syscalls/syscall.tbl | 2 ++ arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + arch/xtensa/kernel/syscalls/syscall.tbl | 2 ++ include/linux/syscalls.h | 1 + include/uapi/asm-generic/unistd.h | 4 +++- kernel/sys_ni.c | 1 + 21 files changed, 38 insertions(+), 2 deletions(-) diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index a17687ed4b519..605645eae04cd 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -486,3 +486,5 @@ 554 common landlock_create_ruleset sys_landlock_create_ruleset 555 common landlock_add_rule sys_landlock_add_rule 556 common landlock_restrict_self sys_landlock_restrict_self +# 557 reserved for memfd_secret +558 common process_mrelease sys_process_mrelease diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index c5df1179fc5d0..2f32eb8beca8f 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -460,3 +460,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index 727bfc3be99b3..3cb206aea3db1 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -38,7 +38,7 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 447 +#define __NR_compat_syscalls 449 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 99ffcafc736ca..0f49cdb180dd0 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -901,6 +901,8 @@ __SYSCALL(__NR_landlock_create_ruleset, sys_landlock_create_ruleset) __SYSCALL(__NR_landlock_add_rule, sys_landlock_add_rule) #define __NR_landlock_restrict_self 446 __SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self) +#define __NR_process_mrelease 448 +__SYSCALL(__NR_process_mrelease, sys_process_mrelease) /* * Please add new compat syscalls above this comment and update diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index 6d07742c57b88..9bf45f2be966c 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -367,3 +367,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index 541bc1b3a8f95..f1f98ee6c82de 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -446,3 +446,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index a176faca2927d..da49ddd4bb540 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -452,3 +452,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index c2d2e19abea8e..56c8d3cf42edf 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -385,3 +385,5 @@ 444 n32 landlock_create_ruleset sys_landlock_create_ruleset 445 n32 landlock_add_rule sys_landlock_add_rule 446 n32 landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 n32 process_mrelease sys_process_mrelease diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index ac653d08b1ea1..1ca7bc337932b 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -361,3 +361,5 @@ 444 n64 landlock_create_ruleset sys_landlock_create_ruleset 445 n64 landlock_add_rule sys_landlock_add_rule 446 n64 landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 n64 process_mrelease sys_process_mrelease diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 253f2cd70b6ba..fd3a9df60ec27 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -434,3 +434,5 @@ 444 o32 landlock_create_ruleset sys_landlock_create_ruleset 445 o32 landlock_add_rule sys_landlock_add_rule 446 o32 landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 o32 process_mrelease sys_process_mrelease diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index e26187b9ab87b..040df1b7a5892 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -444,3 +444,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index aef2a290e71a3..d8ebd7d37c0f4 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -526,3 +526,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index 64d51ab5a8b45..57233ace30cb5 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -449,3 +449,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease sys_process_mrelease diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index e0a70be77d848..2f6e95eb46900 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -449,3 +449,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index 603f5a8215028..42fc2906215dd 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -492,3 +492,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index ce763a12311cc..661a03bcfbd14 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -452,3 +452,4 @@ 445 i386 landlock_add_rule sys_landlock_add_rule 446 i386 landlock_restrict_self sys_landlock_restrict_self 447 i386 memfd_secret sys_memfd_secret +448 i386 process_mrelease sys_process_mrelease diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index f6b57799c1ea2..807b6a1de8e8d 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -369,6 +369,7 @@ 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self 447 common memfd_secret sys_memfd_secret +448 common process_mrelease sys_process_mrelease # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index 235d67d6ceb45..f4384951f3931 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -417,3 +417,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 69c9a7010081b..00bc170a50f01 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -915,6 +915,7 @@ asmlinkage long sys_mincore(unsigned long start, size_t len, asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior); asmlinkage long sys_process_madvise(int pidfd, const struct iovec __user *vec, size_t vlen, int behavior, unsigned int flags); +asmlinkage long sys_process_mrelease(int pidfd, unsigned int flags); asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, unsigned long prot, unsigned long pgoff, unsigned long flags); diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index a9d6fcd95f42a..14c8fe863c6d8 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -877,9 +877,11 @@ __SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self) #define __NR_memfd_secret 447 __SYSCALL(__NR_memfd_secret, sys_memfd_secret) #endif +#define __NR_process_mrelease 448 +__SYSCALL(__NR_process_mrelease, sys_process_mrelease) #undef __NR_syscalls -#define __NR_syscalls 448 +#define __NR_syscalls 449 /* * 32 bit systems traditionally used different diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 30971b1dd4a92..18a9c2cde7675 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -289,6 +289,7 @@ COND_SYSCALL(munlockall); COND_SYSCALL(mincore); COND_SYSCALL(madvise); COND_SYSCALL(process_madvise); +COND_SYSCALL(process_mrelease); COND_SYSCALL(remap_file_pages); COND_SYSCALL(mbind); COND_SYSCALL_COMPAT(mbind); From 88a3014412e7b6d0ba6d044c25e8559b16e8e558 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Tue, 24 Aug 2021 09:59:37 +1000 Subject: [PATCH 225/365] oom_kill: oom_score_adj broken for processes with small memory usage If you have a process with less than 1000 totalpages, the calculation: adj = (long)p->signal->oom_score_adj; ... adj *= totalpages / 1000; will always result in adj being zero no matter what oom_score_adj is, which could result in the wrong process being picked for killing. Fix by adding 1000 to totalpages before dividing. I ran across this trying to diagnose another problem where I set up a cgroup with a small amount of memory and couldn't get a test program to work right. I'm not sure this is quite right, to keep closer to the current behavior you could do: if (totalpages >= 1000) adj *= totalpages / 1000; but that would map 0-1999 to the same value. But this at least shows the issue. I can provide a test program the shows the issue, but I think it's pretty obvious from the code. Link: https://lkml.kernel.org/r/20210701125430.836308-1-minyard@acm.org Signed-off-by: Corey Minyard Cc: Michal Hocko Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/oom_kill.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 831340e7ad8b4..431d38c3bba8e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -234,8 +234,11 @@ long oom_badness(struct task_struct *p, unsigned long totalpages) mm_pgtables_bytes(p->mm) / PAGE_SIZE; task_unlock(p); - /* Normalize to oom_score_adj units */ - adj *= totalpages / 1000; + /* + * Normalize to oom_score_adj units. You should never + * multiply by zero here, or oom_score_adj will not work. + */ + adj *= (totalpages + 1000) / 1000; points += adj; return points; From 68bfd4075d5ae34be4c146a23c17125a3c659a2d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 24 Aug 2021 09:59:37 +1000 Subject: [PATCH 226/365] mm/migrate: correct kernel-doc notation Use the expected "Return:" format to prevent a kernel-doc warning. mm/migrate.c:1157: warning: Excess function parameter 'returns' description in 'next_demotion_node' Link: https://lkml.kernel.org/r/20210808203151.10632-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/migrate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/migrate.c b/mm/migrate.c index ae923e9b88743..a0aeb3fe46a7c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1149,7 +1149,7 @@ static int node_demotion[MAX_NUMNODES] __read_mostly = * next_demotion_node() - Get the next node in the demotion path * @node: The starting node to lookup the next node * - * @returns: node id for next memory node in the demotion path hierarchy + * Return: node id for next memory node in the demotion path hierarchy * from @node; NUMA_NO_NODE if @node is terminal. This does not keep * @node online or guarantee that it *continues* to be the next demotion * target. From 883146b11ef19d4466be13939c8605c614819cc8 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 24 Aug 2021 09:59:37 +1000 Subject: [PATCH 227/365] mm/thp: make ALLOC_SPLIT_PTLOCKS dependent on USE_SPLIT_PTE_PTLOCKS Split ptlocks need not be defined and allocated unless they are being used. ALLOC_SPLIT_PTLOCKS is inherently dependent on USE_SPLIT_PTE_PTLOCKS. This just makes it explicit and clear. While here drop the spinlock_t element from the struct page when USE_SPLIT_PTE_PTLOCKS is not enabled. Link: https://lkml.kernel.org/r/1621409586-5555-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: Randy Dunlap # build-tested Cc: Matthew Wilcox Cc: Vlastimil Babka Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/mm_types.h | 2 ++ include/linux/mm_types_task.h | 5 +++++ mm/memory.c | 2 +- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 52bbd2b7cb465..f37abb2d222e8 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -159,10 +159,12 @@ struct page { struct mm_struct *pt_mm; /* x86 pgds only */ atomic_t pt_frag_refcount; /* powerpc */ }; +#if USE_SPLIT_PTE_PTLOCKS #if ALLOC_SPLIT_PTLOCKS spinlock_t *ptl; #else spinlock_t ptl; +#endif #endif }; struct { /* ZONE_DEVICE pages */ diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h index c1bc6731125cb..1b222f8039d10 100644 --- a/include/linux/mm_types_task.h +++ b/include/linux/mm_types_task.h @@ -22,7 +22,12 @@ #define USE_SPLIT_PTE_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS) #define USE_SPLIT_PMD_PTLOCKS (USE_SPLIT_PTE_PTLOCKS && \ IS_ENABLED(CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK)) + +#if USE_SPLIT_PTE_PTLOCKS #define ALLOC_SPLIT_PTLOCKS (SPINLOCK_SIZE > BITS_PER_LONG/8) +#else +#define ALLOC_SPLIT_PTLOCKS 0 +#endif /* * The per task VMA cache array: diff --git a/mm/memory.c b/mm/memory.c index 25fc46e872142..94a17a9a48ab2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5441,7 +5441,7 @@ long copy_huge_page_from_user(struct page *dst_page, } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ -#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS +#if ALLOC_SPLIT_PTLOCKS static struct kmem_cache *page_ptl_cachep; From c47e74219d4b911c492b9c676a3cedacd1d6f95b Mon Sep 17 00:00:00 2001 From: Zhansaya Bagdauletkyzy Date: Tue, 24 Aug 2021 09:59:37 +1000 Subject: [PATCH 228/365] selftests: vm: add KSM merge test Patch series "add KSM selftests". Introduce selftests to validate the functionality of KSM. The tests are run on private anonymous pages. Since some KSM tunables are modified, their starting values are saved and restored after testing. At the start, run is set to 2 to ensure that only test pages will be merged (we assume that no applications make madvise syscalls in the background). If KSM config not enabled, all tests will be skipped. This patch (of 4): Add check_ksm_merge() function to check the basic merging feature of KSM. First, some number of identical pages are allocated and the MADV_MERGEABLE advice is given to merge these pages. Then, pages_shared and pages_sharing values are compared with the expected numbers using assert_ksm_pages_count() function. The number of pages can be changed using -p option. Link: https://lkml.kernel.org/r/cover.1626252248.git.zhansayabagdaulet@gmail.com Link: https://lkml.kernel.org/r/90287685c13300972ea84de93d1f3f900373f9fe.1626252248.git.zhansayabagdaulet@gmail.com Signed-off-by: Zhansaya Bagdauletkyzy Reviewed-by: Pavel Tatashin Reviewed-by: Tyler Hicks Cc: Shuah Khan Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- tools/testing/selftests/vm/.gitignore | 1 + tools/testing/selftests/vm/Makefile | 1 + tools/testing/selftests/vm/ksm_tests.c | 306 ++++++++++++++++++++++ tools/testing/selftests/vm/run_vmtests.sh | 16 ++ 4 files changed, 324 insertions(+) create mode 100644 tools/testing/selftests/vm/ksm_tests.c diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore index f0fd80ef17dfc..b02eac613fdda 100644 --- a/tools/testing/selftests/vm/.gitignore +++ b/tools/testing/selftests/vm/.gitignore @@ -27,3 +27,4 @@ hmm-tests memfd_secret local_config.* split_huge_page_test +ksm_tests diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 521243770f26d..e6f22a801b71a 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -45,6 +45,7 @@ TEST_GEN_FILES += thuge-gen TEST_GEN_FILES += transhuge-stress TEST_GEN_FILES += userfaultfd TEST_GEN_FILES += split_huge_page_test +TEST_GEN_FILES += ksm_tests ifeq ($(MACHINE),x86_64) CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_32bit_program.c -m32) diff --git a/tools/testing/selftests/vm/ksm_tests.c b/tools/testing/selftests/vm/ksm_tests.c new file mode 100644 index 0000000000000..d74d5aa34b167 --- /dev/null +++ b/tools/testing/selftests/vm/ksm_tests.c @@ -0,0 +1,306 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include + +#include "../kselftest.h" + +#define KSM_SYSFS_PATH "/sys/kernel/mm/ksm/" +#define KSM_FP(s) (KSM_SYSFS_PATH s) +#define KSM_SCAN_LIMIT_SEC_DEFAULT 120 +#define KSM_PAGE_COUNT_DEFAULT 10l +#define KSM_PROT_STR_DEFAULT "rw" + +struct ksm_sysfs { + unsigned long max_page_sharing; + unsigned long merge_across_nodes; + unsigned long pages_to_scan; + unsigned long run; + unsigned long sleep_millisecs; + unsigned long stable_node_chains_prune_millisecs; + unsigned long use_zero_pages; +}; + +static int ksm_write_sysfs(const char *file_path, unsigned long val) +{ + FILE *f = fopen(file_path, "w"); + + if (!f) { + fprintf(stderr, "f %s\n", file_path); + perror("fopen"); + return 1; + } + if (fprintf(f, "%lu", val) < 0) { + perror("fprintf"); + return 1; + } + fclose(f); + + return 0; +} + +static int ksm_read_sysfs(const char *file_path, unsigned long *val) +{ + FILE *f = fopen(file_path, "r"); + + if (!f) { + fprintf(stderr, "f %s\n", file_path); + perror("fopen"); + return 1; + } + if (fscanf(f, "%lu", val) != 1) { + perror("fscanf"); + return 1; + } + fclose(f); + + return 0; +} + +static int str_to_prot(char *prot_str) +{ + int prot = 0; + + if ((strchr(prot_str, 'r')) != NULL) + prot |= PROT_READ; + if ((strchr(prot_str, 'w')) != NULL) + prot |= PROT_WRITE; + if ((strchr(prot_str, 'x')) != NULL) + prot |= PROT_EXEC; + + return prot; +} + +static void print_help(void) +{ + printf("usage: ksm_tests [-h] [-a prot] [-p page_count] [-l timeout]\n"); + printf(" -a: specify the access protections of pages.\n" + " must be of the form [rwx].\n" + " Default: %s\n", KSM_PROT_STR_DEFAULT); + printf(" -p: specify the number of pages to test.\n" + " Default: %ld\n", KSM_PAGE_COUNT_DEFAULT); + printf(" -l: limit the maximum running time (in seconds) for a test.\n" + " Default: %d seconds\n", KSM_SCAN_LIMIT_SEC_DEFAULT); + + exit(0); +} + +static void *allocate_memory(void *ptr, int prot, int mapping, char data, size_t map_size) +{ + void *map_ptr = mmap(ptr, map_size, PROT_WRITE, mapping, -1, 0); + + if (!map_ptr) { + perror("mmap"); + return NULL; + } + memset(map_ptr, data, map_size); + if (mprotect(map_ptr, map_size, prot)) { + perror("mprotect"); + munmap(map_ptr, map_size); + return NULL; + } + + return map_ptr; +} + +static int ksm_do_scan(int scan_count, struct timespec start_time, int timeout) +{ + struct timespec cur_time; + unsigned long cur_scan, init_scan; + + if (ksm_read_sysfs(KSM_FP("full_scans"), &init_scan)) + return 1; + cur_scan = init_scan; + + while (cur_scan < init_scan + scan_count) { + if (ksm_read_sysfs(KSM_FP("full_scans"), &cur_scan)) + return 1; + if (clock_gettime(CLOCK_MONOTONIC_RAW, &cur_time)) { + perror("clock_gettime"); + return 1; + } + if ((cur_time.tv_sec - start_time.tv_sec) > timeout) { + printf("Scan time limit exceeded\n"); + return 1; + } + } + + return 0; +} + +static int ksm_merge_pages(void *addr, size_t size, struct timespec start_time, int timeout) +{ + if (madvise(addr, size, MADV_MERGEABLE)) { + perror("madvise"); + return 1; + } + if (ksm_write_sysfs(KSM_FP("run"), 1)) + return 1; + + /* Since merging occurs only after 2 scans, make sure to get at least 2 full scans */ + if (ksm_do_scan(2, start_time, timeout)) + return 1; + + return 0; +} + +static bool assert_ksm_pages_count(long dupl_page_count) +{ + unsigned long max_page_sharing, pages_sharing, pages_shared; + + if (ksm_read_sysfs(KSM_FP("pages_shared"), &pages_shared) || + ksm_read_sysfs(KSM_FP("pages_sharing"), &pages_sharing) || + ksm_read_sysfs(KSM_FP("max_page_sharing"), &max_page_sharing)) + return false; + + /* + * Since there must be at least 2 pages for merging and 1 page can be + * shared with the limited number of pages (max_page_sharing), sometimes + * there are 'leftover' pages that cannot be merged. For example, if there + * are 11 pages and max_page_sharing = 10, then only 10 pages will be + * merged and the 11th page won't be affected. As a result, when the number + * of duplicate pages is divided by max_page_sharing and the remainder is 1, + * pages_shared and pages_sharing values will be equal between dupl_page_count + * and dupl_page_count - 1. + */ + if (dupl_page_count % max_page_sharing == 1 || dupl_page_count % max_page_sharing == 0) { + if (pages_shared == dupl_page_count / max_page_sharing && + pages_sharing == pages_shared * (max_page_sharing - 1)) + return true; + } else { + if (pages_shared == (dupl_page_count / max_page_sharing + 1) && + pages_sharing == dupl_page_count - pages_shared) + return true; + } + + return false; +} + +static int ksm_save_def(struct ksm_sysfs *ksm_sysfs) +{ + if (ksm_read_sysfs(KSM_FP("max_page_sharing"), &ksm_sysfs->max_page_sharing) || + ksm_read_sysfs(KSM_FP("merge_across_nodes"), &ksm_sysfs->merge_across_nodes) || + ksm_read_sysfs(KSM_FP("sleep_millisecs"), &ksm_sysfs->sleep_millisecs) || + ksm_read_sysfs(KSM_FP("pages_to_scan"), &ksm_sysfs->pages_to_scan) || + ksm_read_sysfs(KSM_FP("run"), &ksm_sysfs->run) || + ksm_read_sysfs(KSM_FP("stable_node_chains_prune_millisecs"), + &ksm_sysfs->stable_node_chains_prune_millisecs) || + ksm_read_sysfs(KSM_FP("use_zero_pages"), &ksm_sysfs->use_zero_pages)) + return 1; + + return 0; +} + +static int ksm_restore(struct ksm_sysfs *ksm_sysfs) +{ + if (ksm_write_sysfs(KSM_FP("max_page_sharing"), ksm_sysfs->max_page_sharing) || + ksm_write_sysfs(KSM_FP("merge_across_nodes"), ksm_sysfs->merge_across_nodes) || + ksm_write_sysfs(KSM_FP("pages_to_scan"), ksm_sysfs->pages_to_scan) || + ksm_write_sysfs(KSM_FP("run"), ksm_sysfs->run) || + ksm_write_sysfs(KSM_FP("sleep_millisecs"), ksm_sysfs->sleep_millisecs) || + ksm_write_sysfs(KSM_FP("stable_node_chains_prune_millisecs"), + ksm_sysfs->stable_node_chains_prune_millisecs) || + ksm_write_sysfs(KSM_FP("use_zero_pages"), ksm_sysfs->use_zero_pages)) + return 1; + + return 0; +} + +static int check_ksm_merge(int mapping, int prot, long page_count, int timeout, size_t page_size) +{ + void *map_ptr; + struct timespec start_time; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + return KSFT_FAIL; + } + + /* fill pages with the same data and merge them */ + map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count); + if (!map_ptr) + return KSFT_FAIL; + + if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout)) + goto err_out; + + /* verify that the right number of pages are merged */ + if (assert_ksm_pages_count(page_count)) { + printf("OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_PASS; + } + +err_out: + printf("Not OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_FAIL; +} + +int main(int argc, char *argv[]) +{ + int ret, opt; + int prot = 0; + int ksm_scan_limit_sec = KSM_SCAN_LIMIT_SEC_DEFAULT; + long page_count = KSM_PAGE_COUNT_DEFAULT; + size_t page_size = sysconf(_SC_PAGESIZE); + struct ksm_sysfs ksm_sysfs_old; + + while ((opt = getopt(argc, argv, "ha:p:l:")) != -1) { + switch (opt) { + case 'a': + prot = str_to_prot(optarg); + break; + case 'p': + page_count = atol(optarg); + if (page_count <= 0) { + printf("The number of pages must be greater than 0\n"); + return KSFT_FAIL; + } + break; + case 'l': + ksm_scan_limit_sec = atoi(optarg); + if (ksm_scan_limit_sec <= 0) { + printf("Timeout value must be greater than 0\n"); + return KSFT_FAIL; + } + break; + case 'h': + print_help(); + break; + default: + return KSFT_FAIL; + } + } + + if (prot == 0) + prot = str_to_prot(KSM_PROT_STR_DEFAULT); + + if (access(KSM_SYSFS_PATH, F_OK)) { + printf("Config KSM not enabled\n"); + return KSFT_SKIP; + } + + if (ksm_save_def(&ksm_sysfs_old)) { + printf("Cannot save default tunables\n"); + return KSFT_FAIL; + } + + if (ksm_write_sysfs(KSM_FP("run"), 2) || + ksm_write_sysfs(KSM_FP("sleep_millisecs"), 0) || + ksm_write_sysfs(KSM_FP("merge_across_nodes"), 1) || + ksm_write_sysfs(KSM_FP("pages_to_scan"), page_count)) + return KSFT_FAIL; + + ret = check_ksm_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count, ksm_scan_limit_sec, + page_size); + + if (ksm_restore(&ksm_sysfs_old)) { + printf("Cannot restore default tunables\n"); + return KSFT_FAIL; + } + + return ret; +} diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh index d09a6b71f1e9d..97b6f712134d7 100755 --- a/tools/testing/selftests/vm/run_vmtests.sh +++ b/tools/testing/selftests/vm/run_vmtests.sh @@ -377,6 +377,22 @@ else exitcode=1 fi +echo "-------------------------------------------------------" +echo "running KSM MADV_MERGEABLE test with 10 identical pages" +echo "-------------------------------------------------------" +./ksm_tests -p 10 +ret_val=$? + +if [ $ret_val -eq 0 ]; then + echo "[PASS]" +elif [ $ret_val -eq $ksft_skip ]; then + echo "[SKIP]" + exitcode=$ksft_skip +else + echo "[FAIL]" + exitcode=1 +fi + exit $exitcode exit $exitcode From a113a89432619f1963c85d183bfec7e949ea039a Mon Sep 17 00:00:00 2001 From: Zhansaya Bagdauletkyzy Date: Tue, 24 Aug 2021 09:59:38 +1000 Subject: [PATCH 229/365] selftests: vm: add KSM unmerge test Add check_ksm_unmerge() function to verify that KSM is properly unmerging shared pages. For this, two duplicate pages are merged first and then their contents are modified. Since they are not identical anymore, the pages must be unmerged and the number of merged pages has to be 0. The test is run as follows: ./ksm_tests -U Link: https://lkml.kernel.org/r/c0f55420440d704d5b094275b4365aa1b2ad46b5.1626252248.git.zhansayabagdaulet@gmail.com Signed-off-by: Zhansaya Bagdauletkyzy Reviewed-by: Pavel Tatashin Reviewed-by: Tyler Hicks Cc: Hugh Dickins Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- tools/testing/selftests/vm/ksm_tests.c | 72 +++++++++++++++++++++-- tools/testing/selftests/vm/run_vmtests.sh | 18 +++++- 2 files changed, 85 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/vm/ksm_tests.c b/tools/testing/selftests/vm/ksm_tests.c index d74d5aa34b167..80302bb8f64cb 100644 --- a/tools/testing/selftests/vm/ksm_tests.c +++ b/tools/testing/selftests/vm/ksm_tests.c @@ -23,6 +23,11 @@ struct ksm_sysfs { unsigned long use_zero_pages; }; +enum ksm_test_name { + CHECK_KSM_MERGE, + CHECK_KSM_UNMERGE +}; + static int ksm_write_sysfs(const char *file_path, unsigned long val) { FILE *f = fopen(file_path, "w"); @@ -75,7 +80,12 @@ static int str_to_prot(char *prot_str) static void print_help(void) { - printf("usage: ksm_tests [-h] [-a prot] [-p page_count] [-l timeout]\n"); + printf("usage: ksm_tests [-h] [-a prot] [-p page_count] [-l timeout]\n"); + + printf("Supported :\n" + " -M (page merging)\n" + " -U (page unmerging)\n\n"); + printf(" -a: specify the access protections of pages.\n" " must be of the form [rwx].\n" " Default: %s\n", KSM_PROT_STR_DEFAULT); @@ -239,6 +249,46 @@ static int check_ksm_merge(int mapping, int prot, long page_count, int timeout, return KSFT_FAIL; } +static int check_ksm_unmerge(int mapping, int prot, int timeout, size_t page_size) +{ + void *map_ptr; + struct timespec start_time; + int page_count = 2; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + return KSFT_FAIL; + } + + /* fill pages with the same data and merge them */ + map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count); + if (!map_ptr) + return KSFT_FAIL; + + if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout)) + goto err_out; + + /* change 1 byte in each of the 2 pages -- KSM must automatically unmerge them */ + memset(map_ptr, '-', 1); + memset(map_ptr + page_size, '+', 1); + + /* get at least 1 scan, so KSM can detect that the pages were modified */ + if (ksm_do_scan(1, start_time, timeout)) + goto err_out; + + /* check that unmerging was successful and 0 pages are currently merged */ + if (assert_ksm_pages_count(0)) { + printf("OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_PASS; + } + +err_out: + printf("Not OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_FAIL; +} + int main(int argc, char *argv[]) { int ret, opt; @@ -247,8 +297,9 @@ int main(int argc, char *argv[]) long page_count = KSM_PAGE_COUNT_DEFAULT; size_t page_size = sysconf(_SC_PAGESIZE); struct ksm_sysfs ksm_sysfs_old; + int test_name = CHECK_KSM_MERGE; - while ((opt = getopt(argc, argv, "ha:p:l:")) != -1) { + while ((opt = getopt(argc, argv, "ha:p:l:MU")) != -1) { switch (opt) { case 'a': prot = str_to_prot(optarg); @@ -270,6 +321,11 @@ int main(int argc, char *argv[]) case 'h': print_help(); break; + case 'M': + break; + case 'U': + test_name = CHECK_KSM_UNMERGE; + break; default: return KSFT_FAIL; } @@ -294,8 +350,16 @@ int main(int argc, char *argv[]) ksm_write_sysfs(KSM_FP("pages_to_scan"), page_count)) return KSFT_FAIL; - ret = check_ksm_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count, ksm_scan_limit_sec, - page_size); + switch (test_name) { + case CHECK_KSM_MERGE: + ret = check_ksm_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count, + ksm_scan_limit_sec, page_size); + break; + case CHECK_KSM_UNMERGE: + ret = check_ksm_unmerge(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, + page_size); + break; + } if (ksm_restore(&ksm_sysfs_old)) { printf("Cannot restore default tunables\n"); diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh index 97b6f712134d7..3a23c6b47da28 100755 --- a/tools/testing/selftests/vm/run_vmtests.sh +++ b/tools/testing/selftests/vm/run_vmtests.sh @@ -380,7 +380,23 @@ fi echo "-------------------------------------------------------" echo "running KSM MADV_MERGEABLE test with 10 identical pages" echo "-------------------------------------------------------" -./ksm_tests -p 10 +./ksm_tests -M -p 10 +ret_val=$? + +if [ $ret_val -eq 0 ]; then + echo "[PASS]" +elif [ $ret_val -eq $ksft_skip ]; then + echo "[SKIP]" + exitcode=$ksft_skip +else + echo "[FAIL]" + exitcode=1 +fi + +echo "------------------------" +echo "running KSM unmerge test" +echo "------------------------" +./ksm_tests -U ret_val=$? if [ $ret_val -eq 0 ]; then From 3f579f1a3f864a53af841efcfc21c4a46c741471 Mon Sep 17 00:00:00 2001 From: Zhansaya Bagdauletkyzy Date: Tue, 24 Aug 2021 09:59:38 +1000 Subject: [PATCH 230/365] selftests: vm: add KSM zero page merging test Add check_ksm_zero_page_merge() function to test that empty pages are being handled properly. For this, several zero pages are allocated and merged using madvise. If use_zero_pages is enabled, the pages must be shared with the special kernel zero pages; otherwise, they are merged as usual duplicate pages. The test is run as follows: ./ksm_tests -Z Link: https://lkml.kernel.org/r/6d0caab00d4bdccf5e3791cb95cf6dfd5eb85e45.1626252248.git.zhansayabagdaulet@gmail.com Signed-off-by: Zhansaya Bagdauletkyzy Reviewed-by: Pavel Tatashin Reviewed-by: Tyler Hicks Cc: Hugh Dickins Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- tools/testing/selftests/vm/ksm_tests.c | 70 ++++++++++++++++++++++- tools/testing/selftests/vm/run_vmtests.sh | 32 +++++++++++ 2 files changed, 99 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/vm/ksm_tests.c b/tools/testing/selftests/vm/ksm_tests.c index 80302bb8f64cb..5843526471e13 100644 --- a/tools/testing/selftests/vm/ksm_tests.c +++ b/tools/testing/selftests/vm/ksm_tests.c @@ -12,6 +12,7 @@ #define KSM_SCAN_LIMIT_SEC_DEFAULT 120 #define KSM_PAGE_COUNT_DEFAULT 10l #define KSM_PROT_STR_DEFAULT "rw" +#define KSM_USE_ZERO_PAGES_DEFAULT false struct ksm_sysfs { unsigned long max_page_sharing; @@ -25,7 +26,8 @@ struct ksm_sysfs { enum ksm_test_name { CHECK_KSM_MERGE, - CHECK_KSM_UNMERGE + CHECK_KSM_UNMERGE, + CHECK_KSM_ZERO_PAGE_MERGE }; static int ksm_write_sysfs(const char *file_path, unsigned long val) @@ -80,10 +82,12 @@ static int str_to_prot(char *prot_str) static void print_help(void) { - printf("usage: ksm_tests [-h] [-a prot] [-p page_count] [-l timeout]\n"); + printf("usage: ksm_tests [-h] [-a prot] [-p page_count] [-l timeout]\n" + "[-z use_zero_pages]\n"); printf("Supported :\n" " -M (page merging)\n" + " -Z (zero pages merging)\n" " -U (page unmerging)\n\n"); printf(" -a: specify the access protections of pages.\n" @@ -93,6 +97,8 @@ static void print_help(void) " Default: %ld\n", KSM_PAGE_COUNT_DEFAULT); printf(" -l: limit the maximum running time (in seconds) for a test.\n" " Default: %d seconds\n", KSM_SCAN_LIMIT_SEC_DEFAULT); + printf(" -z: change use_zero_pages tunable\n" + " Default: %d\n", KSM_USE_ZERO_PAGES_DEFAULT); exit(0); } @@ -289,6 +295,50 @@ static int check_ksm_unmerge(int mapping, int prot, int timeout, size_t page_siz return KSFT_FAIL; } +static int check_ksm_zero_page_merge(int mapping, int prot, long page_count, int timeout, + bool use_zero_pages, size_t page_size) +{ + void *map_ptr; + struct timespec start_time; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + return KSFT_FAIL; + } + + if (ksm_write_sysfs(KSM_FP("use_zero_pages"), use_zero_pages)) + return KSFT_FAIL; + + /* fill pages with zero and try to merge them */ + map_ptr = allocate_memory(NULL, prot, mapping, 0, page_size * page_count); + if (!map_ptr) + return KSFT_FAIL; + + if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout)) + goto err_out; + + /* + * verify that the right number of pages are merged: + * 1) if use_zero_pages is set to 1, empty pages are merged + * with the kernel zero page instead of with each other; + * 2) if use_zero_pages is set to 0, empty pages are not treated specially + * and merged as usual. + */ + if (use_zero_pages && !assert_ksm_pages_count(0)) + goto err_out; + else if (!use_zero_pages && !assert_ksm_pages_count(page_count)) + goto err_out; + + printf("OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_PASS; + +err_out: + printf("Not OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_FAIL; +} + int main(int argc, char *argv[]) { int ret, opt; @@ -298,8 +348,9 @@ int main(int argc, char *argv[]) size_t page_size = sysconf(_SC_PAGESIZE); struct ksm_sysfs ksm_sysfs_old; int test_name = CHECK_KSM_MERGE; + bool use_zero_pages = KSM_USE_ZERO_PAGES_DEFAULT; - while ((opt = getopt(argc, argv, "ha:p:l:MU")) != -1) { + while ((opt = getopt(argc, argv, "ha:p:l:z:MUZ")) != -1) { switch (opt) { case 'a': prot = str_to_prot(optarg); @@ -321,11 +372,20 @@ int main(int argc, char *argv[]) case 'h': print_help(); break; + case 'z': + if (strcmp(optarg, "0") == 0) + use_zero_pages = 0; + else + use_zero_pages = 1; + break; case 'M': break; case 'U': test_name = CHECK_KSM_UNMERGE; break; + case 'Z': + test_name = CHECK_KSM_ZERO_PAGE_MERGE; + break; default: return KSFT_FAIL; } @@ -359,6 +419,10 @@ int main(int argc, char *argv[]) ret = check_ksm_unmerge(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, page_size); break; + case CHECK_KSM_ZERO_PAGE_MERGE: + ret = check_ksm_zero_page_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count, + ksm_scan_limit_sec, use_zero_pages, page_size); + break; } if (ksm_restore(&ksm_sysfs_old)) { diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh index 3a23c6b47da28..9b4e444fc4ed0 100755 --- a/tools/testing/selftests/vm/run_vmtests.sh +++ b/tools/testing/selftests/vm/run_vmtests.sh @@ -409,6 +409,38 @@ else exitcode=1 fi +echo "----------------------------------------------------------" +echo "running KSM test with 10 zero pages and use_zero_pages = 0" +echo "----------------------------------------------------------" +./ksm_tests -Z -p 10 -z 0 +ret_val=$? + +if [ $ret_val -eq 0 ]; then + echo "[PASS]" +elif [ $ret_val -eq $ksft_skip ]; then + echo "[SKIP]" + exitcode=$ksft_skip +else + echo "[FAIL]" + exitcode=1 +fi + +echo "----------------------------------------------------------" +echo "running KSM test with 10 zero pages and use_zero_pages = 1" +echo "----------------------------------------------------------" +./ksm_tests -Z -p 10 -z 1 +ret_val=$? + +if [ $ret_val -eq 0 ]; then + echo "[PASS]" +elif [ $ret_val -eq $ksft_skip ]; then + echo "[SKIP]" + exitcode=$ksft_skip +else + echo "[FAIL]" + exitcode=1 +fi + exit $exitcode exit $exitcode From 4081d843008614f068f76810a79fb94504ef0f3d Mon Sep 17 00:00:00 2001 From: Zhansaya Bagdauletkyzy Date: Tue, 24 Aug 2021 09:59:38 +1000 Subject: [PATCH 231/365] selftests: vm: add KSM merging across nodes test Add check_ksm_numa_merge() function to test that pages in different NUMA nodes are being handled properly. First, two duplicate pages are allocated in two separate NUMA nodes using the libnuma library. Since there is one unique page in each node, with merge_across_nodes = 0, there won't be any shared pages. If merge_across_nodes is set to 1, the pages will be treated as usual duplicate pages and will be merged. If NUMA config is not enabled or the number of NUMA nodes is less than two, then the test is skipped. The test is run as follows: ./ksm_tests -N Link: https://lkml.kernel.org/r/071c17b5b04ebb0dfeba137acc495e5dd9d2a719.1626252248.git.zhansayabagdaulet@gmail.com Signed-off-by: Zhansaya Bagdauletkyzy Reviewed-by: Pavel Tatashin Reviewed-by: Tyler Hicks Cc: Hugh Dickins Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- tools/testing/selftests/vm/Makefile | 2 + tools/testing/selftests/vm/ksm_tests.c | 88 ++++++++++++++++++++++- tools/testing/selftests/vm/run_vmtests.sh | 32 +++++++++ 3 files changed, 119 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index e6f22a801b71a..d9605bd10f2de 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -146,6 +146,8 @@ $(OUTPUT)/hmm-tests: local_config.h # HMM_EXTRA_LIBS may get set in local_config.mk, or it may be left empty. $(OUTPUT)/hmm-tests: LDLIBS += $(HMM_EXTRA_LIBS) +$(OUTPUT)/ksm_tests: LDLIBS += -lnuma + local_config.mk local_config.h: check_config.sh /bin/sh ./check_config.sh $(CC) diff --git a/tools/testing/selftests/vm/ksm_tests.c b/tools/testing/selftests/vm/ksm_tests.c index 5843526471e13..cdeb4a028538d 100644 --- a/tools/testing/selftests/vm/ksm_tests.c +++ b/tools/testing/selftests/vm/ksm_tests.c @@ -4,6 +4,7 @@ #include #include #include +#include #include "../kselftest.h" @@ -13,6 +14,7 @@ #define KSM_PAGE_COUNT_DEFAULT 10l #define KSM_PROT_STR_DEFAULT "rw" #define KSM_USE_ZERO_PAGES_DEFAULT false +#define KSM_MERGE_ACROSS_NODES_DEFAULT true struct ksm_sysfs { unsigned long max_page_sharing; @@ -27,7 +29,8 @@ struct ksm_sysfs { enum ksm_test_name { CHECK_KSM_MERGE, CHECK_KSM_UNMERGE, - CHECK_KSM_ZERO_PAGE_MERGE + CHECK_KSM_ZERO_PAGE_MERGE, + CHECK_KSM_NUMA_MERGE }; static int ksm_write_sysfs(const char *file_path, unsigned long val) @@ -83,11 +86,12 @@ static int str_to_prot(char *prot_str) static void print_help(void) { printf("usage: ksm_tests [-h] [-a prot] [-p page_count] [-l timeout]\n" - "[-z use_zero_pages]\n"); + "[-z use_zero_pages] [-m merge_across_nodes]\n"); printf("Supported :\n" " -M (page merging)\n" " -Z (zero pages merging)\n" + " -N (merging of pages in different NUMA nodes)\n" " -U (page unmerging)\n\n"); printf(" -a: specify the access protections of pages.\n" @@ -99,6 +103,8 @@ static void print_help(void) " Default: %d seconds\n", KSM_SCAN_LIMIT_SEC_DEFAULT); printf(" -z: change use_zero_pages tunable\n" " Default: %d\n", KSM_USE_ZERO_PAGES_DEFAULT); + printf(" -m: change merge_across_nodes tunable\n" + " Default: %d\n", KSM_MERGE_ACROSS_NODES_DEFAULT); exit(0); } @@ -339,6 +345,68 @@ static int check_ksm_zero_page_merge(int mapping, int prot, long page_count, int return KSFT_FAIL; } +static int check_ksm_numa_merge(int mapping, int prot, int timeout, bool merge_across_nodes, + size_t page_size) +{ + void *numa1_map_ptr, *numa2_map_ptr; + struct timespec start_time; + int page_count = 2; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + return KSFT_FAIL; + } + + if (numa_available() < 0) { + perror("NUMA support not enabled"); + return KSFT_SKIP; + } + if (numa_max_node() < 1) { + printf("At least 2 NUMA nodes must be available\n"); + return KSFT_SKIP; + } + if (ksm_write_sysfs(KSM_FP("merge_across_nodes"), merge_across_nodes)) + return KSFT_FAIL; + + /* allocate 2 pages in 2 different NUMA nodes and fill them with the same data */ + numa1_map_ptr = numa_alloc_onnode(page_size, 0); + numa2_map_ptr = numa_alloc_onnode(page_size, 1); + if (!numa1_map_ptr || !numa2_map_ptr) { + perror("numa_alloc_onnode"); + return KSFT_FAIL; + } + + memset(numa1_map_ptr, '*', page_size); + memset(numa2_map_ptr, '*', page_size); + + /* try to merge the pages */ + if (ksm_merge_pages(numa1_map_ptr, page_size, start_time, timeout) || + ksm_merge_pages(numa2_map_ptr, page_size, start_time, timeout)) + goto err_out; + + /* + * verify that the right number of pages are merged: + * 1) if merge_across_nodes was enabled, 2 duplicate pages will be merged; + * 2) if merge_across_nodes = 0, there must be 0 merged pages, since there is + * only 1 unique page in each node and they can't be shared. + */ + if (merge_across_nodes && !assert_ksm_pages_count(page_count)) + goto err_out; + else if (!merge_across_nodes && !assert_ksm_pages_count(0)) + goto err_out; + + numa_free(numa1_map_ptr, page_size); + numa_free(numa2_map_ptr, page_size); + printf("OK\n"); + return KSFT_PASS; + +err_out: + numa_free(numa1_map_ptr, page_size); + numa_free(numa2_map_ptr, page_size); + printf("Not OK\n"); + return KSFT_FAIL; +} + int main(int argc, char *argv[]) { int ret, opt; @@ -349,8 +417,9 @@ int main(int argc, char *argv[]) struct ksm_sysfs ksm_sysfs_old; int test_name = CHECK_KSM_MERGE; bool use_zero_pages = KSM_USE_ZERO_PAGES_DEFAULT; + bool merge_across_nodes = KSM_MERGE_ACROSS_NODES_DEFAULT; - while ((opt = getopt(argc, argv, "ha:p:l:z:MUZ")) != -1) { + while ((opt = getopt(argc, argv, "ha:p:l:z:m:MUZN")) != -1) { switch (opt) { case 'a': prot = str_to_prot(optarg); @@ -378,6 +447,12 @@ int main(int argc, char *argv[]) else use_zero_pages = 1; break; + case 'm': + if (strcmp(optarg, "0") == 0) + merge_across_nodes = 0; + else + merge_across_nodes = 1; + break; case 'M': break; case 'U': @@ -386,6 +461,9 @@ int main(int argc, char *argv[]) case 'Z': test_name = CHECK_KSM_ZERO_PAGE_MERGE; break; + case 'N': + test_name = CHECK_KSM_NUMA_MERGE; + break; default: return KSFT_FAIL; } @@ -423,6 +501,10 @@ int main(int argc, char *argv[]) ret = check_ksm_zero_page_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count, ksm_scan_limit_sec, use_zero_pages, page_size); break; + case CHECK_KSM_NUMA_MERGE: + ret = check_ksm_numa_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, + merge_across_nodes, page_size); + break; } if (ksm_restore(&ksm_sysfs_old)) { diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh index 9b4e444fc4ed0..45e803af7c775 100755 --- a/tools/testing/selftests/vm/run_vmtests.sh +++ b/tools/testing/selftests/vm/run_vmtests.sh @@ -441,6 +441,38 @@ else exitcode=1 fi +echo "-------------------------------------------------------------" +echo "running KSM test with 2 NUMA nodes and merge_across_nodes = 1" +echo "-------------------------------------------------------------" +./ksm_tests -N -m 1 +ret_val=$? + +if [ $ret_val -eq 0 ]; then + echo "[PASS]" +elif [ $ret_val -eq $ksft_skip ]; then + echo "[SKIP]" + exitcode=$ksft_skip +else + echo "[FAIL]" + exitcode=1 +fi + +echo "-------------------------------------------------------------" +echo "running KSM test with 2 NUMA nodes and merge_across_nodes = 0" +echo "-------------------------------------------------------------" +./ksm_tests -N -m 0 +ret_val=$? + +if [ $ret_val -eq 0 ]; then + echo "[PASS]" +elif [ $ret_val -eq $ksft_skip ]; then + echo "[SKIP]" + exitcode=$ksft_skip +else + echo "[FAIL]" + exitcode=1 +fi + exit $exitcode exit $exitcode From 1cf855e958722a25ece64afe453122905b6d5d6a Mon Sep 17 00:00:00 2001 From: Zhansaya Bagdauletkyzy Date: Tue, 24 Aug 2021 09:59:38 +1000 Subject: [PATCH 232/365] mm: KSM: fix data type ksm_stable_node_chains_prune_millisecs is declared as int, but in stable__node_chains_prune_millisecs_store(), it can store values up to UINT_MAX. Change its type to unsigned int. Link: https://lkml.kernel.org/r/20210806111351.GA71845@asus Signed-off-by: Zhansaya Bagdauletkyzy Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/ksm.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index 3fa9bc8a67cf6..025338128cd92 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -259,7 +259,7 @@ static unsigned long ksm_stable_node_chains; static unsigned long ksm_stable_node_dups; /* Delay in pruning stale stable_node_dups in the stable_node_chains */ -static int ksm_stable_node_chains_prune_millisecs = 2000; +static unsigned int ksm_stable_node_chains_prune_millisecs = 2000; /* Maximum number of page slots sharing a stable node */ static int ksm_max_page_sharing = 256; @@ -3105,11 +3105,11 @@ stable_node_chains_prune_millisecs_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - unsigned long msecs; + unsigned int msecs; int err; - err = kstrtoul(buf, 10, &msecs); - if (err || msecs > UINT_MAX) + err = kstrtouint(buf, 10, &msecs); + if (err) return -EINVAL; ksm_stable_node_chains_prune_millisecs = msecs; From 347612c5267f3763a55a3e792702ac364e3303ad Mon Sep 17 00:00:00 2001 From: Zhansaya Bagdauletkyzy Date: Tue, 24 Aug 2021 09:59:38 +1000 Subject: [PATCH 233/365] selftests: vm: add KSM merging time test Patch series "add KSM performance tests", v3. Extend KSM self tests with a performance benchmark. These tests are not part of regular regression testing, as they are mainly intended to be used by developers making changes to the memory management subsystem. This patch (of 2): Add ksm_merge_time() function to determine speed and time needed for merging. The total spent time is shown in seconds while speed is in MiB/s. User must specify the size of duplicated memory area (in MiB) before running the test. The test is run as follows: ./ksm_tests -P -s 100 The output: Total size: 100 MiB Total time: 0.201106786 s Average speed: 497.248 MiB/s Link: https://lkml.kernel.org/r/cover.1629386192.git.zhansayabagdaulet@gmail.com Link: https://lkml.kernel.org/r/318b946ac80cc9205c89d0962048378f7ce0705b.1629386192.git.zhansayabagdaulet@gmail.com Signed-off-by: Zhansaya Bagdauletkyzy Reviewed-by: Tyler Hicks Cc: Pavel Tatashin Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- tools/testing/selftests/vm/ksm_tests.c | 74 ++++++++++++++++++++++++-- 1 file changed, 70 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/vm/ksm_tests.c b/tools/testing/selftests/vm/ksm_tests.c index cdeb4a028538d..432dfe615e506 100644 --- a/tools/testing/selftests/vm/ksm_tests.c +++ b/tools/testing/selftests/vm/ksm_tests.c @@ -7,6 +7,7 @@ #include #include "../kselftest.h" +#include "../../../../include/vdso/time64.h" #define KSM_SYSFS_PATH "/sys/kernel/mm/ksm/" #define KSM_FP(s) (KSM_SYSFS_PATH s) @@ -15,6 +16,7 @@ #define KSM_PROT_STR_DEFAULT "rw" #define KSM_USE_ZERO_PAGES_DEFAULT false #define KSM_MERGE_ACROSS_NODES_DEFAULT true +#define MB (1ul << 20) struct ksm_sysfs { unsigned long max_page_sharing; @@ -30,7 +32,8 @@ enum ksm_test_name { CHECK_KSM_MERGE, CHECK_KSM_UNMERGE, CHECK_KSM_ZERO_PAGE_MERGE, - CHECK_KSM_NUMA_MERGE + CHECK_KSM_NUMA_MERGE, + KSM_MERGE_TIME }; static int ksm_write_sysfs(const char *file_path, unsigned long val) @@ -86,13 +89,16 @@ static int str_to_prot(char *prot_str) static void print_help(void) { printf("usage: ksm_tests [-h] [-a prot] [-p page_count] [-l timeout]\n" - "[-z use_zero_pages] [-m merge_across_nodes]\n"); + "[-z use_zero_pages] [-m merge_across_nodes] [-s size]\n"); printf("Supported :\n" " -M (page merging)\n" " -Z (zero pages merging)\n" " -N (merging of pages in different NUMA nodes)\n" - " -U (page unmerging)\n\n"); + " -U (page unmerging)\n" + " -P evaluate merging time and speed.\n" + " For this test, the size of duplicated memory area (in MiB)\n" + " must be provided using -s option\n\n"); printf(" -a: specify the access protections of pages.\n" " must be of the form [rwx].\n" @@ -105,6 +111,7 @@ static void print_help(void) " Default: %d\n", KSM_USE_ZERO_PAGES_DEFAULT); printf(" -m: change merge_across_nodes tunable\n" " Default: %d\n", KSM_MERGE_ACROSS_NODES_DEFAULT); + printf(" -s: the size of duplicated memory area (in MiB)\n"); exit(0); } @@ -407,6 +414,47 @@ static int check_ksm_numa_merge(int mapping, int prot, int timeout, bool merge_a return KSFT_FAIL; } +static int ksm_merge_time(int mapping, int prot, int timeout, size_t map_size) +{ + void *map_ptr; + struct timespec start_time, end_time; + unsigned long scan_time_ns; + + map_size *= MB; + + map_ptr = allocate_memory(NULL, prot, mapping, '*', map_size); + if (!map_ptr) + return KSFT_FAIL; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + goto err_out; + } + if (ksm_merge_pages(map_ptr, map_size, start_time, timeout)) + goto err_out; + if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { + perror("clock_gettime"); + goto err_out; + } + + scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + + (end_time.tv_nsec - start_time.tv_nsec); + + printf("Total size: %lu MiB\n", map_size / MB); + printf("Total time: %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC, + scan_time_ns % NSEC_PER_SEC); + printf("Average speed: %.3f MiB/s\n", (map_size / MB) / + ((double)scan_time_ns / NSEC_PER_SEC)); + + munmap(map_ptr, map_size); + return KSFT_PASS; + +err_out: + printf("Not OK\n"); + munmap(map_ptr, map_size); + return KSFT_FAIL; +} + int main(int argc, char *argv[]) { int ret, opt; @@ -418,8 +466,9 @@ int main(int argc, char *argv[]) int test_name = CHECK_KSM_MERGE; bool use_zero_pages = KSM_USE_ZERO_PAGES_DEFAULT; bool merge_across_nodes = KSM_MERGE_ACROSS_NODES_DEFAULT; + long size_MB = 0; - while ((opt = getopt(argc, argv, "ha:p:l:z:m:MUZN")) != -1) { + while ((opt = getopt(argc, argv, "ha:p:l:z:m:s:MUZNP")) != -1) { switch (opt) { case 'a': prot = str_to_prot(optarg); @@ -453,6 +502,12 @@ int main(int argc, char *argv[]) else merge_across_nodes = 1; break; + case 's': + size_MB = atoi(optarg); + if (size_MB <= 0) { + printf("Size must be greater than 0\n"); + return KSFT_FAIL; + } case 'M': break; case 'U': @@ -464,6 +519,9 @@ int main(int argc, char *argv[]) case 'N': test_name = CHECK_KSM_NUMA_MERGE; break; + case 'P': + test_name = KSM_MERGE_TIME; + break; default: return KSFT_FAIL; } @@ -505,6 +563,14 @@ int main(int argc, char *argv[]) ret = check_ksm_numa_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, merge_across_nodes, page_size); break; + case KSM_MERGE_TIME: + if (size_MB == 0) { + printf("Option '-s' is required.\n"); + return KSFT_FAIL; + } + ret = ksm_merge_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, + size_MB); + break; } if (ksm_restore(&ksm_sysfs_old)) { From 8767db7ea52cabdf34c5765c8e923be567f6340e Mon Sep 17 00:00:00 2001 From: Zhansaya Bagdauletkyzy Date: Tue, 24 Aug 2021 09:59:38 +1000 Subject: [PATCH 234/365] selftests: vm: add COW time test for KSM pages Since merged pages are copied every time they need to be modified, the write access time is different between shared and non-shared pages. Add ksm_cow_time() function which evaluates latency of these COW breaks. First, 4000 pages are allocated and the time, required to modify 1 byte in every other page, is measured. After this, the pages are merged into 2000 pairs and in each pair, 1 page is modified (i.e. they are decoupled) to detect COW breaks. The time needed to break COW of merged pages is then compared with performance of non-shared pages. The test is run as follows: ./ksm_tests -C The output: Total size: 15 MiB Not merged pages: Total time: 0.002185489 s Average speed: 3202.945 MiB/s Merged pages: Total time: 0.004386872 s Average speed: 1595.670 MiB/s Link: https://lkml.kernel.org/r/1d03ee0d1b341959d4b61672c6401d498bff5652.1629386192.git.zhansayabagdaulet@gmail.com Signed-off-by: Zhansaya Bagdauletkyzy Reviewed-by: Tyler Hicks Cc: Pavel Tatashin Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- tools/testing/selftests/vm/ksm_tests.c | 86 +++++++++++++++++++++++++- 1 file changed, 83 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/vm/ksm_tests.c b/tools/testing/selftests/vm/ksm_tests.c index 432dfe615e506..b61dcdb44c5be 100644 --- a/tools/testing/selftests/vm/ksm_tests.c +++ b/tools/testing/selftests/vm/ksm_tests.c @@ -33,7 +33,8 @@ enum ksm_test_name { CHECK_KSM_UNMERGE, CHECK_KSM_ZERO_PAGE_MERGE, CHECK_KSM_NUMA_MERGE, - KSM_MERGE_TIME + KSM_MERGE_TIME, + KSM_COW_TIME }; static int ksm_write_sysfs(const char *file_path, unsigned long val) @@ -98,7 +99,8 @@ static void print_help(void) " -U (page unmerging)\n" " -P evaluate merging time and speed.\n" " For this test, the size of duplicated memory area (in MiB)\n" - " must be provided using -s option\n\n"); + " must be provided using -s option\n" + " -C evaluate the time required to break COW of merged pages.\n\n"); printf(" -a: specify the access protections of pages.\n" " must be of the form [rwx].\n" @@ -455,6 +457,77 @@ static int ksm_merge_time(int mapping, int prot, int timeout, size_t map_size) return KSFT_FAIL; } +static int ksm_cow_time(int mapping, int prot, int timeout, size_t page_size) +{ + void *map_ptr; + struct timespec start_time, end_time; + unsigned long cow_time_ns; + + /* page_count must be less than 2*page_size */ + size_t page_count = 4000; + + map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count); + if (!map_ptr) + return KSFT_FAIL; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + return KSFT_FAIL; + } + for (size_t i = 0; i < page_count - 1; i = i + 2) + memset(map_ptr + page_size * i, '-', 1); + if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { + perror("clock_gettime"); + return KSFT_FAIL; + } + + cow_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + + (end_time.tv_nsec - start_time.tv_nsec); + + printf("Total size: %lu MiB\n\n", (page_size * page_count) / MB); + printf("Not merged pages:\n"); + printf("Total time: %ld.%09ld s\n", cow_time_ns / NSEC_PER_SEC, + cow_time_ns % NSEC_PER_SEC); + printf("Average speed: %.3f MiB/s\n\n", ((page_size * (page_count / 2)) / MB) / + ((double)cow_time_ns / NSEC_PER_SEC)); + + /* Create 2000 pairs of duplicate pages */ + for (size_t i = 0; i < page_count - 1; i = i + 2) { + memset(map_ptr + page_size * i, '+', i / 2 + 1); + memset(map_ptr + page_size * (i + 1), '+', i / 2 + 1); + } + if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout)) + goto err_out; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + goto err_out; + } + for (size_t i = 0; i < page_count - 1; i = i + 2) + memset(map_ptr + page_size * i, '-', 1); + if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { + perror("clock_gettime"); + goto err_out; + } + + cow_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + + (end_time.tv_nsec - start_time.tv_nsec); + + printf("Merged pages:\n"); + printf("Total time: %ld.%09ld s\n", cow_time_ns / NSEC_PER_SEC, + cow_time_ns % NSEC_PER_SEC); + printf("Average speed: %.3f MiB/s\n", ((page_size * (page_count / 2)) / MB) / + ((double)cow_time_ns / NSEC_PER_SEC)); + + munmap(map_ptr, page_size * page_count); + return KSFT_PASS; + +err_out: + printf("Not OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_FAIL; +} + int main(int argc, char *argv[]) { int ret, opt; @@ -468,7 +541,7 @@ int main(int argc, char *argv[]) bool merge_across_nodes = KSM_MERGE_ACROSS_NODES_DEFAULT; long size_MB = 0; - while ((opt = getopt(argc, argv, "ha:p:l:z:m:s:MUZNP")) != -1) { + while ((opt = getopt(argc, argv, "ha:p:l:z:m:s:MUZNPC")) != -1) { switch (opt) { case 'a': prot = str_to_prot(optarg); @@ -522,6 +595,9 @@ int main(int argc, char *argv[]) case 'P': test_name = KSM_MERGE_TIME; break; + case 'C': + test_name = KSM_COW_TIME; + break; default: return KSFT_FAIL; } @@ -571,6 +647,10 @@ int main(int argc, char *argv[]) ret = ksm_merge_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, size_MB); break; + case KSM_COW_TIME: + ret = ksm_cow_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, + page_size); + break; } if (ksm_restore(&ksm_sysfs_old)) { From 52bbb62ee48fe2a27b0412ccab9eb32817dd865c Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 24 Aug 2021 09:59:39 +1000 Subject: [PATCH 235/365] mm/vmstat: correct some wrong comments Patch series "Cleanup for vmstat". This series contains cleanups to remove unneeded return value, correct wrong comment and simplify the array size calculation. More details can be found in the respective changelogs. This patch (of 3): Correct wrong fls(mem+1) to fls(mem)+1 and remove the duplicated comment with quiet_vmstat(). Link: https://lkml.kernel.org/r/20210715122911.15700-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20210715122911.15700-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/vmstat.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index ec5a2e789dd2e..5bd621eadc482 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -204,7 +204,7 @@ int calculate_normal_threshold(struct zone *zone) * * Some sample thresholds: * - * Threshold Processors (fls) Zonesize fls(mem+1) + * Threshold Processors (fls) Zonesize fls(mem)+1 * ------------------------------------------------------------------ * 8 1 1 0.9-1 GB 4 * 16 2 2 0.9-1 GB 4 @@ -1875,11 +1875,6 @@ static void vmstat_update(struct work_struct *w) } } -/* - * Switch off vmstat processing and then fold all the remaining differentials - * until the diffs stay at zero. The function is used by NOHZ and can only be - * invoked when tick processing is not active. - */ /* * Check if the diffs for a certain cpu indicate that * an update is needed. From 251f8851237cbf3bca20be17f69076d09da3f30b Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 24 Aug 2021 09:59:39 +1000 Subject: [PATCH 236/365] mm/vmstat: simplify the array size calculation We can replace the array_num * sizeof(array[0]) with sizeof(array) to simplify the code. Link: https://lkml.kernel.org/r/20210715122911.15700-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/vmstat.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 5bd621eadc482..c4634dc83916b 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1891,17 +1891,15 @@ static bool need_update(int cpu) /* * The fast way of checking if there are any vmstat diffs. */ - if (memchr_inv(pzstats->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS * - sizeof(pzstats->vm_stat_diff[0]))) + if (memchr_inv(pzstats->vm_stat_diff, 0, sizeof(pzstats->vm_stat_diff))) return true; if (last_pgdat == zone->zone_pgdat) continue; last_pgdat = zone->zone_pgdat; n = per_cpu_ptr(zone->zone_pgdat->per_cpu_nodestats, cpu); - if (memchr_inv(n->vm_node_stat_diff, 0, NR_VM_NODE_STAT_ITEMS * - sizeof(n->vm_node_stat_diff[0]))) - return true; + if (memchr_inv(n->vm_node_stat_diff, 0, sizeof(n->vm_node_stat_diff))) + return true; } return false; } From d01147d18f8e658b17cd6d5fab851c2157e258c1 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 24 Aug 2021 09:59:39 +1000 Subject: [PATCH 237/365] mm/vmstat: remove unneeded return value The return value of pagetypeinfo_showfree and pagetypeinfo_showblockcount are unused now. Remove them. Link: https://lkml.kernel.org/r/20210715122911.15700-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/vmstat.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index c4634dc83916b..13ff25d0d96a4 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1454,7 +1454,7 @@ static void pagetypeinfo_showfree_print(struct seq_file *m, } /* Print out the free pages at each order for each migatetype */ -static int pagetypeinfo_showfree(struct seq_file *m, void *arg) +static void pagetypeinfo_showfree(struct seq_file *m, void *arg) { int order; pg_data_t *pgdat = (pg_data_t *)arg; @@ -1466,8 +1466,6 @@ static int pagetypeinfo_showfree(struct seq_file *m, void *arg) seq_putc(m, '\n'); walk_zones_in_node(m, pgdat, true, false, pagetypeinfo_showfree_print); - - return 0; } static void pagetypeinfo_showblockcount_print(struct seq_file *m, @@ -1503,7 +1501,7 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m, } /* Print out the number of pageblocks for each migratetype */ -static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg) +static void pagetypeinfo_showblockcount(struct seq_file *m, void *arg) { int mtype; pg_data_t *pgdat = (pg_data_t *)arg; @@ -1514,8 +1512,6 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg) seq_putc(m, '\n'); walk_zones_in_node(m, pgdat, true, false, pagetypeinfo_showblockcount_print); - - return 0; } /* From 15fd61a615d038c1546f5b590fbdd49a9812b5c9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 24 Aug 2021 09:59:39 +1000 Subject: [PATCH 238/365] mm/vmstat: protect per cpu variables with preempt disable on RT Disable preemption on -RT for the vmstat code. On vanila the code runs in IRQ-off regions while on -RT it may not when stats are updated under a local_lock. "preempt_disable" ensures that the same resources is not updated in parallel due to preemption. This patch differs from the preempt-rt version where __count_vm_event and __count_vm_events are also protected. The counters are explicitly "allowed to be to be racy" so there is no need to protect them from preemption. Only the accurate page stats that are updated by a read-modify-write need protection. This patch also differs in that a preempt_[en|dis]able_rt helper is not used. As vmstat is the only user of the helper, it was suggested that it be open-coded in vmstat.c instead of risking the helper being used in unnecessary contexts. Link: https://lkml.kernel.org/r/20210805160019.1137-2-mgorman@techsingularity.net Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/vmstat.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/mm/vmstat.c b/mm/vmstat.c index 13ff25d0d96a4..3b6a69496f481 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -319,6 +319,16 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, long x; long t; + /* + * Accurate vmstat updates require a RMW. On !PREEMPT_RT kernels, + * atomicity is provided by IRQs being disabled -- either explicitly + * or via local_lock_irq. On PREEMPT_RT, local_lock_irq only disables + * CPU migrations and preemption potentially corrupts a counter so + * disable preemption. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); + x = delta + __this_cpu_read(*p); t = __this_cpu_read(pcp->stat_threshold); @@ -328,6 +338,9 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, x = 0; } __this_cpu_write(*p, x); + + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); } EXPORT_SYMBOL(__mod_zone_page_state); @@ -350,6 +363,10 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, delta >>= PAGE_SHIFT; } + /* See __mod_node_page_state */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); + x = delta + __this_cpu_read(*p); t = __this_cpu_read(pcp->stat_threshold); @@ -359,6 +376,9 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, x = 0; } __this_cpu_write(*p, x); + + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); } EXPORT_SYMBOL(__mod_node_page_state); @@ -391,6 +411,10 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) s8 __percpu *p = pcp->vm_stat_diff + item; s8 v, t; + /* See __mod_node_page_state */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); + v = __this_cpu_inc_return(*p); t = __this_cpu_read(pcp->stat_threshold); if (unlikely(v > t)) { @@ -399,6 +423,9 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) zone_page_state_add(v + overstep, zone, item); __this_cpu_write(*p, -overstep); } + + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); } void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) @@ -409,6 +436,10 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); + /* See __mod_node_page_state */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); + v = __this_cpu_inc_return(*p); t = __this_cpu_read(pcp->stat_threshold); if (unlikely(v > t)) { @@ -417,6 +448,9 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) node_page_state_add(v + overstep, pgdat, item); __this_cpu_write(*p, -overstep); } + + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); } void __inc_zone_page_state(struct page *page, enum zone_stat_item item) @@ -437,6 +471,10 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item) s8 __percpu *p = pcp->vm_stat_diff + item; s8 v, t; + /* See __mod_node_page_state */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); + v = __this_cpu_dec_return(*p); t = __this_cpu_read(pcp->stat_threshold); if (unlikely(v < - t)) { @@ -445,6 +483,9 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item) zone_page_state_add(v - overstep, zone, item); __this_cpu_write(*p, overstep); } + + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); } void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) @@ -455,6 +496,10 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); + /* See __mod_node_page_state */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); + v = __this_cpu_dec_return(*p); t = __this_cpu_read(pcp->stat_threshold); if (unlikely(v < - t)) { @@ -463,6 +508,9 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) node_page_state_add(v - overstep, pgdat, item); __this_cpu_write(*p, overstep); } + + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); } void __dec_zone_page_state(struct page *page, enum zone_stat_item item) From c9c3dca84331d7afd1c4522e22b3d8ed17071b80 Mon Sep 17 00:00:00 2001 From: zhangkui Date: Tue, 24 Aug 2021 09:59:39 +1000 Subject: [PATCH 239/365] mm/madvise: add MADV_WILLNEED to process_madvise() There is a usecase in Android that an app process's memory is swapped out by process_madvise() with MADV_PAGEOUT, such as the memory is swapped to zram or a backing device. When the process is scheduled to running, like switch to foreground, multiple page faults may cause the app dropped frames. To reduce the problem, System Management Software can read-ahead memory of the process immediately when the app switches to forground. Calling process_madvise() with MADV_WILLNEED can meet this need. Link: https://lkml.kernel.org/r/20210804082010.12482-1-zhangkui@oppo.com Signed-off-by: zhangkui Cc: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/madvise.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/madvise.c b/mm/madvise.c index 5c065bc8b5f6c..4a15a83031f6a 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1048,6 +1048,7 @@ process_madvise_behavior_valid(int behavior) switch (behavior) { case MADV_COLD: case MADV_PAGEOUT: + case MADV_WILLNEED: return true; default: return false; From 9ba945a5f5e3aefba5ee14885433b85ea71f1123 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 24 Aug 2021 09:59:40 +1000 Subject: [PATCH 240/365] memory-hotplug.rst: remove locking details from admin-guide Patch series "memory-hotplug.rst: complete admin-guide overhaul", v3. This patch (of 2): We have the same content at Documentation/core-api/memory-hotplug.rst and it doesn't fit into the admin-guide. The documentation was accidentially duplicated when merging. Link: https://lkml.kernel.org/r/20210707073205.3835-1-david@redhat.com Link: https://lkml.kernel.org/r/20210707073205.3835-2-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Mike Rapoport Acked-by: Michal Hocko Reviewed-by: Oscar Salvador Cc: Mike Kravetz Cc: Dave Hansen Cc: Matthew Wilcox Cc: Anshuman Khandual Cc: Muchun Song Cc: Pavel Tatashin Cc: Jonathan Corbet Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- .../admin-guide/mm/memory-hotplug.rst | 39 ------------------- 1 file changed, 39 deletions(-) diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst index c6bae2d771609..a783cf7c8e4c2 100644 --- a/Documentation/admin-guide/mm/memory-hotplug.rst +++ b/Documentation/admin-guide/mm/memory-hotplug.rst @@ -415,45 +415,6 @@ Need more implementation yet.... - Guard from remove if not yet. -Locking Internals -================= - -When adding/removing memory that uses memory block devices (i.e. ordinary RAM), -the device_hotplug_lock should be held to: - -- synchronize against online/offline requests (e.g. via sysfs). This way, memory - block devices can only be accessed (.online/.state attributes) by user - space once memory has been fully added. And when removing memory, we - know nobody is in critical sections. -- synchronize against CPU hotplug and similar (e.g. relevant for ACPI and PPC) - -Especially, there is a possible lock inversion that is avoided using -device_hotplug_lock when adding memory and user space tries to online that -memory faster than expected: - -- device_online() will first take the device_lock(), followed by - mem_hotplug_lock -- add_memory_resource() will first take the mem_hotplug_lock, followed by - the device_lock() (while creating the devices, during bus_add_device()). - -As the device is visible to user space before taking the device_lock(), this -can result in a lock inversion. - -onlining/offlining of memory should be done via device_online()/ -device_offline() - to make sure it is properly synchronized to actions -via sysfs. Holding device_hotplug_lock is advised (to e.g. protect online_type) - -When adding/removing/onlining/offlining memory or adding/removing -heterogeneous/device memory, we should always hold the mem_hotplug_lock in -write mode to serialise memory hotplug (e.g. access to global/zone -variables). - -In addition, mem_hotplug_lock (in contrast to device_hotplug_lock) in read -mode allows for a quite efficient get_online_mems/put_online_mems -implementation, so code accessing memory can protect from that memory -vanishing. - - Future Work =========== From 7f640ec87cbe5a667cfe6bf059fc06077aee1315 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 24 Aug 2021 09:59:40 +1000 Subject: [PATCH 241/365] memory-hotplug.rst: complete admin-guide overhaul The memory hot(un)plug documentation is outdated and incomplete. Most of the content dates back to 2007, so it's time for a major overhaul. Let's rewrite, reorganize and update most parts of the documentation. In addition to memory hot(un)plug, also add some details regarding ZONE_MOVABLE, with memory hotunplug being one of its main consumers. Drop the file history, that information can more reliably be had from the git log. The style of the document is also properly fixed that e.g., "restview" renders it cleanly now. In the future, we might add some more details about virt users like virtio-mem, the XEN balloon, the Hyper-V balloon and ppc64 dlpar. Link: https://lkml.kernel.org/r/20210707073205.3835-3-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Michal Hocko Reviewed-by: Mike Rapoport Reviewed-by: Oscar Salvador Cc: Mike Kravetz Cc: Dave Hansen Cc: Matthew Wilcox Cc: Anshuman Khandual Cc: Muchun Song Cc: Pavel Tatashin Cc: Jonathan Corbet Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- .../admin-guide/mm/memory-hotplug.rst | 761 +++++++++++------- 1 file changed, 455 insertions(+), 306 deletions(-) diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst index a783cf7c8e4c2..03dfbc9252529 100644 --- a/Documentation/admin-guide/mm/memory-hotplug.rst +++ b/Documentation/admin-guide/mm/memory-hotplug.rst @@ -1,427 +1,576 @@ .. _admin_guide_memory_hotplug: -============== -Memory Hotplug -============== +================== +Memory Hot(Un)Plug +================== -:Created: Jul 28 2007 -:Updated: Add some details about locking internals: Aug 20 2018 - -This document is about memory hotplug including how-to-use and current status. -Because Memory Hotplug is still under development, contents of this text will -be changed often. +This document describes generic Linux support for memory hot(un)plug with +a focus on System RAM, including ZONE_MOVABLE support. .. contents:: :local: -.. note:: +Introduction +============ - (1) x86_64's has special implementation for memory hotplug. - This text does not describe it. - (2) This text assumes that sysfs is mounted at ``/sys``. +Memory hot(un)plug allows for increasing and decreasing the size of physical +memory available to a machine at runtime. In the simplest case, it consists of +physically plugging or unplugging a DIMM at runtime, coordinated with the +operating system. +Memory hot(un)plug is used for various purposes: -Introduction -============ +- The physical memory available to a machine can be adjusted at runtime, up- or + downgrading the memory capacity. This dynamic memory resizing, sometimes + referred to as "capacity on demand", is frequently used with virtual machines + and logical partitions. + +- Replacing hardware, such as DIMMs or whole NUMA nodes, without downtime. One + example is replacing failing memory modules. -Purpose of memory hotplug -------------------------- +- Reducing energy consumption either by physically unplugging memory modules or + by logically unplugging (parts of) memory modules from Linux. -Memory Hotplug allows users to increase/decrease the amount of memory. -Generally, there are two purposes. +Further, the basic memory hot(un)plug infrastructure in Linux is nowadays also +used to expose persistent memory, other performance-differentiated memory and +reserved memory regions as ordinary system RAM to Linux. -(A) For changing the amount of memory. - This is to allow a feature like capacity on demand. -(B) For installing/removing DIMMs or NUMA-nodes physically. - This is to exchange DIMMs/NUMA-nodes, reduce power consumption, etc. +Linux only supports memory hot(un)plug on selected 64 bit architectures, such as +x86_64, arm64, ppc64, s390x and ia64. -(A) is required by highly virtualized environments and (B) is required by -hardware which supports memory power management. +Memory Hot(Un)Plug Granularity +------------------------------ -Linux memory hotplug is designed for both purpose. +Memory hot(un)plug in Linux uses the SPARSEMEM memory model, which divides the +physical memory address space into chunks of the same size: memory sections. The +size of a memory section is architecture dependent. For example, x86_64 uses +128 MiB and ppc64 uses 16 MiB. -Phases of memory hotplug +Memory sections are combined into chunks referred to as "memory blocks". The +size of a memory block is architecture dependent and corresponds to the smallest +granularity that can be hot(un)plugged. The default size of a memory block is +the same as memory section size, unless an architecture specifies otherwise. + +All memory blocks have the same size. + +Phases of Memory Hotplug ------------------------ -There are 2 phases in Memory Hotplug: +Memory hotplug consists of two phases: - 1) Physical Memory Hotplug phase - 2) Logical Memory Hotplug phase. +(1) Adding the memory to Linux +(2) Onlining memory blocks -The First phase is to communicate hardware/firmware and make/erase -environment for hotplugged memory. Basically, this phase is necessary -for the purpose (B), but this is good phase for communication between -highly virtualized environments too. +In the first phase, metadata, such as the memory map ("memmap") and page tables +for the direct mapping, is allocated and initialized, and memory blocks are +created; the latter also creates sysfs files for managing newly created memory +blocks. -When memory is hotplugged, the kernel recognizes new memory, makes new memory -management tables, and makes sysfs files for new memory's operation. +In the second phase, added memory is exposed to the page allocator. After this +phase, the memory is visible in memory statistics, such as free and total +memory, of the system. -If firmware supports notification of connection of new memory to OS, -this phase is triggered automatically. ACPI can notify this event. If not, -"probe" operation by system administration is used instead. -(see :ref:`memory_hotplug_physical_mem`). +Phases of Memory Hotunplug +-------------------------- -Logical Memory Hotplug phase is to change memory state into -available/unavailable for users. Amount of memory from user's view is -changed by this phase. The kernel makes all memory in it as free pages -when a memory range is available. +Memory hotunplug consists of two phases: -In this document, this phase is described as online/offline. +(1) Offlining memory blocks +(2) Removing the memory from Linux -Logical Memory Hotplug phase is triggered by write of sysfs file by system -administrator. For the hot-add case, it must be executed after Physical Hotplug -phase by hand. -(However, if you writes udev's hotplug scripts for memory hotplug, these -phases can be execute in seamless way.) +In the fist phase, memory is "hidden" from the page allocator again, for +example, by migrating busy memory to other memory locations and removing all +relevant free pages from the page allocator After this phase, the memory is no +longer visible in memory statistics of the system. -Unit of Memory online/offline operation ---------------------------------------- +In the second phase, the memory blocks are removed and metadata is freed. -Memory hotplug uses SPARSEMEM memory model which allows memory to be divided -into chunks of the same size. These chunks are called "sections". The size of -a memory section is architecture dependent. For example, power uses 16MiB, ia64 -uses 1GiB. +Memory Hotplug Notifications +============================ -Memory sections are combined into chunks referred to as "memory blocks". The -size of a memory block is architecture dependent and represents the logical -unit upon which memory online/offline operations are to be performed. The -default size of a memory block is the same as memory section size unless an -architecture specifies otherwise. (see :ref:`memory_hotplug_sysfs_files`.) +There are various ways how Linux is notified about memory hotplug events such +that it can start adding hotplugged memory. This description is limited to +systems that support ACPI; mechanisms specific to other firmware interfaces or +virtual machines are not described. -To determine the size (in bytes) of a memory block please read this file:: +ACPI Notifications +------------------ - /sys/devices/system/memory/block_size_bytes +Platforms that support ACPI, such as x86_64, can support memory hotplug +notifications via ACPI. -Kernel Configuration -==================== +In general, a firmware supporting memory hotplug defines a memory class object +HID "PNP0C80". When notified about hotplug of a new memory device, the ACPI +driver will hotplug the memory to Linux. -To use memory hotplug feature, kernel must be compiled with following -config options. +If the firmware supports hotplug of NUMA nodes, it defines an object _HID +"ACPI0004", "PNP0A05", or "PNP0A06". When notified about an hotplug event, all +assigned memory devices are added to Linux by the ACPI driver. -- For all memory hotplug: - - Memory model -> Sparse Memory (``CONFIG_SPARSEMEM``) - - Allow for memory hot-add (``CONFIG_MEMORY_HOTPLUG``) +Similarly, Linux can be notified about requests to hotunplug a memory device or +a NUMA node via ACPI. The ACPI driver will try offlining all relevant memory +blocks, and, if successful, hotunplug the memory from Linux. -- To enable memory removal, the following are also necessary: - - Allow for memory hot remove (``CONFIG_MEMORY_HOTREMOVE``) - - Page Migration (``CONFIG_MIGRATION``) +Manual Probing +-------------- -- For ACPI memory hotplug, the following are also necessary: - - Memory hotplug (under ACPI Support menu) (``CONFIG_ACPI_HOTPLUG_MEMORY``) - - This option can be kernel module. +On some architectures, the firmware may not be able to notify the operating +system about a memory hotplug event. Instead, the memory has to be manually +probed from user space. -- As a related configuration, if your box has a feature of NUMA-node hotplug - via ACPI, then this option is necessary too. +The probe interface is located at:: - - ACPI0004,PNP0A05 and PNP0A06 Container Driver (under ACPI Support menu) - (``CONFIG_ACPI_CONTAINER``). + /sys/devices/system/memory/probe - This option can be kernel module too. +Only complete memory blocks can be probed. Individual memory blocks are probed +by providing the physical start address of the memory block:: + % echo addr > /sys/devices/system/memory/probe -.. _memory_hotplug_sysfs_files: +Which results in a memory block for the range [addr, addr + memory_block_size) +being created. -sysfs files for memory hotplug -============================== +.. note:: -All memory blocks have their device information in sysfs. Each memory block -is described under ``/sys/devices/system/memory`` as:: + Using the probe interface is discouraged as it is easy to crash the kernel, + because Linux cannot validate user input; this interface might be removed in + the future. - /sys/devices/system/memory/memoryXXX +Onlining and Offlining Memory Blocks +==================================== -where XXX is the memory block id. +After a memory block has been created, Linux has to be instructed to actually +make use of that memory: the memory block has to be "online". -For the memory block covered by the sysfs directory. It is expected that all -memory sections in this range are present and no memory holes exist in the -range. Currently there is no way to determine if there is a memory hole, but -the existence of one should not affect the hotplug capabilities of the memory -block. +Before a memory block can be removed, Linux has to stop using any memory part of +the memory block: the memory block has to be "offlined". -For example, assume 1GiB memory block size. A device for a memory starting at -0x100000000 is ``/sys/device/system/memory/memory4``:: +The Linux kernel can be configured to automatically online added memory blocks +and drivers automatically trigger offlining of memory blocks when trying +hotunplug of memory. Memory blocks can only be removed once offlining succeeded +and drivers may trigger offlining of memory blocks when attempting hotunplug of +memory. - (0x100000000 / 1Gib = 4) +Onlining Memory Blocks Manually +------------------------------- -This device covers address range [0x100000000 ... 0x140000000) +If auto-onlining of memory blocks isn't enabled, user-space has to manually +trigger onlining of memory blocks. Often, udev rules are used to automate this +task in user space. -Under each memory block, you can see 5 files: +Onlining of a memory block can be triggered via:: -- ``/sys/devices/system/memory/memoryXXX/phys_index`` -- ``/sys/devices/system/memory/memoryXXX/phys_device`` -- ``/sys/devices/system/memory/memoryXXX/state`` -- ``/sys/devices/system/memory/memoryXXX/removable`` -- ``/sys/devices/system/memory/memoryXXX/valid_zones`` + % echo online > /sys/devices/system/memory/memoryXXX/state -=================== ============================================================ -``phys_index`` read-only and contains memory block id, same as XXX. -``state`` read-write +Or alternatively:: - - at read: contains online/offline state of memory. - - at write: user can specify "online_kernel", + % echo 1 > /sys/devices/system/memory/memoryXXX/online - "online_movable", "online", "offline" command - which will be performed on all sections in the block. -``phys_device`` read-only: legacy interface only ever used on s390x to - expose the covered storage increment. -``removable`` read-only: legacy interface that indicated whether a memory - block was likely to be offlineable or not. Newer kernel - versions return "1" if and only if the kernel supports - memory offlining. -``valid_zones`` read-only: designed to show by which zone memory provided by - a memory block is managed, and to show by which zone memory - provided by an offline memory block could be managed when - onlining. - - The first column shows it`s default zone. - - "memory6/valid_zones: Normal Movable" shows this memoryblock - can be onlined to ZONE_NORMAL by default and to ZONE_MOVABLE - by online_movable. - - "memory7/valid_zones: Movable Normal" shows this memoryblock - can be onlined to ZONE_MOVABLE by default and to ZONE_NORMAL - by online_kernel. -=================== ============================================================ +The kernel will select the target zone automatically, usually defaulting to +``ZONE_NORMAL`` unless ``movablecore=1`` has been specified on the kernel +command line or if the memory block would intersect the ZONE_MOVABLE already. -.. note:: +One can explicitly request to associate an offline memory block with +ZONE_MOVABLE by:: - These directories/files appear after physical memory hotplug phase. + % echo online_movable > /sys/devices/system/memory/memoryXXX/state -If CONFIG_NUMA is enabled the memoryXXX/ directories can also be accessed -via symbolic links located in the ``/sys/devices/system/node/node*`` directories. +Or one can explicitly request a kernel zone (usually ZONE_NORMAL) by:: -For example:: + % echo online_kernel > /sys/devices/system/memory/memoryXXX/state - /sys/devices/system/node/node0/memory9 -> ../../memory/memory9 +In any case, if onlining succeeds, the state of the memory block is changed to +be "online". If it fails, the state of the memory block will remain unchanged +and the above commands will fail. -A backlink will also be created:: +Onlining Memory Blocks Automatically +------------------------------------ - /sys/devices/system/memory/memory9/node0 -> ../../node/node0 +The kernel can be configured to try auto-onlining of newly added memory blocks. +If this feature is disabled, the memory blocks will stay offline until +explicitly onlined from user space. -.. _memory_hotplug_physical_mem: +The configured auto-online behavior can be observed via:: -Physical memory hot-add phase -============================= + % cat /sys/devices/system/memory/auto_online_blocks -Hardware(Firmware) Support --------------------------- +Auto-onlining can be enabled by writing ``online``, ``online_kernel`` or +``online_movable`` to that file, like:: -On x86_64/ia64 platform, memory hotplug by ACPI is supported. + % echo online > /sys/devices/system/memory/auto_online_blocks -In general, the firmware (ACPI) which supports memory hotplug defines -memory class object of _HID "PNP0C80". When a notify is asserted to PNP0C80, -Linux's ACPI handler does hot-add memory to the system and calls a hotplug udev -script. This will be done automatically. +Modifying the auto-online behavior will only affect all subsequently added +memory blocks only. -But scripts for memory hotplug are not contained in generic udev package(now). -You may have to write it by yourself or online/offline memory by hand. -Please see :ref:`memory_hotplug_how_to_online_memory` and -:ref:`memory_hotplug_how_to_offline_memory`. +.. note:: -If firmware supports NUMA-node hotplug, and defines an object _HID "ACPI0004", -"PNP0A05", or "PNP0A06", notification is asserted to it, and ACPI handler -calls hotplug code for all of objects which are defined in it. -If memory device is found, memory hotplug code will be called. + In corner cases, auto-onlining can fail. The kernel won't retry. Note that + auto-onlining is not expected to fail in default configurations. -Notify memory hot-add event by hand ------------------------------------ +.. note:: -On some architectures, the firmware may not notify the kernel of a memory -hotplug event. Therefore, the memory "probe" interface is supported to -explicitly notify the kernel. This interface depends on -CONFIG_ARCH_MEMORY_PROBE and can be configured on powerpc, sh, and x86 -if hotplug is supported, although for x86 this should be handled by ACPI -notification. + DLPAR on ppc64 ignores the ``offline`` setting and will still online added + memory blocks; if onlining fails, memory blocks are removed again. -Probe interface is located at:: +Offlining Memory Blocks +----------------------- - /sys/devices/system/memory/probe +In the current implementation, Linux's memory offlining will try migrating all +movable pages off the affected memory block. As most kernel allocations, such as +page tables, are unmovable, page migration can fail and, therefore, inhibit +memory offlining from succeeding. -You can tell the physical address of new memory to the kernel by:: +Having the memory provided by memory block managed by ZONE_MOVABLE significantly +increases memory offlining reliability; still, memory offlining can fail in +some corner cases. - % echo start_address_of_new_memory > /sys/devices/system/memory/probe +Further, memory offlining might retry for a long time (or even forever), until +aborted by the user. -Then, [start_address_of_new_memory, start_address_of_new_memory + -memory_block_size] memory range is hot-added. In this case, hotplug script is -not called (in current implementation). You'll have to online memory by -yourself. Please see :ref:`memory_hotplug_how_to_online_memory`. +Offlining of a memory block can be triggered via:: -Logical Memory hot-add phase -============================ + % echo offline > /sys/devices/system/memory/memoryXXX/state -State of memory ---------------- +Or alternatively:: -To see (online/offline) state of a memory block, read 'state' file:: + % echo 0 > /sys/devices/system/memory/memoryXXX/online - % cat /sys/device/system/memory/memoryXXX/state +If offlining succeeds, the state of the memory block is changed to be "offline". +If it fails, the state of the memory block will remain unchanged and the above +commands will fail, for example, via:: + bash: echo: write error: Device or resource busy -- If the memory block is online, you'll read "online". -- If the memory block is offline, you'll read "offline". +or via:: + bash: echo: write error: Invalid argument -.. _memory_hotplug_how_to_online_memory: +Observing the State of Memory Blocks +------------------------------------ -How to online memory --------------------- +The state (online/offline/going-offline) of a memory block can be observed +either via:: -When the memory is hot-added, the kernel decides whether or not to "online" -it according to the policy which can be read from "auto_online_blocks" file:: + % cat /sys/device/system/memory/memoryXXX/state - % cat /sys/devices/system/memory/auto_online_blocks +Or alternatively (1/0) via:: -The default depends on the CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE kernel config -option. If it is disabled the default is "offline" which means the newly added -memory is not in a ready-to-use state and you have to "online" the newly added -memory blocks manually. Automatic onlining can be requested by writing "online" -to "auto_online_blocks" file:: + % cat /sys/device/system/memory/memoryXXX/online - % echo online > /sys/devices/system/memory/auto_online_blocks +For an online memory block, the managing zone can be observed via:: -This sets a global policy and impacts all memory blocks that will subsequently -be hotplugged. Currently offline blocks keep their state. It is possible, under -certain circumstances, that some memory blocks will be added but will fail to -online. User space tools can check their "state" files -(``/sys/devices/system/memory/memoryXXX/state``) and try to online them manually. + % cat /sys/device/system/memory/memoryXXX/valid_zones -If the automatic onlining wasn't requested, failed, or some memory block was -offlined it is possible to change the individual block's state by writing to the -"state" file:: +Configuring Memory Hot(Un)Plug +============================== - % echo online > /sys/devices/system/memory/memoryXXX/state +There are various ways how system administrators can configure memory +hot(un)plug and interact with memory blocks, especially, to online them. -This onlining will not change the ZONE type of the target memory block, -If the memory block doesn't belong to any zone an appropriate kernel zone -(usually ZONE_NORMAL) will be used unless movable_node kernel command line -option is specified when ZONE_MOVABLE will be used. +Memory Hot(Un)Plug Configuration via Sysfs +------------------------------------------ -You can explicitly request to associate it with ZONE_MOVABLE by:: +Some memory hot(un)plug properties can be configured or inspected via sysfs in:: - % echo online_movable > /sys/devices/system/memory/memoryXXX/state + /sys/devices/system/memory/ -.. note:: current limit: this memory block must be adjacent to ZONE_MOVABLE +The following files are currently defined: -Or you can explicitly request a kernel zone (usually ZONE_NORMAL) by:: +====================== ========================================================= +``auto_online_blocks`` read-write: set or get the default state of new memory + blocks; configure auto-onlining. - % echo online_kernel > /sys/devices/system/memory/memoryXXX/state + The default value depends on the + CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE kernel configuration + option. -.. note:: current limit: this memory block must be adjacent to ZONE_NORMAL + See the ``state`` property of memory blocks for details. +``block_size_bytes`` read-only: the size in bytes of a memory block. +``probe`` write-only: add (probe) selected memory blocks manually + from user space by supplying the physical start address. -An explicit zone onlining can fail (e.g. when the range is already within -and existing and incompatible zone already). + Availability depends on the CONFIG_ARCH_MEMORY_PROBE + kernel configuration option. +``uevent`` read-write: generic udev file for device subsystems. +====================== ========================================================= -After this, memory block XXX's state will be 'online' and the amount of -available memory will be increased. +.. note:: -This may be changed in future. + When the CONFIG_MEMORY_FAILURE kernel configuration option is enabled, two + additional files ``hard_offline_page`` and ``soft_offline_page`` are available + to trigger hwpoisoning of pages, for example, for testing purposes. Note that + this functionality is not really related to memory hot(un)plug or actual + offlining of memory blocks. -Logical memory remove -===================== +Memory Block Configuration via Sysfs +------------------------------------ -Memory offline and ZONE_MOVABLE -------------------------------- +Each memory block is represented as a memory block device that can be +onlined or offlined. All memory blocks have their device information located in +sysfs. Each present memory block is listed under +``/sys/devices/system/memory`` as:: -Memory offlining is more complicated than memory online. Because memory offline -has to make the whole memory block be unused, memory offline can fail if -the memory block includes memory which cannot be freed. + /sys/devices/system/memory/memoryXXX -In general, memory offline can use 2 techniques. +where XXX is the memory block id; the number of digits is variable. -(1) reclaim and free all memory in the memory block. -(2) migrate all pages in the memory block. +A present memory block indicates that some memory in the range is present; +however, a memory block might span memory holes. A memory block spanning memory +holes cannot be offlined. -In the current implementation, Linux's memory offline uses method (2), freeing -all pages in the memory block by page migration. But not all pages are -migratable. Under current Linux, migratable pages are anonymous pages and -page caches. For offlining a memory block by migration, the kernel has to -guarantee that the memory block contains only migratable pages. +For example, assume 1 GiB memory block size. A device for a memory starting at +0x100000000 is ``/sys/device/system/memory/memory4``:: -Now, a boot option for making a memory block which consists of migratable pages -is supported. By specifying "kernelcore=" or "movablecore=" boot option, you can -create ZONE_MOVABLE...a zone which is just used for movable pages. -(See also Documentation/admin-guide/kernel-parameters.rst) + (0x100000000 / 1Gib = 4) -Assume the system has "TOTAL" amount of memory at boot time, this boot option -creates ZONE_MOVABLE as following. +This device covers address range [0x100000000 ... 0x140000000) -1) When kernelcore=YYYY boot option is used, - Size of memory not for movable pages (not for offline) is YYYY. - Size of memory for movable pages (for offline) is TOTAL-YYYY. +The following files are currently defined: -2) When movablecore=ZZZZ boot option is used, - Size of memory not for movable pages (not for offline) is TOTAL - ZZZZ. - Size of memory for movable pages (for offline) is ZZZZ. +=================== ============================================================ +``online`` read-write: simplified interface to trigger onlining / + offlining and to observe the state of a memory block. + When onlining, the zone is selected automatically. +``phys_device`` read-only: legacy interface only ever used on s390x to + expose the covered storage increment. +``phys_index`` read-only: the memory block id (XXX). +``removable`` read-only: legacy interface that indicated whether a memory + block was likely to be offlineable or not. Nowadays, the + kernel return ``1`` if and only if it supports memory + offlining. +``state`` read-write: advanced interface to trigger onlining / + offlining and to observe the state of a memory block. + + When writing, ``online``, ``offline``, ``online_kernel`` and + ``online_movable`` are supported. + + ``online_movable`` specifies onlining to ZONE_MOVABLE. + ``online_kernel`` specifies onlining to the default kernel + zone for the memory block, such as ZONE_NORMAL. + ``online`` let's the kernel select the zone automatically. + + When reading, ``online``, ``offline`` and ``going-offline`` + may be returned. +``uevent`` read-write: generic uevent file for devices. +``valid_zones`` read-only: when a block is online, shows the zone it + belongs to; when a block is offline, shows what zone will + manage it when the block will be onlined. + + For online memory blocks, ``DMA``, ``DMA32``, ``Normal``, + ``Movable`` and ``none`` may be returned. ``none`` indicates + that memory provided by a memory block is managed by + multiple zones or spans multiple nodes; such memory blocks + cannot be offlined. ``Movable`` indicates ZONE_MOVABLE. + Other values indicate a kernel zone. + + For offline memory blocks, the first column shows the + zone the kernel would select when onlining the memory block + right now without further specifying a zone. + + Availability depends on the CONFIG_MEMORY_HOTREMOVE + kernel configuration option. +=================== ============================================================ .. note:: - Unfortunately, there is no information to show which memory block belongs - to ZONE_MOVABLE. This is TBD. + If the CONFIG_NUMA kernel configuration option is enabled, the memoryXXX/ + directories can also be accessed via symbolic links located in the + ``/sys/devices/system/node/node*`` directories. + + For example:: + + /sys/devices/system/node/node0/memory9 -> ../../memory/memory9 + + A backlink will also be created:: + + /sys/devices/system/memory/memory9/node0 -> ../../node/node0 + +Command Line Parameters +----------------------- + +Some command line parameters affect memory hot(un)plug handling. The following +command line parameters are relevant: + +======================== ======================================================= +``memhp_default_state`` configure auto-onlining by essentially setting + ``/sys/devices/system/memory/auto_online_blocks``. +``movablecore`` configure automatic zone selection of the kernel. When + set, the kernel will default to ZONE_MOVABLE, unless + other zones can be kept contiguous. +======================== ======================================================= + +Module Parameters +------------------ - Memory offlining can fail when dissolving a free huge page on ZONE_MOVABLE - and the feature of freeing unused vmemmap pages associated with each hugetlb - page is enabled. +Instead of additional command line parameters or sysfs files, the +``memory_hotplug`` subsystem now provides a dedicated namespace for module +parameters. Module parameters can be set via the command line by predicating +them with ``memory_hotplug.`` such as:: + + memory_hotplug.memmap_on_memory=1 + +and they can be observed (and some even modified at runtime) via:: + + /sys/modules/memory_hotplug/parameters/ + +The following module parameters are currently defined: + +======================== ======================================================= +``memmap_on_memory`` read-write: Allocate memory for the memmap from the + added memory block itself. Even if enabled, actual + support depends on various other system properties and + should only be regarded as a hint whether the behavior + would be desired. + + While allocating the memmap from the memory block + itself makes memory hotplug less likely to fail and + keeps the memmap on the same NUMA node in any case, it + can fragment physical memory in a way that huge pages + in bigger granularity cannot be formed on hotplugged + memory. +======================== ======================================================= + +ZONE_MOVABLE +============ + +ZONE_MOVABLE is an important mechanism for more reliable memory offlining. +Further, having system RAM managed by ZONE_MOVABLE instead of one of the +kernel zones can increase the number of possible transparent huge pages and +dynamically allocated huge pages. + +Most kernel allocations are unmovable. Important examples include the memory +map (usually 1/64ths of memory), page tables, and kmalloc(). Such allocations +can only be served from the kernel zones. + +Most user space pages, such as anonymous memory, and page cache pages are +movable. Such allocations can be served from ZONE_MOVABLE and the kernel zones. + +Only movable allocations are served from ZONE_MOVABLE, resulting in unmovable +allocations being limited to the kernel zones. Without ZONE_MOVABLE, there is +absolutely no guarantee whether a memory block can be offlined successfully. + +Zone Imbalances +--------------- - This can happen when we have plenty of ZONE_MOVABLE memory, but not enough - kernel memory to allocate vmemmmap pages. We may even be able to migrate - huge page contents, but will not be able to dissolve the source huge page. - This will prevent an offline operation and is unfortunate as memory offlining - is expected to succeed on movable zones. Users that depend on memory hotplug - to succeed for movable zones should carefully consider whether the memory - savings gained from this feature are worth the risk of possibly not being - able to offline memory in certain situations. +Having too much system RAM managed by ZONE_MOVABLE is called a zone imbalance, +which can harm the system or degrade performance. As one example, the kernel +might crash because it runs out of free memory for unmovable allocations, +although there is still plenty of free memory left in ZONE_MOVABLE. + +Usually, MOVABLE:KERNEL ratios of up to 3:1 or even 4:1 are fine. Ratios of 63:1 +are definitely impossible due to the overhead for the memory map. + +Actual safe zone ratios depend on the workload. Extreme cases, like excessive +long-term pinning of pages, might not be able to deal with ZONE_MOVABLE at all. .. note:: - Techniques that rely on long-term pinnings of memory (especially, RDMA and - vfio) are fundamentally problematic with ZONE_MOVABLE and, therefore, memory - hot remove. Pinned pages cannot reside on ZONE_MOVABLE, to guarantee that - memory can still get hot removed - be aware that pinning can fail even if - there is plenty of free memory in ZONE_MOVABLE. In addition, using - ZONE_MOVABLE might make page pinning more expensive, because pages have to be - migrated off that zone first. -.. _memory_hotplug_how_to_offline_memory: + CMA memory part of a kernel zone essentially behaves like memory in + ZONE_MOVABLE and similar considerations apply, especially when combining + CMA with ZONE_MOVABLE. -How to offline memory ---------------------- +ZONE_MOVABLE Sizing Considerations +---------------------------------- -You can offline a memory block by using the same sysfs interface that was used -in memory onlining:: +We usually expect that a large portion of available system RAM will actually +be consumed by user space, either directly or indirectly via the page cache. In +the normal case, ZONE_MOVABLE can be used when allocating such pages just fine. - % echo offline > /sys/devices/system/memory/memoryXXX/state +With that in mind, it makes sense that we can have a big portion of system RAM +managed by ZONE_MOVABLE. However, there are some things to consider when using +ZONE_MOVABLE, especially when fine-tuning zone ratios: + +- Having a lot of offline memory blocks. Even offline memory blocks consume + memory for metadata and page tables in the direct map; having a lot of offline + memory blocks is not a typical case, though. + +- Memory ballooning without balloon compaction is incompatible with + ZONE_MOVABLE. Only some implementations, such as virtio-balloon and + pseries CMM, fully support balloon compaction. + + Further, the CONFIG_BALLOON_COMPACTION kernel configuration option might be + disabled. In that case, balloon inflation will only perform unmovable + allocations and silently create a zone imbalance, usually triggered by + inflation requests from the hypervisor. + +- Gigantic pages are unmovable, resulting in user space consuming a + lot of unmovable memory. + +- Huge pages are unmovable when an architectures does not support huge + page migration, resulting in a similar issue as with gigantic pages. + +- Page tables are unmovable. Excessive swapping, mapping extremely large + files or ZONE_DEVICE memory can be problematic, although only really relevant + in corner cases. When we manage a lot of user space memory that has been + swapped out or is served from a file/persistent memory/... we still need a lot + of page tables to manage that memory once user space accessed that memory. + +- In certain DAX configurations the memory map for the device memory will be + allocated from the kernel zones. + +- KASAN can have a significant memory overhead, for example, consuming 1/8th of + the total system memory size as (unmovable) tracking metadata. + +- Long-term pinning of pages. Techniques that rely on long-term pinnings + (especially, RDMA and vfio/mdev) are fundamentally problematic with + ZONE_MOVABLE, and therefore, memory offlining. Pinned pages cannot reside + on ZONE_MOVABLE as that would turn these pages unmovable. Therefore, they + have to be migrated off that zone while pinning. Pinning a page can fail + even if there is plenty of free memory in ZONE_MOVABLE. + + In addition, using ZONE_MOVABLE might make page pinning more expensive, + because of the page migration overhead. + +By default, all the memory configured at boot time is managed by the kernel +zones and ZONE_MOVABLE is not used. + +To enable ZONE_MOVABLE to include the memory present at boot and to control the +ratio between movable and kernel zones there are two command line options: +``kernelcore=`` and ``movablecore=``. See +Documentation/admin-guide/kernel-parameters.rst for their description. + +Memory Offlining and ZONE_MOVABLE +--------------------------------- + +Even with ZONE_MOVABLE, there are some corner cases where offlining a memory +block might fail: + +- Memory blocks with memory holes; this applies to memory blocks present during + boot and can apply to memory blocks hotplugged via the XEN balloon and the + Hyper-V balloon. + +- Mixed NUMA nodes and mixed zones within a single memory block prevent memory + offlining; this applies to memory blocks present during boot only. + +- Special memory blocks prevented by the system from getting offlined. Examples + include any memory available during boot on arm64 or memory blocks spanning + the crashkernel area on s390x; this usually applies to memory blocks present + during boot only. + +- Memory blocks overlapping with CMA areas cannot be offlined, this applies to + memory blocks present during boot only. + +- Concurrent activity that operates on the same physical memory area, such as + allocating gigantic pages, can result in temporary offlining failures. + +- Out of memory when dissolving huge pages, especially when freeing unused + vmemmap pages associated with each hugetlb page is enabled. + + Offlining code may be able to migrate huge page contents, but may not be able + to dissolve the source huge page because it fails allocating (unmovable) pages + for the vmemmap, because the system might not have free memory in the kernel + zones left. + + Users that depend on memory offlining to succeed for movable zones should + carefully consider whether the memory savings gained from this feature are + worth the risk of possibly not being able to offline memory in certain + situations. + +Further, when running into out of memory situations while migrating pages, or +when still encountering permanently unmovable pages within ZONE_MOVABLE +(-> BUG), memory offlining will keep retrying until it eventually succeeds. + +When offlining is triggered from user space, the offlining context can be +terminated by sending a fatal signal. A timeout based offlining can easily be +implemented via:: -If offline succeeds, the state of the memory block is changed to be "offline". -If it fails, some error core (like -EBUSY) will be returned by the kernel. -Even if a memory block does not belong to ZONE_MOVABLE, you can try to offline -it. If it doesn't contain 'unmovable' memory, you'll get success. - -A memory block under ZONE_MOVABLE is considered to be able to be offlined -easily. But under some busy state, it may return -EBUSY. Even if a memory -block cannot be offlined due to -EBUSY, you can retry offlining it and may be -able to offline it (or not). (For example, a page is referred to by some kernel -internal call and released soon.) - -Consideration: - Memory hotplug's design direction is to make the possibility of memory - offlining higher and to guarantee unplugging memory under any situation. But - it needs more work. Returning -EBUSY under some situation may be good because - the user can decide to retry more or not by himself. Currently, memory - offlining code does some amount of retry with 120 seconds timeout. - -Physical memory remove -====================== - -Need more implementation yet.... - - Notification completion of remove works by OS to firmware. - - Guard from remove if not yet. - - -Future Work -=========== - - - allowing memory hot-add to ZONE_MOVABLE. maybe we need some switch like - sysctl or new control file. - - showing memory block and physical device relationship. - - test and make it better memory offlining. - - support HugeTLB page migration and offlining. - - memmap removing at memory offline. - - physical remove memory. + % timeout $TIMEOUT offline_block | failure_handling From 76560aa61988a559c612d615fdc2fe87d00dbe87 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 24 Aug 2021 09:59:40 +1000 Subject: [PATCH 242/365] mm: remove pfn_valid_within() and CONFIG_HOLES_IN_ZONE Patch series "mm: remove pfn_valid_within() and CONFIG_HOLES_IN_ZONE". After recent updates to freeing unused parts of the memory map, no architecture can have holes in the memory map within a pageblock. This makes pfn_valid_within() check and CONFIG_HOLES_IN_ZONE configuration option redundant. The first patch removes them both in a mechanical way and the second patch simplifies memory_hotplug::test_pages_in_a_zone() that had pfn_valid_within() surrounded by more logic than simple if. This patch (of 2): After recent changes in freeing of the unused parts of the memory map and rework of pfn_valid() in arm and arm64 there are no architectures that can have holes in the memory map within a pageblock and so nothing can enable CONFIG_HOLES_IN_ZONE which guards non trivial implementation of pfn_valid_within(). With that, pfn_valid_within() is always hardwired to 1 and can be completely removed. Remove calls to pfn_valid_within() and CONFIG_HOLES_IN_ZONE. Link: https://lkml.kernel.org/r/20210713080035.7464-1-rppt@kernel.org Link: https://lkml.kernel.org/r/20210713080035.7464-2-rppt@kernel.org Signed-off-by: Mike Rapoport Acked-by: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- drivers/base/node.c | 2 -- include/linux/mmzone.h | 12 ------------ mm/Kconfig | 3 --- mm/compaction.c | 20 +++++++------------- mm/memory_hotplug.c | 4 ---- mm/page_alloc.c | 24 ++---------------------- mm/page_isolation.c | 7 +------ mm/page_owner.c | 14 +------------- 8 files changed, 11 insertions(+), 75 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index 4a4ae868ad9f8..8ec6b7dfbb0f9 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -768,8 +768,6 @@ int unregister_cpu_under_node(unsigned int cpu, unsigned int nid) #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE static int __ref get_nid_for_pfn(unsigned long pfn) { - if (!pfn_valid_within(pfn)) - return -1; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT if (system_state < SYSTEM_RUNNING) return early_pfn_to_nid(pfn); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 1bd5f5955f9a7..50c515a529963 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1525,18 +1525,6 @@ void sparse_init(void); #define subsection_map_init(_pfn, _nr_pages) do {} while (0) #endif /* CONFIG_SPARSEMEM */ -/* - * If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we - * need to check pfn validity within that MAX_ORDER_NR_PAGES block. - * pfn_valid_within() should be used in this case; we optimise this away - * when we have no holes within a MAX_ORDER_NR_PAGES block. - */ -#ifdef CONFIG_HOLES_IN_ZONE -#define pfn_valid_within(pfn) pfn_valid(pfn) -#else -#define pfn_valid_within(pfn) (1) -#endif - #endif /* !__GENERATING_BOUNDS.H */ #endif /* !__ASSEMBLY__ */ #endif /* _LINUX_MMZONE_H */ diff --git a/mm/Kconfig b/mm/Kconfig index 5dc28e9205e00..1f9bd3371765b 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -96,9 +96,6 @@ config HAVE_FAST_GUP depends on MMU bool -config HOLES_IN_ZONE - bool - # Don't discard allocated memory used to track "memory" and "reserved" memblocks # after early boot, so it can still be used to test for validity of memory. # Also, memblocks are updated with memory hot(un)plug. diff --git a/mm/compaction.c b/mm/compaction.c index fa9b2b598eabe..bfc93da1c2c7c 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -306,16 +306,14 @@ __reset_isolation_pfn(struct zone *zone, unsigned long pfn, bool check_source, * is necessary for the block to be a migration source/target. */ do { - if (pfn_valid_within(pfn)) { - if (check_source && PageLRU(page)) { - clear_pageblock_skip(page); - return true; - } + if (check_source && PageLRU(page)) { + clear_pageblock_skip(page); + return true; + } - if (check_target && PageBuddy(page)) { - clear_pageblock_skip(page); - return true; - } + if (check_target && PageBuddy(page)) { + clear_pageblock_skip(page); + return true; } page += (1 << PAGE_ALLOC_COSTLY_ORDER); @@ -585,8 +583,6 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, break; nr_scanned++; - if (!pfn_valid_within(blockpfn)) - goto isolate_fail; /* * For compound pages such as THP and hugetlbfs, we can save @@ -885,8 +881,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, cond_resched(); } - if (!pfn_valid_within(low_pfn)) - goto isolate_fail; nr_scanned++; page = pfn_to_page(low_pfn); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 4c527a80b6c99..02718a50ef543 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1308,10 +1308,6 @@ struct zone *test_pages_in_a_zone(unsigned long start_pfn, for (; pfn < sec_end_pfn && pfn < end_pfn; pfn += MAX_ORDER_NR_PAGES) { i = 0; - /* This is just a CONFIG_HOLES_IN_ZONE check.*/ - while ((i < MAX_ORDER_NR_PAGES) && - !pfn_valid_within(pfn + i)) - i++; if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn) continue; /* Check if we got outside of the zone */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f95e1d2386a13..9766f2ffaebeb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -594,8 +594,6 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page) static int page_is_consistent(struct zone *zone, struct page *page) { - if (!pfn_valid_within(page_to_pfn(page))) - return 0; if (zone != page_zone(page)) return 0; @@ -1025,16 +1023,12 @@ buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn, if (order >= MAX_ORDER - 2) return false; - if (!pfn_valid_within(buddy_pfn)) - return false; - combined_pfn = buddy_pfn & pfn; higher_page = page + (combined_pfn - pfn); buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1); higher_buddy = higher_page + (buddy_pfn - combined_pfn); - return pfn_valid_within(buddy_pfn) && - page_is_buddy(higher_page, higher_buddy, order + 1); + return page_is_buddy(higher_page, higher_buddy, order + 1); } /* @@ -1095,8 +1089,6 @@ static inline void __free_one_page(struct page *page, buddy_pfn = __find_buddy_pfn(pfn, order); buddy = page + (buddy_pfn - pfn); - if (!pfn_valid_within(buddy_pfn)) - goto done_merging; if (!page_is_buddy(page, buddy, order)) goto done_merging; /* @@ -1754,9 +1746,7 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn, /* * Check that the whole (or subset of) a pageblock given by the interval of * [start_pfn, end_pfn) is valid and within the same zone, before scanning it - * with the migration of free compaction scanner. The scanners then need to - * use only pfn_valid_within() check for arches that allow holes within - * pageblocks. + * with the migration of free compaction scanner. * * Return struct page pointer of start_pfn, or NULL if checks were not passed. * @@ -1872,8 +1862,6 @@ static inline void __init pgdat_init_report_one_done(void) */ static inline bool __init deferred_pfn_valid(unsigned long pfn) { - if (!pfn_valid_within(pfn)) - return false; if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn)) return false; return true; @@ -2520,11 +2508,6 @@ static int move_freepages(struct zone *zone, int pages_moved = 0; for (pfn = start_pfn; pfn <= end_pfn;) { - if (!pfn_valid_within(pfn)) { - pfn++; - continue; - } - page = pfn_to_page(pfn); if (!PageBuddy(page)) { /* @@ -8828,9 +8811,6 @@ struct page *has_unmovable_pages(struct zone *zone, struct page *page, } for (; iter < pageblock_nr_pages - offset; iter++) { - if (!pfn_valid_within(pfn + iter)) - continue; - page = pfn_to_page(pfn + iter); /* diff --git a/mm/page_isolation.c b/mm/page_isolation.c index bddf788f45bff..471e3a13b5411 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -93,8 +93,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) buddy_pfn = __find_buddy_pfn(pfn, order); buddy = page + (buddy_pfn - pfn); - if (pfn_valid_within(buddy_pfn) && - !is_migrate_isolate_page(buddy)) { + if (!is_migrate_isolate_page(buddy)) { __isolate_free_page(page, order); isolated_page = true; } @@ -250,10 +249,6 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, struct page *page; while (pfn < end_pfn) { - if (!pfn_valid_within(pfn)) { - pfn++; - continue; - } page = pfn_to_page(pfn); if (PageBuddy(page)) /* diff --git a/mm/page_owner.c b/mm/page_owner.c index f51a57e92aa38..62402d22539b8 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -276,9 +276,6 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, pageblock_mt = get_pageblock_migratetype(page); for (; pfn < block_end_pfn; pfn++) { - if (!pfn_valid_within(pfn)) - continue; - /* The pageblock is online, no need to recheck. */ page = pfn_to_page(pfn); @@ -479,10 +476,6 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) continue; } - /* Check for holes within a MAX_ORDER area */ - if (!pfn_valid_within(pfn)) - continue; - page = pfn_to_page(pfn); if (PageBuddy(page)) { unsigned long freepage_order = buddy_order_unsafe(page); @@ -560,14 +553,9 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) block_end_pfn = min(block_end_pfn, end_pfn); for (; pfn < block_end_pfn; pfn++) { - struct page *page; + struct page *page = pfn_to_page(pfn); struct page_ext *page_ext; - if (!pfn_valid_within(pfn)) - continue; - - page = pfn_to_page(pfn); - if (page_zone(page) != zone) continue; From 1bc61f2d39cd7509ad27cc605f13890712abc07a Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 24 Aug 2021 09:59:40 +1000 Subject: [PATCH 243/365] mm: memory_hotplug: cleanup after removal of pfn_valid_within() When test_pages_in_a_zone() used pfn_valid_within() is has some logic surrounding pfn_valid_within() checks. Since pfn_valid_within() is gone, this logic can be removed. Link: https://lkml.kernel.org/r/20210713080035.7464-3-rppt@kernel.org Signed-off-by: Mike Rapoport Acked-by: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memory_hotplug.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 02718a50ef543..c26195d3803c3 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1298,7 +1298,7 @@ struct zone *test_pages_in_a_zone(unsigned long start_pfn, unsigned long pfn, sec_end_pfn; struct zone *zone = NULL; struct page *page; - int i; + for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn + 1); pfn < end_pfn; pfn = sec_end_pfn, sec_end_pfn += PAGES_PER_SECTION) { @@ -1307,13 +1307,10 @@ struct zone *test_pages_in_a_zone(unsigned long start_pfn, continue; for (; pfn < sec_end_pfn && pfn < end_pfn; pfn += MAX_ORDER_NR_PAGES) { - i = 0; - if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn) - continue; /* Check if we got outside of the zone */ - if (zone && !zone_spans_pfn(zone, pfn + i)) + if (zone && !zone_spans_pfn(zone, pfn)) return NULL; - page = pfn_to_page(pfn + i); + page = pfn_to_page(pfn); if (zone && page_zone(page) != zone) return NULL; zone = page_zone(page); From 8c1cf499197a292c23100e8a243cd5aedd8b1b06 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 24 Aug 2021 09:59:40 +1000 Subject: [PATCH 244/365] mm/memory_hotplug: use "unsigned long" for PFN in zone_for_pfn_range() Patch series "mm/memory_hotplug: preparatory patches for new online policy and memory" These are all cleanups and one fix previously sent as part of [1]: [PATCH v1 00/12] mm/memory_hotplug: "auto-movable" online policy and memory groups. These patches make sense even without the other series, therefore I pulled them out to make the other series easier to digest. [1] https://lkml.kernel.org/r/20210607195430.48228-1-david@redhat.com This patch (of 4): Checkpatch complained on a follow-up patch that we are using "unsigned" here, which defaults to "unsigned int" and checkpatch is correct. As we will search for a fitting zone using the wrong pfn, we might end up onlining memory to one of the special kernel zones, such as ZONE_DMA, which can end badly as the onlined memory does not satisfy properties of these zones. Use "unsigned long" instead, just as we do in other places when handling PFNs. This can bite us once we have physical addresses in the range of multiple TB. Link: https://lkml.kernel.org/r/20210712124052.26491-2-david@redhat.com Fixes: e5e689302633 ("mm, memory_hotplug: display allowed zones in the preferred ordering") Signed-off-by: David Hildenbrand Reviewed-by: Pankaj Gupta Reviewed-by: Muchun Song Reviewed-by: Oscar Salvador Cc: David Hildenbrand Cc: Vitaly Kuznetsov Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Pankaj Gupta Cc: Wei Yang Cc: Michal Hocko Cc: Dan Williams Cc: Anshuman Khandual Cc: Dave Hansen Cc: Vlastimil Babka Cc: Mike Rapoport Cc: "Rafael J. Wysocki" Cc: Len Brown Cc: Pavel Tatashin Cc: Heiko Carstens Cc: Michael Ellerman Cc: Catalin Marinas Cc: virtualization@lists.linux-foundation.org Cc: Andy Lutomirski Cc: "Aneesh Kumar K.V" Cc: Anton Blanchard Cc: Ard Biesheuvel Cc: Baoquan He Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Dave Jiang Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jia He Cc: Joe Perches Cc: Kefeng Wang Cc: Laurent Dufour Cc: Michel Lespinasse Cc: Nathan Lynch Cc: Nicholas Piggin Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Pierre Morel Cc: "Rafael J. Wysocki" Cc: Rich Felker Cc: Scott Cheloha Cc: Sergei Trofimovich Cc: Thiago Jung Bauermann Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vishal Verma Cc: Will Deacon Cc: Yoshinori Sato Cc: Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/memory_hotplug.h | 4 ++-- mm/memory_hotplug.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index a7fd2c3ccb777..d01b504ce06fe 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -339,8 +339,8 @@ extern void sparse_remove_section(struct mem_section *ms, unsigned long map_offset, struct vmem_altmap *altmap); extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); -extern struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, - unsigned long nr_pages); +extern struct zone *zone_for_pfn_range(int online_type, int nid, + unsigned long start_pfn, unsigned long nr_pages); extern int arch_create_linear_mapping(int nid, u64 start, u64 size, struct mhp_params *params); void arch_remove_linear_mapping(u64 start, u64 size); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c26195d3803c3..dd921820c4d09 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -708,8 +708,8 @@ static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn return movable_node_enabled ? movable_zone : kernel_zone; } -struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, - unsigned long nr_pages) +struct zone *zone_for_pfn_range(int online_type, int nid, + unsigned long start_pfn, unsigned long nr_pages) { if (online_type == MMOP_ONLINE_KERNEL) return default_kernel_zone_for_pfn(nid, start_pfn, nr_pages); From 7eac001b264ccce9e63df1711ed21236dd5ae9b3 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 24 Aug 2021 09:59:41 +1000 Subject: [PATCH 245/365] mm/memory_hotplug: remove nid parameter from arch_remove_memory() The parameter is unused, let's remove it. Link: https://lkml.kernel.org/r/20210712124052.26491-3-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Catalin Marinas Acked-by: Michael Ellerman [powerpc] Acked-by: Heiko Carstens [s390] Reviewed-by: Pankaj Gupta Reviewed-by: Oscar Salvador Cc: Catalin Marinas Cc: Will Deacon Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: Yoshinori Sato Cc: Rich Felker Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Anshuman Khandual Cc: Ard Biesheuvel Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Pavel Tatashin Cc: Baoquan He Cc: Laurent Dufour Cc: Sergei Trofimovich Cc: Kefeng Wang Cc: Michel Lespinasse Cc: Christophe Leroy Cc: "Aneesh Kumar K.V" Cc: Thiago Jung Bauermann Cc: Joe Perches Cc: Pierre Morel Cc: Jia He Cc: Anton Blanchard Cc: Dan Williams Cc: Dave Jiang Cc: Jason Wang Cc: Len Brown Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Nathan Lynch Cc: Pankaj Gupta Cc: "Rafael J. Wysocki" Cc: "Rafael J. Wysocki" Cc: Scott Cheloha Cc: Vishal Verma Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/arm64/mm/mmu.c | 3 +-- arch/ia64/mm/init.c | 3 +-- arch/powerpc/mm/mem.c | 3 +-- arch/s390/mm/init.c | 3 +-- arch/sh/mm/init.c | 3 +-- arch/x86/mm/init_32.c | 3 +-- arch/x86/mm/init_64.c | 3 +-- include/linux/memory_hotplug.h | 3 +-- mm/memory_hotplug.c | 4 ++-- mm/memremap.c | 5 +---- 10 files changed, 11 insertions(+), 22 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 9ff0de1b2b93c..cfd9deb347c38 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1502,8 +1502,7 @@ int arch_add_memory(int nid, u64 start, u64 size, return ret; } -void arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap) +void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 064a967a7b6e3..5c6da8d83c1ad 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -484,8 +484,7 @@ int arch_add_memory(int nid, u64 start, u64 size, return ret; } -void arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap) +void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index ad198b4392224..c3c4e31462eca 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -119,8 +119,7 @@ int __ref arch_add_memory(int nid, u64 start, u64 size, return rc; } -void __ref arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap) +void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 8ac710de1ab1b..d85bd7f5d8dc6 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -306,8 +306,7 @@ int arch_add_memory(int nid, u64 start, u64 size, return rc; } -void arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap) +void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index ce26c7f8950a3..506784702430c 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -414,8 +414,7 @@ int arch_add_memory(int nid, u64 start, u64 size, return ret; } -void arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap) +void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = PFN_DOWN(start); unsigned long nr_pages = size >> PAGE_SHIFT; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 74b78840182df..bd90b8fe81e45 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -801,8 +801,7 @@ int arch_add_memory(int nid, u64 start, u64 size, return __add_pages(nid, start_pfn, nr_pages, params); } -void arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap) +void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index ddeaba947eb3d..a6e11763763fb 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1255,8 +1255,7 @@ kernel_physical_mapping_remove(unsigned long start, unsigned long end) remove_pagetable(start, end, true, NULL); } -void __ref arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap) +void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index d01b504ce06fe..010a192298b53 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -130,8 +130,7 @@ static inline bool movable_node_is_enabled(void) return movable_node_enabled; } -extern void arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap); +extern void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap); extern void __remove_pages(unsigned long start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index dd921820c4d09..396d913b6d2b3 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1106,7 +1106,7 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) /* create memory block devices after memory was added */ ret = create_memory_block_devices(start, size, mhp_altmap.alloc); if (ret) { - arch_remove_memory(nid, start, size, NULL); + arch_remove_memory(start, size, NULL); goto error; } @@ -1886,7 +1886,7 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size) mem_hotplug_begin(); - arch_remove_memory(nid, start, size, altmap); + arch_remove_memory(start, size, altmap); if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) { memblock_free(start, size); diff --git a/mm/memremap.c b/mm/memremap.c index 805d761740c42..e99944ab8eb3c 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -160,14 +160,11 @@ static void pageunmap_range(struct dev_pagemap *pgmap, int range_id) { struct range *range = &pgmap->ranges[range_id]; struct page *first_page; - int nid; /* make sure to access a memmap that was actually initialized */ first_page = pfn_to_page(pfn_first(pgmap, range_id)); /* pages are dead and unused, undo the arch mapping */ - nid = page_to_nid(first_page); - mem_hotplug_begin(); remove_pfn_range_from_zone(page_zone(first_page), PHYS_PFN(range->start), PHYS_PFN(range_len(range))); @@ -175,7 +172,7 @@ static void pageunmap_range(struct dev_pagemap *pgmap, int range_id) __remove_pages(PHYS_PFN(range->start), PHYS_PFN(range_len(range)), NULL); } else { - arch_remove_memory(nid, range->start, range_len(range), + arch_remove_memory(range->start, range_len(range), pgmap_altmap(pgmap)); kasan_remove_zero_shadow(__va(range->start), range_len(range)); } From 0d5077ceae1d41da3eb609e13c4118d108208c14 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 24 Aug 2021 09:59:41 +1000 Subject: [PATCH 246/365] mm/memory_hotplug: remove nid parameter from remove_memory() and friends There is only a single user remaining. We can simply lookup the nid only used for node offlining purposes when walking our memory blocks. We don't expect to remove multi-nid ranges; and if we'd ever do, we most probably don't care about removing multi-nid ranges that actually result in empty nodes. If ever required, we can detect the "multi-nid" scenario and simply try offlining all online nodes. Link: https://lkml.kernel.org/r/20210712124052.26491-4-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Michael Ellerman (powerpc) Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: "Rafael J. Wysocki" Cc: Len Brown Cc: Dan Williams Cc: Vishal Verma Cc: Dave Jiang Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Nathan Lynch Cc: Laurent Dufour Cc: "Aneesh Kumar K.V" Cc: Scott Cheloha Cc: Anton Blanchard Cc: Andy Lutomirski Cc: Anshuman Khandual Cc: Ard Biesheuvel Cc: Baoquan He Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Dave Hansen Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jia He Cc: Joe Perches Cc: Kefeng Wang Cc: Michal Hocko Cc: Michel Lespinasse Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Pankaj Gupta Cc: Pankaj Gupta Cc: Pavel Tatashin Cc: Peter Zijlstra Cc: Pierre Morel Cc: "Rafael J. Wysocki" Cc: Rich Felker Cc: Sergei Trofimovich Cc: Thiago Jung Bauermann Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Wei Yang Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- .../platforms/pseries/hotplug-memory.c | 9 +++--- drivers/acpi/acpi_memhotplug.c | 7 +---- drivers/dax/kmem.c | 3 +- drivers/virtio/virtio_mem.c | 4 +-- include/linux/memory_hotplug.h | 10 +++---- mm/memory_hotplug.c | 28 +++++++++++-------- 6 files changed, 30 insertions(+), 31 deletions(-) diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index d4f28ee4d5dce..533cb1335b7f2 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -284,7 +284,7 @@ static int pseries_remove_memblock(unsigned long base, unsigned long memblock_si { unsigned long block_sz, start_pfn; int sections_per_block; - int i, nid; + int i; start_pfn = base >> PAGE_SHIFT; @@ -295,10 +295,9 @@ static int pseries_remove_memblock(unsigned long base, unsigned long memblock_si block_sz = pseries_memory_block_size(); sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE; - nid = memory_add_physaddr_to_nid(base); for (i = 0; i < sections_per_block; i++) { - __remove_memory(nid, base, MIN_MEMORY_BLOCK_SIZE); + __remove_memory(base, MIN_MEMORY_BLOCK_SIZE); base += MIN_MEMORY_BLOCK_SIZE; } @@ -385,7 +384,7 @@ static int dlpar_remove_lmb(struct drmem_lmb *lmb) block_sz = pseries_memory_block_size(); - __remove_memory(mem_block->nid, lmb->base_addr, block_sz); + __remove_memory(lmb->base_addr, block_sz); put_device(&mem_block->dev); /* Update memory regions for memory remove */ @@ -658,7 +657,7 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb) rc = dlpar_online_lmb(lmb); if (rc) { - __remove_memory(nid, lmb->base_addr, block_sz); + __remove_memory(lmb->base_addr, block_sz); invalidate_lmb_associativity_index(lmb); } else { lmb->flags |= DRCONF_MEM_ASSIGNED; diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c index 8cc195c4c8619..1d01d9414c407 100644 --- a/drivers/acpi/acpi_memhotplug.c +++ b/drivers/acpi/acpi_memhotplug.c @@ -239,19 +239,14 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device) static void acpi_memory_remove_memory(struct acpi_memory_device *mem_device) { - acpi_handle handle = mem_device->device->handle; struct acpi_memory_info *info, *n; - int nid = acpi_get_node(handle); list_for_each_entry_safe(info, n, &mem_device->res_list, list) { if (!info->enabled) continue; - if (nid == NUMA_NO_NODE) - nid = memory_add_physaddr_to_nid(info->start_addr); - acpi_unbind_memory_blocks(info); - __remove_memory(nid, info->start_addr, info->length); + __remove_memory(info->start_addr, info->length); list_del(&info->list); kfree(info); } diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c index ac231cc363595..99e0f60c4c266 100644 --- a/drivers/dax/kmem.c +++ b/drivers/dax/kmem.c @@ -156,8 +156,7 @@ static void dev_dax_kmem_remove(struct dev_dax *dev_dax) if (rc) continue; - rc = remove_memory(dev_dax->target_node, range.start, - range_len(&range)); + rc = remove_memory(range.start, range_len(&range)); if (rc == 0) { release_resource(data->res[i]); kfree(data->res[i]); diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index 09ed55de07d7d..774986695dc48 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -677,7 +677,7 @@ static int virtio_mem_remove_memory(struct virtio_mem *vm, uint64_t addr, dev_dbg(&vm->vdev->dev, "removing memory: 0x%llx - 0x%llx\n", addr, addr + size - 1); - rc = remove_memory(vm->nid, addr, size); + rc = remove_memory(addr, size); if (!rc) { atomic64_sub(size, &vm->offline_size); /* @@ -720,7 +720,7 @@ static int virtio_mem_offline_and_remove_memory(struct virtio_mem *vm, "offlining and removing memory: 0x%llx - 0x%llx\n", addr, addr + size - 1); - rc = offline_and_remove_memory(vm->nid, addr, size); + rc = offline_and_remove_memory(addr, size); if (!rc) { atomic64_sub(size, &vm->offline_size); /* diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 010a192298b53..068e3dcf46904 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -292,9 +292,9 @@ static inline void pgdat_resize_init(struct pglist_data *pgdat) {} extern void try_offline_node(int nid); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); -extern int remove_memory(int nid, u64 start, u64 size); -extern void __remove_memory(int nid, u64 start, u64 size); -extern int offline_and_remove_memory(int nid, u64 start, u64 size); +extern int remove_memory(u64 start, u64 size); +extern void __remove_memory(u64 start, u64 size); +extern int offline_and_remove_memory(u64 start, u64 size); #else static inline void try_offline_node(int nid) {} @@ -304,12 +304,12 @@ static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages) return -EINVAL; } -static inline int remove_memory(int nid, u64 start, u64 size) +static inline int remove_memory(u64 start, u64 size) { return -EBUSY; } -static inline void __remove_memory(int nid, u64 start, u64 size) {} +static inline void __remove_memory(u64 start, u64 size) {} #endif /* CONFIG_MEMORY_HOTREMOVE */ extern void set_zone_contiguous(struct zone *zone); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 396d913b6d2b3..e14a9bb9592cb 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1739,7 +1739,9 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) { int ret = !is_memblock_offlined(mem); + int *nid = arg; + *nid = mem->nid; if (unlikely(ret)) { phys_addr_t beginpa, endpa; @@ -1832,12 +1834,12 @@ void try_offline_node(int nid) } EXPORT_SYMBOL(try_offline_node); -static int __ref try_remove_memory(int nid, u64 start, u64 size) +static int __ref try_remove_memory(u64 start, u64 size) { - int rc = 0; struct vmem_altmap mhp_altmap = {}; struct vmem_altmap *altmap = NULL; unsigned long nr_vmemmap_pages; + int rc = 0, nid = NUMA_NO_NODE; BUG_ON(check_hotplug_memory_range(start, size)); @@ -1845,8 +1847,12 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size) * All memory blocks must be offlined before removing memory. Check * whether all memory blocks in question are offline and return error * if this is not the case. + * + * While at it, determine the nid. Note that if we'd have mixed nodes, + * we'd only try to offline the last determined one -- which is good + * enough for the cases we care about. */ - rc = walk_memory_blocks(start, size, NULL, check_memblock_offlined_cb); + rc = walk_memory_blocks(start, size, &nid, check_memblock_offlined_cb); if (rc) return rc; @@ -1895,7 +1901,8 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size) release_mem_region_adjustable(start, size); - try_offline_node(nid); + if (nid != NUMA_NO_NODE) + try_offline_node(nid); mem_hotplug_done(); return 0; @@ -1903,7 +1910,6 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size) /** * __remove_memory - Remove memory if every memory block is offline - * @nid: the node ID * @start: physical address of the region to remove * @size: size of the region to remove * @@ -1911,14 +1917,14 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size) * and online/offline operations before this call, as required by * try_offline_node(). */ -void __remove_memory(int nid, u64 start, u64 size) +void __remove_memory(u64 start, u64 size) { /* * trigger BUG() if some memory is not offlined prior to calling this * function */ - if (try_remove_memory(nid, start, size)) + if (try_remove_memory(start, size)) BUG(); } @@ -1926,12 +1932,12 @@ void __remove_memory(int nid, u64 start, u64 size) * Remove memory if every memory block is offline, otherwise return -EBUSY is * some memory is not offline */ -int remove_memory(int nid, u64 start, u64 size) +int remove_memory(u64 start, u64 size) { int rc; lock_device_hotplug(); - rc = try_remove_memory(nid, start, size); + rc = try_remove_memory(start, size); unlock_device_hotplug(); return rc; @@ -1991,7 +1997,7 @@ static int try_reonline_memory_block(struct memory_block *mem, void *arg) * unplugged all memory (so it's no longer in use) and want to offline + remove * that memory. */ -int offline_and_remove_memory(int nid, u64 start, u64 size) +int offline_and_remove_memory(u64 start, u64 size) { const unsigned long mb_count = size / memory_block_size_bytes(); uint8_t *online_types, *tmp; @@ -2027,7 +2033,7 @@ int offline_and_remove_memory(int nid, u64 start, u64 size) * This cannot fail as it cannot get onlined in the meantime. */ if (!rc) { - rc = try_remove_memory(nid, start, size); + rc = try_remove_memory(start, size); if (rc) pr_err("%s: Failed to remove memory: %d", __func__, rc); } From d028d3cbb08e37bb8159baf3ab932304476ee05e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 24 Aug 2021 09:59:41 +1000 Subject: [PATCH 247/365] ACPI: memhotplug: memory resources cannot be enabled yet We allocate + initialize everything from scratch. In case enabling the device fails, we free all memory resourcs. Link: https://lkml.kernel.org/r/20210712124052.26491-5-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Rafael J. Wysocki Reviewed-by: Oscar Salvador Reviewed-by: Pankaj Gupta Cc: Andy Lutomirski Cc: "Aneesh Kumar K.V" Cc: Anshuman Khandual Cc: Anton Blanchard Cc: Ard Biesheuvel Cc: Baoquan He Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Dan Williams Cc: Dave Hansen Cc: Dave Jiang Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jason Wang Cc: Jia He Cc: Joe Perches Cc: Kefeng Wang Cc: Laurent Dufour Cc: Len Brown Cc: Michael Ellerman Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Michel Lespinasse Cc: Mike Rapoport Cc: Nathan Lynch Cc: Nicholas Piggin Cc: Pankaj Gupta Cc: Paul Mackerras Cc: Pavel Tatashin Cc: Peter Zijlstra Cc: Pierre Morel Cc: "Rafael J. Wysocki" Cc: Rich Felker Cc: Scott Cheloha Cc: Sergei Trofimovich Cc: Thiago Jung Bauermann Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Wei Yang Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- drivers/acpi/acpi_memhotplug.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c index 1d01d9414c407..eb4faf7c5cad0 100644 --- a/drivers/acpi/acpi_memhotplug.c +++ b/drivers/acpi/acpi_memhotplug.c @@ -182,10 +182,6 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device) * (i.e. memory-hot-remove function) */ list_for_each_entry(info, &mem_device->res_list, list) { - if (info->enabled) { /* just sanity check...*/ - num_enabled++; - continue; - } /* * If the memory block size is zero, please ignore it. * Don't try to do the following memory hotplug flowchart. From 7875f7cd8acbce107b7f1b51a53ea4dfc47b13c1 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 24 Aug 2021 09:59:41 +1000 Subject: [PATCH 248/365] mm: track present early pages per zone Patch series "mm/memory_hotplug: "auto-movable" online policy and memory groups", v3. I. Goal The goal of this series is improving in-kernel auto-online support. It tackles the fundamental problems that: 1) We can create zone imbalances when onlining all memory blindly to ZONE_MOVABLE, in the worst case crashing the system. We have to know upfront how much memory we are going to hotplug such that we can safely enable auto-onlining of all hotplugged memory to ZONE_MOVABLE via "online_movable". This is far from practical and only applicable in limited setups -- like inside VMs under the RHV/oVirt hypervisor which will never hotplug more than 3 times the boot memory (and the limitation is only in place due to the Linux limitation). 2) We see more setups that implement dynamic VM resizing, hot(un)plugging memory to resize VM memory. In these setups, we might hotplug a lot of memory, but it might happen in various small steps in both directions (e.g., 2 GiB -> 8 GiB -> 4 GiB -> 16 GiB ...). virtio-mem is the primary driver of this upstream right now, performing such dynamic resizing NUMA-aware via multiple virtio-mem devices. Onlining all hotplugged memory to ZONE_NORMAL means we basically have no hotunplug guarantees. Onlining all to ZONE_MOVABLE means we can easily run into zone imbalances when growing a VM. We want a mixture, and we want as much memory as reasonable/configured in ZONE_MOVABLE. Details regarding zone imbalances can be found at [1]. 3) Memory devices consist of 1..X memory block devices, however, the kernel doesn't really track the relationship. Consequently, also user space has no idea. We want to make per-device decisions. As one example, for memory hotunplug it doesn't make sense to use a mixture of zones within a single DIMM: we want all MOVABLE if possible, otherwise all !MOVABLE, because any !MOVABLE part will easily block the whole DIMM from getting hotunplugged. As another example, virtio-mem operates on individual units that span 1..X memory blocks. Similar to a DIMM, we want a unit to either be all MOVABLE or !MOVABLE. A "unit" can be thought of like a DIMM, however, all units of a virtio-mem device logically belong together and are managed (added/removed) by a single driver. We want as much memory of a virtio-mem device to be MOVABLE as possible. 4) We want memory onlining to be done right from the kernel while adding memory, not triggered by user space via udev rules; for example, this is reqired for fast memory hotplug for drivers that add individual memory blocks, like virito-mem. We want a way to configure a policy in the kernel and avoid implementing advanced policies in user space. The auto-onlining support we have in the kernel is not sufficient. All we have is a) online everything MOVABLE (online_movable) b) online everything !MOVABLE (online_kernel) c) keep zones contiguous (online). This series allows configuring c) to mean instead "online movable if possible according to the coniguration, driven by a maximum MOVABLE:KERNEL ratio" -- a new onlining policy. II. Approach This series does 3 things: 1) Introduces the "auto-movable" online policy that initially operates on individual memory blocks only. It uses a maximum MOVABLE:KERNEL ratio to make a decision whether a memory block will be onlined to ZONE_MOVABLE or not. However, in the basic form, hotplugged KERNEL memory does not allow for more MOVABLE memory (details in the patches). CMA memory is treated like MOVABLE memory. 2) Introduces static (e.g., DIMM) and dynamic (e.g., virtio-mem) memory groups and uses group information to make decisions in the "auto-movable" online policy across memory blocks of a single memory device (modeled as memory group). More details can be found in patch #3 or in the DIMM example below. 3) Maximizes ZONE_MOVABLE memory within dynamic memory groups, by allowing ZONE_NORMAL memory within a dynamic memory group to allow for more ZONE_MOVABLE memory within the same memory group. The target use case is dynamic VM resizing using virtio-mem. See the virtio-mem example below. I remember that the basic idea of using a ratio to implement a policy in the kernel was once mentioned by Vitaly Kuznetsov, but I might be wrong (I lost the pointer to that discussion). For me, the main use case is using it along with virtio-mem (and DIMMs / ppc64 dlpar where necessary) for dynamic resizing of VMs, increasing the amount of memory we can hotunplug reliably again if we might eventually hotplug a lot of memory to a VM. III. Target Usage The target usage will be: 1) Linux boots with "mhp_default_online_type=offline" 2) User space (e.g., systemd unit) configures memory onlining (according to a config file and system properties), for example: * Setting memory_hotplug.online_policy=auto-movable * Setting memory_hotplug.auto_movable_ratio=301 * Setting memory_hotplug.auto_movable_numa_aware=true 3) User space enabled auto onlining via "echo online > /sys/devices/system/memory/auto_online_blocks" 4) User space triggers manual onlining of all already-offline memory blocks (go over offline memory blocks and set them to "online") IV. Example For DIMMs, hotplugging 4 GiB DIMMs to a 4 GiB VM with a configured ratio of 301% results in the following layout: Memory block 0-15: DMA32 (early) Memory block 32-47: Normal (early) Memory block 48-79: Movable (DIMM 0) Memory block 80-111: Movable (DIMM 1) Memory block 112-143: Movable (DIMM 2) Memory block 144-275: Normal (DIMM 3) Memory block 176-207: Normal (DIMM 4) ... all Normal (-> hotplugged Normal memory does not allow for more Movable memory) For virtio-mem, using a simple, single virtio-mem device with a 4 GiB VM will result in the following layout: Memory block 0-15: DMA32 (early) Memory block 32-47: Normal (early) Memory block 48-143: Movable (virtio-mem, first 12 GiB) Memory block 144: Normal (virtio-mem, next 128 MiB) Memory block 145-147: Movable (virtio-mem, next 384 MiB) Memory block 148: Normal (virtio-mem, next 128 MiB) Memory block 149-151: Movable (virtio-mem, next 384 MiB) ... Normal/Movable mixture as above (-> hotplugged Normal memory allows for more Movable memory within the same device) Which gives us maximum flexibility when dynamically growing/shrinking a VM in smaller steps. V. Doc Update I'll update the memory-hotplug.rst documentation, once the overhaul [1] is usptream. Until then, details can be found in patch #2. VI. Future Work 1) Use memory groups for ppc64 dlpar 2) Being able to specify a portion of (early) kernel memory that will be excluded from the ratio. Like "128 MiB globally/per node" are excluded. This might be helpful when starting VMs with extremely small memory footprint (e.g., 128 MiB) and hotplugging memory later -- not wanting the first hotplugged units getting onlined to ZONE_MOVABLE. One alternative would be a trigger to not consider ZONE_DMA memory in the ratio. We'll have to see if this is really rrequired. 3) Indicate to user space that MOVABLE might be a bad idea -- especially relevant when memory ballooning without support for balloon compaction is active. This patch (of 9): For implementing a new memory onlining policy, which determines when to online memory blocks to ZONE_MOVABLE semi-automatically, we need the number of present early (boot) pages -- present pages excluding hotplugged pages. Let's track these pages per zone. Pass a page instead of the zone to adjust_present_page_count(), similar as adjust_managed_page_count() and derive the zone from the page. It's worth noting that a memory block to be offlined/onlined is either completely "early" or "not early". add_memory() and friends can only add complete memory blocks and we only online/offline complete (individual) memory blocks. Link: https://lkml.kernel.org/r/20210806124715.17090-1-david@redhat.com Link: https://lkml.kernel.org/r/20210806124715.17090-2-david@redhat.com Signed-off-by: David Hildenbrand Cc: Vitaly Kuznetsov Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Marek Kedzierski Cc: Hui Zhu Cc: Pankaj Gupta Cc: Wei Yang Cc: Oscar Salvador Cc: Michal Hocko Cc: Dan Williams Cc: Anshuman Khandual Cc: Dave Hansen Cc: Vlastimil Babka Cc: Mike Rapoport Cc: "Rafael J. Wysocki" Cc: Len Brown Cc: Pavel Tatashin Cc: Greg Kroah-Hartman Cc: Rafael J. Wysocki Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- drivers/base/memory.c | 14 +++++++------- include/linux/memory_hotplug.h | 2 +- include/linux/mmzone.h | 7 +++++++ mm/memory_hotplug.c | 14 +++++++++++--- mm/page_alloc.c | 3 +++ 5 files changed, 29 insertions(+), 11 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index e3fd2dbf4eea8..b8a596a5f07c5 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -205,7 +205,8 @@ static int memory_block_online(struct memory_block *mem) * now already properly populated. */ if (nr_vmemmap_pages) - adjust_present_page_count(zone, nr_vmemmap_pages); + adjust_present_page_count(pfn_to_page(start_pfn), + nr_vmemmap_pages); return ret; } @@ -215,24 +216,23 @@ static int memory_block_offline(struct memory_block *mem) unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages; - struct zone *zone; int ret; /* * Unaccount before offlining, such that unpopulated zone and kthreads * can properly be torn down in offline_pages(). */ - if (nr_vmemmap_pages) { - zone = page_zone(pfn_to_page(start_pfn)); - adjust_present_page_count(zone, -nr_vmemmap_pages); - } + if (nr_vmemmap_pages) + adjust_present_page_count(pfn_to_page(start_pfn), + -nr_vmemmap_pages); ret = offline_pages(start_pfn + nr_vmemmap_pages, nr_pages - nr_vmemmap_pages); if (ret) { /* offline_pages() failed. Account back. */ if (nr_vmemmap_pages) - adjust_present_page_count(zone, nr_vmemmap_pages); + adjust_present_page_count(pfn_to_page(start_pfn), + nr_vmemmap_pages); return ret; } diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 068e3dcf46904..39b04e99a30eb 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -95,7 +95,7 @@ static inline void zone_seqlock_init(struct zone *zone) extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages); extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages); extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); -extern void adjust_present_page_count(struct zone *zone, long nr_pages); +extern void adjust_present_page_count(struct page *page, long nr_pages); /* VM interface that may be used by firmware interface */ extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, struct zone *zone); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 50c515a529963..6a1d79d84675a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -540,6 +540,10 @@ struct zone { * is calculated as: * present_pages = spanned_pages - absent_pages(pages in holes); * + * present_early_pages is present pages existing within the zone + * located on memory available since early boot, excluding hotplugged + * memory. + * * managed_pages is present pages managed by the buddy system, which * is calculated as (reserved_pages includes pages allocated by the * bootmem allocator): @@ -572,6 +576,9 @@ struct zone { atomic_long_t managed_pages; unsigned long spanned_pages; unsigned long present_pages; +#if defined(CONFIG_MEMORY_HOTPLUG) + unsigned long present_early_pages; +#endif #ifdef CONFIG_CMA unsigned long cma_pages; #endif diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e14a9bb9592cb..41a2a3977aac9 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -724,8 +724,16 @@ struct zone *zone_for_pfn_range(int online_type, int nid, * This function should only be called by memory_block_{online,offline}, * and {online,offline}_pages. */ -void adjust_present_page_count(struct zone *zone, long nr_pages) +void adjust_present_page_count(struct page *page, long nr_pages) { + struct zone *zone = page_zone(page); + + /* + * We only support onlining/offlining/adding/removing of complete + * memory blocks; therefore, either all is either early or hotplugged. + */ + if (early_section(__pfn_to_section(page_to_pfn(page)))) + zone->present_early_pages += nr_pages; zone->present_pages += nr_pages; zone->zone_pgdat->node_present_pages += nr_pages; } @@ -826,7 +834,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *z } online_pages_range(pfn, nr_pages); - adjust_present_page_count(zone, nr_pages); + adjust_present_page_count(pfn_to_page(pfn), nr_pages); node_states_set_node(nid, &arg); if (need_zonelists_rebuild) @@ -1697,7 +1705,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) /* removal success */ adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages); - adjust_present_page_count(zone, -nr_pages); + adjust_present_page_count(pfn_to_page(start_pfn), -nr_pages); /* reinitialise watermarks and update pcp limits */ init_per_zone_wmark_min(); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9766f2ffaebeb..de309a1dfe652 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7254,6 +7254,9 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat, zone->zone_start_pfn = 0; zone->spanned_pages = size; zone->present_pages = real_size; +#if defined(CONFIG_MEMORY_HOTPLUG) + zone->present_early_pages = real_size; +#endif totalpages += size; realtotalpages += real_size; From 2b9c2d6927376aaad974a13562732d5d2206f5b0 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 24 Aug 2021 09:59:41 +1000 Subject: [PATCH 249/365] mm/memory_hotplug: introduce "auto-movable" online policy When onlining without specifying a zone (using "online" instead of "online_kernel" or "online_movable"), we currently select a zone such that existing zones are kept contiguous. This online policy made sense in the past, where contiguous zones where required. We'd like to implement smarter policies, however: * User space has little insight. As one example, it has no idea which memory blocks logically belong together (e.g., to a DIMM or to a virtio-mem device). * Drivers that add memory in separate memory blocks, especially virtio-mem, want memory to get onlined right from the kernel when adding. So we really want to have onlining to differing zones managed in the kernel, configured by user space. We see more and more cases where we might eventually hotplug a lot of memory in the future (e.g., eventually grow a 2 GiB VM to 64 GiB), however: * Resizing happens dynamically, in smaller steps in both directions (e.g., 2 GiB -> 8 GiB -> 4 GiB -> 16 GiB ...) * We still want as much flexibility as possible, especially, hotunplugging as much memory as possible later. We can really only use "online_movable" if we know that the amount of memory we are going to hotplug upfront, and we know that it won't result in a zone imbalance. So in our example, a 2 GiB VM that could grow to 64 GiB could currently not use "online_movable", and instead, "online_kernel" would have to be used, resulting in worse (no) memory hotunplug reliability. Let's add a new "auto-movable" online policy that considers the current zone ratios (global, per-node) to determine, whether we a memory block can be onlined to ZONE_MOVABLE: MOVABLE : KERNEL However, internally we'll only consider the following ratio for now: MOVABLE : KERNEL_EARLY For now, we don't allow for hotplugged KERNEL memory to allow for more MOVABLE memory, because there is no coordination across memory devices. In follow-up patches, we will allow for more KERNEL memory within a memory device to allow for more MOVABLE memory within the same memory device -- which only makes sense for special memory device types. We base our calculation on "present pages", see the code comments for details. Hotplugged memory will get online to ZONE_MOVABLE if the configured ratio allows for it. Depending on the setup, this can result in fragmented zones, which can make compaction slower and dynamic allocation of gigantic pages when not using CMA less reliable (... which is already pretty unreliable). The old policy will be the default and called "contig-zones". In follow-up patches, our new policy will use additional information, such as memory groups, to make even smarter decisions across memory blocks. Configuration: * memory_hotplug.online_policy is used to switch between both polices and defaults to "contig-zones". * memory_hotplug.auto_movable_ratio defines the maximum ratio is in percent and defaults to "301" -- allowing e.g., most 8 GiB machines to grow to 32 GiB and have all hotplugged memory in ZONE_MOVABLE. The additional percent accounts for a handful of lost present pages (e.g., firmware allocations). User space is expected to adjust this ratio when enabling the new "auto-movable" policy, though. * memory_hotplug.auto_movable_numa_aware considers numa node stats in addition to global stats, and defaults to "true". Note: just like the old policy, the new policy won't take things like unmovable huge pages or memory ballooning that doesn't support balloon compaction into account. User space has to configure onlining accordingly. Link: https://lkml.kernel.org/r/20210806124715.17090-3-david@redhat.com Signed-off-by: David Hildenbrand Cc: Anshuman Khandual Cc: Dan Williams Cc: Dave Hansen Cc: Greg Kroah-Hartman Cc: Hui Zhu Cc: Jason Wang Cc: Len Brown Cc: Marek Kedzierski Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Pankaj Gupta Cc: Pavel Tatashin Cc: Rafael J. Wysocki Cc: "Rafael J. Wysocki" Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memory_hotplug.c | 191 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 191 insertions(+) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 41a2a3977aac9..c8a7d8ffb6203 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -52,6 +52,73 @@ module_param(memmap_on_memory, bool, 0444); MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug"); #endif +enum { + ONLINE_POLICY_CONTIG_ZONES = 0, + ONLINE_POLICY_AUTO_MOVABLE, +}; + +const char *online_policy_to_str[] = { + [ONLINE_POLICY_CONTIG_ZONES] = "contig-zones", + [ONLINE_POLICY_AUTO_MOVABLE] = "auto-movable", +}; + +static int set_online_policy(const char *val, const struct kernel_param *kp) +{ + int ret = sysfs_match_string(online_policy_to_str, val); + + if (ret < 0) + return ret; + *((int *)kp->arg) = ret; + return 0; +} + +static int get_online_policy(char *buffer, const struct kernel_param *kp) +{ + return sprintf(buffer, "%s\n", online_policy_to_str[*((int *)kp->arg)]); +} + +/* + * memory_hotplug.online_policy: configure online behavior when onlining without + * specifying a zone (MMOP_ONLINE) + * + * "contig-zones": keep zone contiguous + * "auto-movable": online memory to ZONE_MOVABLE if the configuration + * (auto_movable_ratio, auto_movable_numa_aware) allows for it + */ +static int online_policy __read_mostly = ONLINE_POLICY_CONTIG_ZONES; +static const struct kernel_param_ops online_policy_ops = { + .set = set_online_policy, + .get = get_online_policy, +}; +module_param_cb(online_policy, &online_policy_ops, &online_policy, 0644); +MODULE_PARM_DESC(online_policy, + "Set the online policy (\"contig-zones\", \"auto-movable\") " + "Default: \"contig-zones\""); + +/* + * memory_hotplug.auto_movable_ratio: specify maximum MOVABLE:KERNEL ratio + * + * The ratio represent an upper limit and the kernel might decide to not + * online some memory to ZONE_MOVABLE -- e.g., because hotplugged KERNEL memory + * doesn't allow for more MOVABLE memory. + */ +static unsigned int auto_movable_ratio __read_mostly = 301; +module_param(auto_movable_ratio, uint, 0644); +MODULE_PARM_DESC(auto_movable_ratio, + "Set the maximum ratio of MOVABLE:KERNEL memory in the system " + "in percent for \"auto-movable\" online policy. Default: 301"); + +/* + * memory_hotplug.auto_movable_numa_aware: consider numa node stats + */ +#ifdef CONFIG_NUMA +static bool auto_movable_numa_aware __read_mostly = true; +module_param(auto_movable_numa_aware, bool, 0644); +MODULE_PARM_DESC(auto_movable_numa_aware, + "Consider numa node stats in addition to global stats in " + "\"auto-movable\" online policy. Default: true"); +#endif /* CONFIG_NUMA */ + /* * online_page_callback contains pointer to current page onlining function. * Initially it is generic_online_page(). If it is required it could be @@ -663,6 +730,61 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, set_zone_contiguous(zone); } +struct auto_movable_stats { + unsigned long kernel_early_pages; + unsigned long movable_pages; +}; + +static void auto_movable_stats_account_zone(struct auto_movable_stats *stats, + struct zone *zone) +{ + if (zone_idx(zone) == ZONE_MOVABLE) { + stats->movable_pages += zone->present_pages; + } else { + stats->kernel_early_pages += zone->present_early_pages; +#ifdef CONFIG_CMA + /* + * CMA pages (never on hotplugged memory) behave like + * ZONE_MOVABLE. + */ + stats->movable_pages += zone->cma_pages; + stats->kernel_early_pages -= zone->cma_pages; +#endif /* CONFIG_CMA */ + } +} + +static bool auto_movable_can_online_movable(int nid, unsigned long nr_pages) +{ + struct auto_movable_stats stats = {}; + unsigned long kernel_early_pages, movable_pages; + pg_data_t *pgdat = NODE_DATA(nid); + struct zone *zone; + int i; + + /* Walk all relevant zones and collect MOVABLE vs. KERNEL stats. */ + if (nid == NUMA_NO_NODE) { + /* TODO: cache values */ + for_each_populated_zone(zone) + auto_movable_stats_account_zone(&stats, zone); + } else { + for (i = 0; i < MAX_NR_ZONES; i++) { + zone = pgdat->node_zones + i; + if (populated_zone(zone)) + auto_movable_stats_account_zone(&stats, zone); + } + } + + kernel_early_pages = stats.kernel_early_pages; + movable_pages = stats.movable_pages; + + /* + * Test if we could online the given number of pages to ZONE_MOVABLE + * and still stay in the configured ratio. + */ + movable_pages += nr_pages; + return movable_pages <= (auto_movable_ratio * kernel_early_pages) / 100; +} + /* * Returns a default kernel memory zone for the given pfn range. * If no kernel zone covers this pfn range it will automatically go @@ -684,6 +806,72 @@ static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn return &pgdat->node_zones[ZONE_NORMAL]; } +/* + * Determine to which zone to online memory dynamically based on user + * configuration and system stats. We care about the following ratio: + * + * MOVABLE : KERNEL + * + * Whereby MOVABLE is memory in ZONE_MOVABLE and KERNEL is memory in + * one of the kernel zones. CMA pages inside one of the kernel zones really + * behaves like ZONE_MOVABLE, so we treat them accordingly. + * + * We don't allow for hotplugged memory in a KERNEL zone to increase the + * amount of MOVABLE memory we can have, so we end up with: + * + * MOVABLE : KERNEL_EARLY + * + * Whereby KERNEL_EARLY is memory in one of the kernel zones, available sinze + * boot. We base our calculation on KERNEL_EARLY internally, because: + * + * a) Hotplugged memory in one of the kernel zones can sometimes still get + * hotunplugged, especially when hot(un)plugging individual memory blocks. + * There is no coordination across memory devices, therefore "automatic" + * hotunplugging, as implemented in hypervisors, could result in zone + * imbalances. + * b) Early/boot memory in one of the kernel zones can usually not get + * hotunplugged again (e.g., no firmware interface to unplug, fragmented + * with unmovable allocations). While there are corner cases where it might + * still work, it is barely relevant in practice. + * + * We rely on "present pages" instead of "managed pages", as the latter is + * highly unreliable and dynamic in virtualized environments, and does not + * consider boot time allocations. For example, memory ballooning adjusts the + * managed pages when inflating/deflating the balloon, and balloon compaction + * can even migrate inflated pages between zones. + * + * Using "present pages" is better but some things to keep in mind are: + * + * a) Some memblock allocations, such as for the crashkernel area, are + * effectively unused by the kernel, yet they account to "present pages". + * Fortunately, these allocations are comparatively small in relevant setups + * (e.g., fraction of system memory). + * b) Some hotplugged memory blocks in virtualized environments, esecially + * hotplugged by virtio-mem, look like they are completely present, however, + * only parts of the memory block are actually currently usable. + * "present pages" is an upper limit that can get reached at runtime. As + * we base our calculations on KERNEL_EARLY, this is not an issue. + */ +static struct zone *auto_movable_zone_for_pfn(int nid, unsigned long pfn, + unsigned long nr_pages) +{ + if (!auto_movable_ratio) + goto kernel_zone; + + if (!auto_movable_can_online_movable(NUMA_NO_NODE, nr_pages)) + goto kernel_zone; + +#ifdef CONFIG_NUMA + if (auto_movable_numa_aware && + !auto_movable_can_online_movable(nid, nr_pages)) + goto kernel_zone; +#endif /* CONFIG_NUMA */ + + return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE]; +kernel_zone: + return default_kernel_zone_for_pfn(nid, pfn, nr_pages); +} + static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn, unsigned long nr_pages) { @@ -717,6 +905,9 @@ struct zone *zone_for_pfn_range(int online_type, int nid, if (online_type == MMOP_ONLINE_MOVABLE) return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE]; + if (online_policy == ONLINE_POLICY_AUTO_MOVABLE) + return auto_movable_zone_for_pfn(nid, start_pfn, nr_pages); + return default_zone_for_pfn(nid, start_pfn, nr_pages); } From 042bfa82903718c13191a0fb065845afe791fbc0 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 24 Aug 2021 09:59:42 +1000 Subject: [PATCH 250/365] drivers/base/memory: introduce "memory groups" to logically group memory blocks In our "auto-movable" memory onlining policy, we want to make decisions across memory blocks of a single memory device. Examples of memory devices include ACPI memory devices (in the simplest case a single DIMM) and virtio-mem. For now, we don't have a connection between a single memory block device and the real memory device. Each memory device consists of 1..X memory block devices. Let's logically group memory blocks belonging to the same memory device in "memory groups". Memory groups can span multiple physical ranges and a memory group itself does not contain any information regarding physical ranges, only properties (e.g., "max_pages") necessary for improved memory onlining. Introduce two memory group types: 1) Static memory group: E.g., a single ACPI memory device, consisting of 1..X memory resources. A memory group consists of 1..Y memory blocks. The whole group is added/removed in one go. If any part cannot get offlined, the whole group cannot be removed. 2) Dynamic memory group: E.g., a single virtio-mem device. Memory is dynamically added/removed in a fixed granularity, called a "unit", consisting of 1..X memory blocks. A unit is added/removed in one go. If any part of a unit cannot get offlined, the whole unit cannot be removed. In case of 1) we usually want either all memory managed by ZONE_MOVABLE or none. In case of 2) we usually want to have as many units as possible managed by ZONE_MOVABLE. We want a single unit to be of the same type. For now, memory groups are an internal concept that is not exposed to user space; we might want to change that in the future, though. add_memory() users can specify a mgid instead of a nid when passing the MHP_NID_IS_MGID flag. Link: https://lkml.kernel.org/r/20210806124715.17090-4-david@redhat.com Signed-off-by: David Hildenbrand Cc: Anshuman Khandual Cc: Dan Williams Cc: Dave Hansen Cc: Greg Kroah-Hartman Cc: Hui Zhu Cc: Jason Wang Cc: Len Brown Cc: Marek Kedzierski Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Pankaj Gupta Cc: Pavel Tatashin Cc: Rafael J. Wysocki Cc: "Rafael J. Wysocki" Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- drivers/base/memory.c | 159 ++++++++++++++++++++++++++++++++- include/linux/memory.h | 46 +++++++++- include/linux/memory_hotplug.h | 5 ++ mm/memory_hotplug.c | 11 ++- 4 files changed, 215 insertions(+), 6 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index b8a596a5f07c5..5b9ef29e23aac 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -82,6 +82,11 @@ static struct bus_type memory_subsys = { */ static DEFINE_XARRAY(memory_blocks); +/* + * Memory groups, indexed by memory group id (mgid). + */ +static DEFINE_XARRAY_FLAGS(memory_groups, XA_FLAGS_ALLOC); + static BLOCKING_NOTIFIER_HEAD(memory_chain); int register_memory_notifier(struct notifier_block *nb) @@ -634,7 +639,8 @@ int register_memory(struct memory_block *memory) } static int init_memory_block(unsigned long block_id, unsigned long state, - unsigned long nr_vmemmap_pages) + unsigned long nr_vmemmap_pages, + struct memory_group *group) { struct memory_block *mem; int ret = 0; @@ -652,6 +658,12 @@ static int init_memory_block(unsigned long block_id, unsigned long state, mem->state = state; mem->nid = NUMA_NO_NODE; mem->nr_vmemmap_pages = nr_vmemmap_pages; + INIT_LIST_HEAD(&mem->group_next); + + if (group) { + mem->group = group; + list_add(&mem->group_next, &group->memory_blocks); + } ret = register_memory(mem); @@ -671,7 +683,7 @@ static int add_memory_block(unsigned long base_section_nr) if (section_count == 0) return 0; return init_memory_block(memory_block_id(base_section_nr), - MEM_ONLINE, 0); + MEM_ONLINE, 0, NULL); } static void unregister_memory(struct memory_block *memory) @@ -681,6 +693,11 @@ static void unregister_memory(struct memory_block *memory) WARN_ON(xa_erase(&memory_blocks, memory->dev.id) == NULL); + if (memory->group) { + list_del(&memory->group_next); + memory->group = NULL; + } + /* drop the ref. we got via find_memory_block() */ put_device(&memory->dev); device_unregister(&memory->dev); @@ -694,7 +711,8 @@ static void unregister_memory(struct memory_block *memory) * Called under device_hotplug_lock. */ int create_memory_block_devices(unsigned long start, unsigned long size, - unsigned long vmemmap_pages) + unsigned long vmemmap_pages, + struct memory_group *group) { const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start)); unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size)); @@ -707,7 +725,8 @@ int create_memory_block_devices(unsigned long start, unsigned long size, return -EINVAL; for (block_id = start_block_id; block_id != end_block_id; block_id++) { - ret = init_memory_block(block_id, MEM_OFFLINE, vmemmap_pages); + ret = init_memory_block(block_id, MEM_OFFLINE, vmemmap_pages, + group); if (ret) break; } @@ -891,3 +910,135 @@ int for_each_memory_block(void *arg, walk_memory_blocks_func_t func) return bus_for_each_dev(&memory_subsys, NULL, &cb_data, for_each_memory_block_cb); } + +/* + * This is an internal helper to unify allocation and initialization of + * memory groups. Note that the passed memory group will be copied to a + * dynamically allocated memory group. After this call, the passed + * memory group should no longer be used. + */ +static int memory_group_register(struct memory_group group) +{ + struct memory_group *new_group; + uint32_t mgid; + int ret; + + if (!node_possible(group.nid)) + return -EINVAL; + + new_group = kzalloc(sizeof(group), GFP_KERNEL); + if (!new_group) + return -ENOMEM; + *new_group = group; + INIT_LIST_HEAD(&new_group->memory_blocks); + + ret = xa_alloc(&memory_groups, &mgid, new_group, xa_limit_31b, + GFP_KERNEL); + if (ret) { + kfree(new_group); + return ret; + } + return mgid; +} + +/** + * memory_group_register_static() - Register a static memory group. + * @nid: The node id. + * @max_pages: The maximum number of pages we'll have in this static memory + * group. + * + * Register a new static memory group and return the memory group id. + * All memory in the group belongs to a single unit, such as a DIMM. All + * memory belonging to a static memory group is added in one go to be removed + * in one go -- it's static. + * + * Returns an error if out of memory, if the node id is invalid, if no new + * memory groups can be registered, or if max_pages is invalid (0). Otherwise, + * returns the new memory group id. + */ +int memory_group_register_static(int nid, unsigned long max_pages) +{ + struct memory_group group = { + .nid = nid, + .s = { + .max_pages = max_pages, + }, + }; + + if (!max_pages) + return -EINVAL; + return memory_group_register(group); +} +EXPORT_SYMBOL_GPL(memory_group_register_static); + +/** + * memory_group_register_dynamic() - Register a dynamic memory group. + * @nid: The node id. + * @unit_pages: Unit in pages in which is memory added/removed in this dynamic + * memory group. + * + * Register a new dynamic memory group and return the memory group id. + * Memory within a dynamic memory group is added/removed dynamically + * in unit_pages. + * + * Returns an error if out of memory, if the node id is invalid, if no new + * memory groups can be registered, or if unit_pages is invalid (0, not a + * power of two, smaller than a single memory block). Otherwise, returns the + * new memory group id. + */ +int memory_group_register_dynamic(int nid, unsigned long unit_pages) +{ + struct memory_group group = { + .nid = nid, + .is_dynamic = true, + .d = { + .unit_pages = unit_pages, + }, + }; + + if (!unit_pages || !is_power_of_2(unit_pages) || + unit_pages < PHYS_PFN(memory_block_size_bytes())) + return -EINVAL; + return memory_group_register(group); +} +EXPORT_SYMBOL_GPL(memory_group_register_dynamic); + +/** + * memory_group_unregister() - Unregister a memory group. + * @mgid: the memory group id + * + * Unregister a memory group. If any memory block still belongs to this + * memory group, unregistering will fail. + * + * Returns -EINVAL if the memory group id is invalid, returns -EBUSY if some + * memory blocks still belong to this memory group and returns 0 if + * unregistering succeeded. + */ +int memory_group_unregister(int mgid) +{ + struct memory_group *group; + + if (mgid < 0) + return -EINVAL; + + group = xa_load(&memory_groups, mgid); + if (!group) + return -EINVAL; + if (!list_empty(&group->memory_blocks)) + return -EBUSY; + xa_erase(&memory_groups, mgid); + kfree(group); + return 0; +} +EXPORT_SYMBOL_GPL(memory_group_unregister); + +/* + * This is an internal helper only to be used in core memory hotplug code to + * lookup a memory group. We don't care about locking, as we don't expect a + * memory group to get unregistered while adding memory to it -- because + * the group and the memory is managed by the same driver. + */ +struct memory_group *memory_group_find_by_id(int mgid) +{ + return xa_load(&memory_groups, mgid); +} diff --git a/include/linux/memory.h b/include/linux/memory.h index d9a0b61cd4329..d3581e2e3fbd4 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -23,6 +23,42 @@ #define MIN_MEMORY_BLOCK_SIZE (1UL << SECTION_SIZE_BITS) +/** + * struct memory_group - a logical group of memory blocks + * @nid: The node id for all memory blocks inside the memory group. + * @blocks: List of all memory blocks belonging to this memory group. + * @is_dynamic: The memory group type: static vs. dynamic + * @s.max_pages: Valid with &memory_group.is_dynamic == false. The maximum + * number of pages we'll have in this static memory group. + * @d.unit_pages: Valid with &memory_group.is_dynamic == true. Unit in pages + * in which memory is added/removed in this dynamic memory group. + * This granularity defines the alignment of a unit in physical + * address space; it has to be at least as big as a single + * memory block. + * + * A memory group logically groups memory blocks; each memory block + * belongs to at most one memory group. A memory group corresponds to + * a memory device, such as a DIMM or a NUMA node, which spans multiple + * memory blocks and might even span multiple non-contiguous physical memory + * ranges. + * + * Modification of members after registration is serialized by memory + * hot(un)plug code. + */ +struct memory_group { + int nid; + struct list_head memory_blocks; + bool is_dynamic; + union { + struct { + unsigned long max_pages; + } s; + struct { + unsigned long unit_pages; + } d; + }; +}; + struct memory_block { unsigned long start_section_nr; unsigned long state; /* serialized by the dev->lock */ @@ -34,6 +70,8 @@ struct memory_block { * lay at the beginning of the memory block. */ unsigned long nr_vmemmap_pages; + struct memory_group *group; /* group (if any) for this block */ + struct list_head group_next; /* next block inside memory group */ }; int arch_get_memory_phys_device(unsigned long start_pfn); @@ -86,7 +124,8 @@ static inline int memory_notify(unsigned long val, void *v) extern int register_memory_notifier(struct notifier_block *nb); extern void unregister_memory_notifier(struct notifier_block *nb); int create_memory_block_devices(unsigned long start, unsigned long size, - unsigned long vmemmap_pages); + unsigned long vmemmap_pages, + struct memory_group *group); void remove_memory_block_devices(unsigned long start, unsigned long size); extern void memory_dev_init(void); extern int memory_notify(unsigned long val, void *v); @@ -96,6 +135,11 @@ extern int walk_memory_blocks(unsigned long start, unsigned long size, void *arg, walk_memory_blocks_func_t func); extern int for_each_memory_block(void *arg, walk_memory_blocks_func_t func); #define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION<nid; + } + if (!node_possible(nid)) { WARN(1, "node %d was absent from the node_possible_map\n", nid); return -EINVAL; @@ -1303,7 +1311,8 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) goto error; /* create memory block devices after memory was added */ - ret = create_memory_block_devices(start, size, mhp_altmap.alloc); + ret = create_memory_block_devices(start, size, mhp_altmap.alloc, + group); if (ret) { arch_remove_memory(start, size, NULL); goto error; From 62e87f2086b027716b05ecc66f33b5c19983690d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 24 Aug 2021 09:59:42 +1000 Subject: [PATCH 251/365] mm/memory_hotplug: track present pages in memory groups Let's track all present pages in each memory group. Especially, track memory present in ZONE_MOVABLE and memory present in one of the kernel zones (which really only is ZONE_NORMAL right now as memory groups only apply to hotplugged memory) separately within a memory group, to prepare for making smart auto-online decision for individual memory blocks within a memory group based on group statistics. Link: https://lkml.kernel.org/r/20210806124715.17090-5-david@redhat.com Signed-off-by: David Hildenbrand Cc: Anshuman Khandual Cc: Dan Williams Cc: Dave Hansen Cc: Greg Kroah-Hartman Cc: Hui Zhu Cc: Jason Wang Cc: Len Brown Cc: Marek Kedzierski Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Pankaj Gupta Cc: Pavel Tatashin Cc: Rafael J. Wysocki Cc: "Rafael J. Wysocki" Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- drivers/base/memory.c | 10 +++++----- include/linux/memory.h | 6 ++++++ include/linux/memory_hotplug.h | 13 +++++++++---- mm/memory_hotplug.c | 19 ++++++++++++++----- 4 files changed, 34 insertions(+), 14 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 5b9ef29e23aac..d38a7164f93e4 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -198,7 +198,7 @@ static int memory_block_online(struct memory_block *mem) } ret = online_pages(start_pfn + nr_vmemmap_pages, - nr_pages - nr_vmemmap_pages, zone); + nr_pages - nr_vmemmap_pages, zone, mem->group); if (ret) { if (nr_vmemmap_pages) mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages); @@ -210,7 +210,7 @@ static int memory_block_online(struct memory_block *mem) * now already properly populated. */ if (nr_vmemmap_pages) - adjust_present_page_count(pfn_to_page(start_pfn), + adjust_present_page_count(pfn_to_page(start_pfn), mem->group, nr_vmemmap_pages); return ret; @@ -228,16 +228,16 @@ static int memory_block_offline(struct memory_block *mem) * can properly be torn down in offline_pages(). */ if (nr_vmemmap_pages) - adjust_present_page_count(pfn_to_page(start_pfn), + adjust_present_page_count(pfn_to_page(start_pfn), mem->group, -nr_vmemmap_pages); ret = offline_pages(start_pfn + nr_vmemmap_pages, - nr_pages - nr_vmemmap_pages); + nr_pages - nr_vmemmap_pages, mem->group); if (ret) { /* offline_pages() failed. Account back. */ if (nr_vmemmap_pages) adjust_present_page_count(pfn_to_page(start_pfn), - nr_vmemmap_pages); + mem->group, nr_vmemmap_pages); return ret; } diff --git a/include/linux/memory.h b/include/linux/memory.h index d3581e2e3fbd4..1e9c7d01d4f2d 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -27,6 +27,10 @@ * struct memory_group - a logical group of memory blocks * @nid: The node id for all memory blocks inside the memory group. * @blocks: List of all memory blocks belonging to this memory group. + * @present_kernel_pages: Present (online) memory outside ZONE_MOVABLE of this + * memory group. + * @present_movable_pages: Present (online) memory in ZONE_MOVABLE of this + * memory group. * @is_dynamic: The memory group type: static vs. dynamic * @s.max_pages: Valid with &memory_group.is_dynamic == false. The maximum * number of pages we'll have in this static memory group. @@ -48,6 +52,8 @@ struct memory_group { int nid; struct list_head memory_blocks; + unsigned long present_kernel_pages; + unsigned long present_movable_pages; bool is_dynamic; union { struct { diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 5d341978b4bc2..cf3f423c8a740 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -12,6 +12,7 @@ struct zone; struct pglist_data; struct mem_section; struct memory_block; +struct memory_group; struct resource; struct vmem_altmap; @@ -100,13 +101,15 @@ static inline void zone_seqlock_init(struct zone *zone) extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages); extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages); extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); -extern void adjust_present_page_count(struct page *page, long nr_pages); +extern void adjust_present_page_count(struct page *page, + struct memory_group *group, + long nr_pages); /* VM interface that may be used by firmware interface */ extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, struct zone *zone); extern void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages); extern int online_pages(unsigned long pfn, unsigned long nr_pages, - struct zone *zone); + struct zone *zone, struct memory_group *group); extern struct zone *test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn); extern void __offline_isolated_pages(unsigned long start_pfn, @@ -296,7 +299,8 @@ static inline void pgdat_resize_init(struct pglist_data *pgdat) {} #ifdef CONFIG_MEMORY_HOTREMOVE extern void try_offline_node(int nid); -extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); +extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages, + struct memory_group *group); extern int remove_memory(u64 start, u64 size); extern void __remove_memory(u64 start, u64 size); extern int offline_and_remove_memory(u64 start, u64 size); @@ -304,7 +308,8 @@ extern int offline_and_remove_memory(u64 start, u64 size); #else static inline void try_offline_node(int nid) {} -static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages) +static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages, + struct memory_group *group) { return -EINVAL; } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6c4010b3648b6..933a4eba0ed16 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -915,9 +915,11 @@ struct zone *zone_for_pfn_range(int online_type, int nid, * This function should only be called by memory_block_{online,offline}, * and {online,offline}_pages. */ -void adjust_present_page_count(struct page *page, long nr_pages) +void adjust_present_page_count(struct page *page, struct memory_group *group, + long nr_pages) { struct zone *zone = page_zone(page); + const bool movable = zone_idx(zone) == ZONE_MOVABLE; /* * We only support onlining/offlining/adding/removing of complete @@ -927,6 +929,11 @@ void adjust_present_page_count(struct page *page, long nr_pages) zone->present_early_pages += nr_pages; zone->present_pages += nr_pages; zone->zone_pgdat->node_present_pages += nr_pages; + + if (group && movable) + group->present_movable_pages += nr_pages; + else if (group && !movable) + group->present_kernel_pages += nr_pages; } int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, @@ -972,7 +979,8 @@ void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages) kasan_remove_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages)); } -int __ref online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *zone) +int __ref online_pages(unsigned long pfn, unsigned long nr_pages, + struct zone *zone, struct memory_group *group) { unsigned long flags; int need_zonelists_rebuild = 0; @@ -1025,7 +1033,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *z } online_pages_range(pfn, nr_pages); - adjust_present_page_count(pfn_to_page(pfn), nr_pages); + adjust_present_page_count(pfn_to_page(pfn), group, nr_pages); node_states_set_node(nid, &arg); if (need_zonelists_rebuild) @@ -1769,7 +1777,8 @@ static int count_system_ram_pages_cb(unsigned long start_pfn, return 0; } -int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) +int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, + struct memory_group *group) { const unsigned long end_pfn = start_pfn + nr_pages; unsigned long pfn, system_ram_pages = 0; @@ -1905,7 +1914,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) /* removal success */ adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages); - adjust_present_page_count(pfn_to_page(start_pfn), -nr_pages); + adjust_present_page_count(pfn_to_page(start_pfn), group, -nr_pages); /* reinitialise watermarks and update pcp limits */ init_per_zone_wmark_min(); From a5eb9bf4563a186c43b69a151bf3fa0d32791b06 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 24 Aug 2021 09:59:42 +1000 Subject: [PATCH 252/365] ACPI: memhotplug: use a single static memory group for a single memory device Let's group all memory we add for a single memory device - we want a single node for that (which also seems to be the sane thing to do). We won't care for now about memory that was already added to the system (e.g., via e820) -- usually *all* memory of a memory device was already added and we'll fail acpi_memory_enable_device(). Link: https://lkml.kernel.org/r/20210806124715.17090-6-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Rafael J. Wysocki Cc: Anshuman Khandual Cc: Dan Williams Cc: Dave Hansen Cc: Greg Kroah-Hartman Cc: Hui Zhu Cc: Jason Wang Cc: Len Brown Cc: Marek Kedzierski Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Pankaj Gupta Cc: Pavel Tatashin Cc: "Rafael J. Wysocki" Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- drivers/acpi/acpi_memhotplug.c | 35 +++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c index eb4faf7c5cad0..24f662d8bd39a 100644 --- a/drivers/acpi/acpi_memhotplug.c +++ b/drivers/acpi/acpi_memhotplug.c @@ -54,6 +54,7 @@ struct acpi_memory_info { struct acpi_memory_device { struct acpi_device *device; struct list_head res_list; + int mgid; }; static acpi_status @@ -169,12 +170,33 @@ static void acpi_unbind_memory_blocks(struct acpi_memory_info *info) static int acpi_memory_enable_device(struct acpi_memory_device *mem_device) { acpi_handle handle = mem_device->device->handle; + mhp_t mhp_flags = MHP_NID_IS_MGID; int result, num_enabled = 0; struct acpi_memory_info *info; - mhp_t mhp_flags = MHP_NONE; - int node; + u64 total_length = 0; + int node, mgid; node = acpi_get_node(handle); + + list_for_each_entry(info, &mem_device->res_list, list) { + if (!info->length) + continue; + /* We want a single node for the whole memory group */ + if (node < 0) + node = memory_add_physaddr_to_nid(info->start_addr); + total_length += info->length; + } + + if (!total_length) { + dev_err(&mem_device->device->dev, "device is empty\n"); + return -EINVAL; + } + + mgid = memory_group_register_static(node, PFN_UP(total_length)); + if (mgid < 0) + return mgid; + mem_device->mgid = mgid; + /* * Tell the VM there is more memory here... * Note: Assume that this function returns zero on success @@ -188,12 +210,10 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device) */ if (!info->length) continue; - if (node < 0) - node = memory_add_physaddr_to_nid(info->start_addr); if (mhp_supports_memmap_on_memory(info->length)) mhp_flags |= MHP_MEMMAP_ON_MEMORY; - result = __add_memory(node, info->start_addr, info->length, + result = __add_memory(mgid, info->start_addr, info->length, mhp_flags); /* @@ -253,6 +273,10 @@ static void acpi_memory_device_free(struct acpi_memory_device *mem_device) if (!mem_device) return; + /* In case we succeeded adding *some* memory, unregistering fails. */ + if (mem_device->mgid >= 0) + memory_group_unregister(mem_device->mgid); + acpi_memory_free_device_resources(mem_device); mem_device->device->driver_data = NULL; kfree(mem_device); @@ -273,6 +297,7 @@ static int acpi_memory_device_add(struct acpi_device *device, INIT_LIST_HEAD(&mem_device->res_list); mem_device->device = device; + mem_device->mgid = -1; sprintf(acpi_device_name(device), "%s", ACPI_MEMORY_DEVICE_NAME); sprintf(acpi_device_class(device), "%s", ACPI_MEMORY_DEVICE_CLASS); device->driver_data = mem_device; From 4bf2a8bcfdf7854ad64cfa3cb380b03850479703 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 24 Aug 2021 09:59:42 +1000 Subject: [PATCH 253/365] dax/kmem: use a single static memory group for a single probed unit Although dax/kmem users often disable auto-onlining and instead online memory manually (usually to ZONE_MOVABLE), there is still value in having auto-onlining be aware of the relationship of memory blocks. Let's treat one probed unit as a single static memory device, similar to a single ACPI memory device. Link: https://lkml.kernel.org/r/20210806124715.17090-7-david@redhat.com Signed-off-by: David Hildenbrand Cc: Anshuman Khandual Cc: Dan Williams Cc: Dave Hansen Cc: Greg Kroah-Hartman Cc: Hui Zhu Cc: Jason Wang Cc: Len Brown Cc: Marek Kedzierski Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Pankaj Gupta Cc: Pavel Tatashin Cc: Rafael J. Wysocki Cc: "Rafael J. Wysocki" Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- drivers/dax/kmem.c | 40 ++++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c index 99e0f60c4c266..a37622060fffa 100644 --- a/drivers/dax/kmem.c +++ b/drivers/dax/kmem.c @@ -37,15 +37,16 @@ static int dax_kmem_range(struct dev_dax *dev_dax, int i, struct range *r) struct dax_kmem_data { const char *res_name; + int mgid; struct resource *res[]; }; static int dev_dax_kmem_probe(struct dev_dax *dev_dax) { struct device *dev = &dev_dax->dev; + unsigned long total_len = 0; struct dax_kmem_data *data; - int rc = -ENOMEM; - int i, mapped = 0; + int i, rc, mapped = 0; int numa_node; /* @@ -61,24 +62,44 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax) return -EINVAL; } + for (i = 0; i < dev_dax->nr_range; i++) { + struct range range; + + rc = dax_kmem_range(dev_dax, i, &range); + if (rc) { + dev_info(dev, "mapping%d: %#llx-%#llx too small after alignment\n", + i, range.start, range.end); + continue; + } + total_len += range_len(&range); + } + + if (!total_len) { + dev_warn(dev, "rejecting DAX region without any memory after alignment\n"); + return -EINVAL; + } + data = kzalloc(struct_size(data, res, dev_dax->nr_range), GFP_KERNEL); if (!data) return -ENOMEM; + rc = -ENOMEM; data->res_name = kstrdup(dev_name(dev), GFP_KERNEL); if (!data->res_name) goto err_res_name; + rc = memory_group_register_static(numa_node, total_len); + if (rc < 0) + goto err_reg_mgid; + data->mgid = rc; + for (i = 0; i < dev_dax->nr_range; i++) { struct resource *res; struct range range; rc = dax_kmem_range(dev_dax, i, &range); - if (rc) { - dev_info(dev, "mapping%d: %#llx-%#llx too small after alignment\n", - i, range.start, range.end); + if (rc) continue; - } /* Region is permanently reserved if hotremove fails. */ res = request_mem_region(range.start, range_len(&range), data->res_name); @@ -108,8 +129,8 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax) * Ensure that future kexec'd kernels will not treat * this as RAM automatically. */ - rc = add_memory_driver_managed(numa_node, range.start, - range_len(&range), kmem_name, MHP_NONE); + rc = add_memory_driver_managed(data->mgid, range.start, + range_len(&range), kmem_name, MHP_NID_IS_MGID); if (rc) { dev_warn(dev, "mapping%d: %#llx-%#llx memory add failed\n", @@ -129,6 +150,8 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax) return 0; err_request_mem: + memory_group_unregister(data->mgid); +err_reg_mgid: kfree(data->res_name); err_res_name: kfree(data); @@ -171,6 +194,7 @@ static void dev_dax_kmem_remove(struct dev_dax *dev_dax) } if (success >= dev_dax->nr_range) { + memory_group_unregister(data->mgid); kfree(data->res_name); kfree(data); dev_set_drvdata(dev, NULL); From 07da67814addb9d40ee535257d461f3792ba786f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 24 Aug 2021 09:59:42 +1000 Subject: [PATCH 254/365] virtio-mem: use a single dynamic memory group for a single virtio-mem device Let's use a single dynamic memory group. Link: https://lkml.kernel.org/r/20210806124715.17090-8-david@redhat.com Signed-off-by: David Hildenbrand Cc: Anshuman Khandual Cc: Dan Williams Cc: Dave Hansen Cc: Greg Kroah-Hartman Cc: Hui Zhu Cc: Jason Wang Cc: Len Brown Cc: Marek Kedzierski Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Pankaj Gupta Cc: Pavel Tatashin Cc: Rafael J. Wysocki Cc: "Rafael J. Wysocki" Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- drivers/virtio/virtio_mem.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index 774986695dc48..8945f248ce69e 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -143,6 +143,8 @@ struct virtio_mem { * add_memory_driver_managed(). */ const char *resource_name; + /* Memory group identification. */ + int mgid; /* * We don't want to add too much memory if it's not getting onlined, @@ -626,8 +628,8 @@ static int virtio_mem_add_memory(struct virtio_mem *vm, uint64_t addr, addr + size - 1); /* Memory might get onlined immediately. */ atomic64_add(size, &vm->offline_size); - rc = add_memory_driver_managed(vm->nid, addr, size, vm->resource_name, - MHP_MERGE_RESOURCE); + rc = add_memory_driver_managed(vm->mgid, addr, size, vm->resource_name, + MHP_MERGE_RESOURCE | MHP_NID_IS_MGID); if (rc) { atomic64_sub(size, &vm->offline_size); dev_warn(&vm->vdev->dev, "adding memory failed: %d\n", rc); @@ -2562,6 +2564,7 @@ static bool virtio_mem_has_memory_added(struct virtio_mem *vm) static int virtio_mem_probe(struct virtio_device *vdev) { struct virtio_mem *vm; + uint64_t unit_pages; int rc; BUILD_BUG_ON(sizeof(struct virtio_mem_req) != 24); @@ -2596,6 +2599,16 @@ static int virtio_mem_probe(struct virtio_device *vdev) if (rc) goto out_del_vq; + /* use a single dynamic memory group to cover the whole memory device */ + if (vm->in_sbm) + unit_pages = PHYS_PFN(memory_block_size_bytes()); + else + unit_pages = PHYS_PFN(vm->bbm.bb_size); + rc = memory_group_register_dynamic(vm->nid, unit_pages); + if (rc < 0) + goto out_del_resource; + vm->mgid = rc; + /* * If we still have memory plugged, we have to unplug all memory first. * Registering our parent resource makes sure that this memory isn't @@ -2610,7 +2623,7 @@ static int virtio_mem_probe(struct virtio_device *vdev) vm->memory_notifier.notifier_call = virtio_mem_memory_notifier_cb; rc = register_memory_notifier(&vm->memory_notifier); if (rc) - goto out_del_resource; + goto out_unreg_group; rc = register_virtio_mem_device(vm); if (rc) goto out_unreg_mem; @@ -2624,6 +2637,8 @@ static int virtio_mem_probe(struct virtio_device *vdev) return 0; out_unreg_mem: unregister_memory_notifier(&vm->memory_notifier); +out_unreg_group: + memory_group_unregister(vm->mgid); out_del_resource: virtio_mem_delete_resource(vm); out_del_vq: @@ -2688,6 +2703,7 @@ static void virtio_mem_remove(struct virtio_device *vdev) } else { virtio_mem_delete_resource(vm); kfree_const(vm->resource_name); + memory_group_unregister(vm->mgid); } /* remove all tracking data - no locking needed */ From 345cfbed28d7be3aa351d1ef2bbbf72c5eedb54a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 24 Aug 2021 09:59:43 +1000 Subject: [PATCH 255/365] mm/memory_hotplug: memory group aware "auto-movable" online policy Use memory groups to improve our "auto-movable" onlining policy: 1. For static memory groups (e.g., a DIMM), online a memory block MOVABLE only if all other memory blocks in the group are either MOVABLE or could be onlined MOVABLE. A DIMM will either be MOVABLE or not, not a mixture. 2. For dynamic memory groups (e.g., a virtio-mem device), online a memory block MOVABLE only if all other memory blocks inside the current unit are either MOVABLE or could be onlined MOVABLE. For a virtio-mem device with a device block size with 512 MiB, all 128 MiB memory blocks wihin a 512 MiB unit will either be MOVABLE or not, not a mixture. We have to pass the memory group to zone_for_pfn_range() to take the memory group into account. Note: for now, there seems to be no compelling reason to make this behavior configurable. Link: https://lkml.kernel.org/r/20210806124715.17090-9-david@redhat.com Signed-off-by: David Hildenbrand Cc: Anshuman Khandual Cc: Dan Williams Cc: Dave Hansen Cc: Greg Kroah-Hartman Cc: Hui Zhu Cc: Jason Wang Cc: Len Brown Cc: Marek Kedzierski Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Pankaj Gupta Cc: Pavel Tatashin Cc: Rafael J. Wysocki Cc: "Rafael J. Wysocki" Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- drivers/base/memory.c | 18 +++++++------ include/linux/memory_hotplug.h | 3 ++- mm/memory_hotplug.c | 48 +++++++++++++++++++++++++++++++--- 3 files changed, 57 insertions(+), 12 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index d38a7164f93e4..50dcd8ecc59b1 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -182,7 +182,8 @@ static int memory_block_online(struct memory_block *mem) struct zone *zone; int ret; - zone = zone_for_pfn_range(mem->online_type, mem->nid, start_pfn, nr_pages); + zone = zone_for_pfn_range(mem->online_type, mem->nid, mem->group, + start_pfn, nr_pages); /* * Although vmemmap pages have a different lifecycle than the pages @@ -379,12 +380,13 @@ static ssize_t phys_device_show(struct device *dev, #ifdef CONFIG_MEMORY_HOTREMOVE static int print_allowed_zone(char *buf, int len, int nid, + struct memory_group *group, unsigned long start_pfn, unsigned long nr_pages, int online_type, struct zone *default_zone) { struct zone *zone; - zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages); + zone = zone_for_pfn_range(online_type, nid, group, start_pfn, nr_pages); if (zone == default_zone) return 0; @@ -397,9 +399,10 @@ static ssize_t valid_zones_show(struct device *dev, struct memory_block *mem = to_memory_block(dev); unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; + struct memory_group *group = mem->group; struct zone *default_zone; + int nid = mem->nid; int len = 0; - int nid; /* * Check the existing zone. Make sure that we do that only on the @@ -418,14 +421,13 @@ static ssize_t valid_zones_show(struct device *dev, goto out; } - nid = mem->nid; - default_zone = zone_for_pfn_range(MMOP_ONLINE, nid, start_pfn, - nr_pages); + default_zone = zone_for_pfn_range(MMOP_ONLINE, nid, group, + start_pfn, nr_pages); len += sysfs_emit_at(buf, len, "%s", default_zone->name); - len += print_allowed_zone(buf, len, nid, start_pfn, nr_pages, + len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages, MMOP_ONLINE_KERNEL, default_zone); - len += print_allowed_zone(buf, len, nid, start_pfn, nr_pages, + len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages, MMOP_ONLINE_MOVABLE, default_zone); out: len += sysfs_emit_at(buf, len, "\n"); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index cf3f423c8a740..e5a867c950b27 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -349,7 +349,8 @@ extern void sparse_remove_section(struct mem_section *ms, extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); extern struct zone *zone_for_pfn_range(int online_type, int nid, - unsigned long start_pfn, unsigned long nr_pages); + struct memory_group *group, unsigned long start_pfn, + unsigned long nr_pages); extern int arch_create_linear_mapping(int nid, u64 start, u64 size, struct mhp_params *params); void arch_remove_linear_mapping(u64 start, u64 size); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 933a4eba0ed16..0c1124f0b9544 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -852,12 +852,53 @@ static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn * "present pages" is an upper limit that can get reached at runtime. As * we base our calculations on KERNEL_EARLY, this is not an issue. */ -static struct zone *auto_movable_zone_for_pfn(int nid, unsigned long pfn, +static struct zone *auto_movable_zone_for_pfn(int nid, + struct memory_group *group, + unsigned long pfn, unsigned long nr_pages) { + unsigned long online_pages = 0, max_pages, end_pfn; + struct page *page; + if (!auto_movable_ratio) goto kernel_zone; + if (group && !group->is_dynamic) { + max_pages = group->s.max_pages; + online_pages = group->present_movable_pages; + + /* If anything is !MOVABLE online the rest !MOVABLE. */ + if (group->present_kernel_pages) + goto kernel_zone; + } else if (!group || group->d.unit_pages == nr_pages) { + max_pages = nr_pages; + } else { + max_pages = group->d.unit_pages; + /* + * Take a look at all online sections in the current unit. + * We can safely assume that all pages within a section belong + * to the same zone, because dynamic memory groups only deal + * with hotplugged memory. + */ + pfn = ALIGN_DOWN(pfn, group->d.unit_pages); + end_pfn = pfn + group->d.unit_pages; + for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + page = pfn_to_online_page(pfn); + if (!page) + continue; + /* If anything is !MOVABLE online the rest !MOVABLE. */ + if (page_zonenum(page) != ZONE_MOVABLE) + goto kernel_zone; + online_pages += PAGES_PER_SECTION; + } + } + + /* + * Online MOVABLE if we could *currently* online all remaining parts + * MOVABLE. We expect to (add+) online them immediately next, so if + * nobody interferes, all will be MOVABLE if possible. + */ + nr_pages = max_pages - online_pages; if (!auto_movable_can_online_movable(NUMA_NO_NODE, nr_pages)) goto kernel_zone; @@ -897,7 +938,8 @@ static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn } struct zone *zone_for_pfn_range(int online_type, int nid, - unsigned long start_pfn, unsigned long nr_pages) + struct memory_group *group, unsigned long start_pfn, + unsigned long nr_pages) { if (online_type == MMOP_ONLINE_KERNEL) return default_kernel_zone_for_pfn(nid, start_pfn, nr_pages); @@ -906,7 +948,7 @@ struct zone *zone_for_pfn_range(int online_type, int nid, return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE]; if (online_policy == ONLINE_POLICY_AUTO_MOVABLE) - return auto_movable_zone_for_pfn(nid, start_pfn, nr_pages); + return auto_movable_zone_for_pfn(nid, group, start_pfn, nr_pages); return default_zone_for_pfn(nid, start_pfn, nr_pages); } From 42b9b512368ae8a25953e8caf2a6c70b6750a0f6 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 24 Aug 2021 09:59:43 +1000 Subject: [PATCH 256/365] mm/memory_hotplug: improved dynamic memory group aware "auto-movable" online policy Currently, the "auto-movable" online policy does not allow for hotplugged KERNEL (ZONE_NORMAL) memory to increase the amount of MOVABLE memory we can have, primarily, because there is no coordiantion across memory devices and we don't want to create zone-imbalances accidentially when unplugging memory. However, within a single memory device it's different. Let's allow for KERNEL memory within a dynamic memory group to allow for more MOVABLE within the same memory group. The only thing we have to take care of is that the managing driver avoids zone imbalances by unplugging MOVABLE memory first, otherwise there can be corner cases where unplug of memory could result in (accidential) zone imbalances. virtio-mem is the only user of dynamic memory groups and recently added support for prioritizing unplug of ZONE_MOVABLE over ZONE_NORMAL, so we don't need a new toggle to enable it for dynamic memory groups. We limit this handling to dynamic memory groups, because: * We want to keep the runtime overhead for collecting stats when onlining a single memory block small. We tend to have only a handful of dynamic memory groups, but we can have quite some static memory groups (e.g., 256 DIMMs). * It doesn't make too much sense for static memory groups, as we try onlining all applicable memory blocks either completely to ZONE_MOVABLE or not. In ordinary operation, we won't have a mixture of zones within a static memory group. When adding memory to a dynamic memory group, we'll first online memory to ZONE_MOVABLE as long as early KERNEL memory allows for it. Then, we'll online the next unit(s) to ZONE_NORMAL, until we can online the next unit(s) to ZONE_MOVABLE. For a simple virtio-mem device with a MOVABLE:KERNEL ratio of 3:1, it will result in a layout like: [M][M][M][M][M][M][M][M][N][M][M][M][N][M][M][M]... ^ movable memory due to early kernel memory ^ allows for more movable memory ... ^-----^ ... here ^ allows for more movable memory ... ^-----^ ... here While the created layout is sub-optimal when it comes to contiguous zones, it gives us the maximum flexibility when dynamically growing/shrinking a device; we can grow small VMs really big in small steps, and still shrink reliably to e.g., 1/4 of the maximum VM size in this example, removing full memory blocks along with meta data more reliably. Mark dynamic memory groups in the xarray such that we can efficiently iterate over them when collecting stats. In usual setups, we have one virtio-mem device per NUMA node, and usually only a small number of NUMA nodes. Note: for now, there seems to be no compelling reason to make this behavior configurable. Link: https://lkml.kernel.org/r/20210806124715.17090-10-david@redhat.com Signed-off-by: David Hildenbrand Cc: Anshuman Khandual Cc: Dan Williams Cc: Dave Hansen Cc: Greg Kroah-Hartman Cc: Hui Zhu Cc: Jason Wang Cc: Len Brown Cc: Marek Kedzierski Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Pankaj Gupta Cc: Pavel Tatashin Cc: Rafael J. Wysocki Cc: "Rafael J. Wysocki" Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- drivers/base/memory.c | 30 +++++++++++++++++++++ include/linux/memory.h | 3 +++ mm/memory_hotplug.c | 60 +++++++++++++++++++++++++++++++++++++++--- 3 files changed, 89 insertions(+), 4 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 50dcd8ecc59b1..365cd4a7f2397 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -86,6 +86,7 @@ static DEFINE_XARRAY(memory_blocks); * Memory groups, indexed by memory group id (mgid). */ static DEFINE_XARRAY_FLAGS(memory_groups, XA_FLAGS_ALLOC); +#define MEMORY_GROUP_MARK_DYNAMIC XA_MARK_1 static BLOCKING_NOTIFIER_HEAD(memory_chain); @@ -939,6 +940,8 @@ static int memory_group_register(struct memory_group group) if (ret) { kfree(new_group); return ret; + } else if (group.is_dynamic) { + xa_set_mark(&memory_groups, mgid, MEMORY_GROUP_MARK_DYNAMIC); } return mgid; } @@ -1044,3 +1047,30 @@ struct memory_group *memory_group_find_by_id(int mgid) { return xa_load(&memory_groups, mgid); } + +/* + * This is an internal helper only to be used in core memory hotplug code to + * walk all dynamic memory groups excluding a given memory group, either + * belonging to a specific node, or belonging to any node. + */ +int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func, + struct memory_group *excluded, void *arg) +{ + struct memory_group *group; + unsigned long index; + int ret = 0; + + xa_for_each_marked(&memory_groups, index, group, + MEMORY_GROUP_MARK_DYNAMIC) { + if (group == excluded) + continue; +#ifdef CONFIG_NUMA + if (nid != NUMA_NO_NODE && group->nid != nid) + continue; +#endif /* CONFIG_NUMA */ + ret = func(group, arg); + if (ret) + break; + } + return ret; +} diff --git a/include/linux/memory.h b/include/linux/memory.h index 1e9c7d01d4f2d..7efc0a7c14c9d 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -146,6 +146,9 @@ extern int memory_group_register_static(int nid, unsigned long max_pages); extern int memory_group_register_dynamic(int nid, unsigned long unit_pages); extern int memory_group_unregister(int mgid); struct memory_group *memory_group_find_by_id(int mgid); +typedef int (*walk_memory_groups_func_t)(struct memory_group *, void *); +int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func, + struct memory_group *excluded, void *arg); #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ #ifdef CONFIG_MEMORY_HOTPLUG diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 0c1124f0b9544..4f3cdc86248bd 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -752,11 +752,44 @@ static void auto_movable_stats_account_zone(struct auto_movable_stats *stats, #endif /* CONFIG_CMA */ } } +struct auto_movable_group_stats { + unsigned long movable_pages; + unsigned long req_kernel_early_pages; +}; -static bool auto_movable_can_online_movable(int nid, unsigned long nr_pages) +static int auto_movable_stats_account_group(struct memory_group *group, + void *arg) +{ + const int ratio = READ_ONCE(auto_movable_ratio); + struct auto_movable_group_stats *stats = arg; + long pages; + + /* + * We don't support modifying the config while the auto-movable online + * policy is already enabled. Just avoid the division by zero below. + */ + if (!ratio) + return 0; + + /* + * Calculate how many early kernel pages this group requires to + * satisfy the configured zone ratio. + */ + pages = group->present_movable_pages * 100 / ratio; + pages -= group->present_kernel_pages; + + if (pages > 0) + stats->req_kernel_early_pages += pages; + stats->movable_pages += group->present_movable_pages; + return 0; +} + +static bool auto_movable_can_online_movable(int nid, struct memory_group *group, + unsigned long nr_pages) { - struct auto_movable_stats stats = {}; unsigned long kernel_early_pages, movable_pages; + struct auto_movable_group_stats group_stats = {}; + struct auto_movable_stats stats = {}; pg_data_t *pgdat = NODE_DATA(nid); struct zone *zone; int i; @@ -777,6 +810,21 @@ static bool auto_movable_can_online_movable(int nid, unsigned long nr_pages) kernel_early_pages = stats.kernel_early_pages; movable_pages = stats.movable_pages; + /* + * Kernel memory inside dynamic memory group allows for more MOVABLE + * memory within the same group. Remove the effect of all but the + * current group from the stats. + */ + walk_dynamic_memory_groups(nid, auto_movable_stats_account_group, + group, &group_stats); + if (kernel_early_pages <= group_stats.req_kernel_early_pages) + return false; + kernel_early_pages -= group_stats.req_kernel_early_pages; + movable_pages -= group_stats.movable_pages; + + if (group && group->is_dynamic) + kernel_early_pages += group->present_kernel_pages; + /* * Test if we could online the given number of pages to ZONE_MOVABLE * and still stay in the configured ratio. @@ -834,6 +882,10 @@ static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn * with unmovable allocations). While there are corner cases where it might * still work, it is barely relevant in practice. * + * Exceptions are dynamic memory groups, which allow for more MOVABLE + * memory within the same memory group -- because in that case, there is + * coordination within the single memory device managed by a single driver. + * * We rely on "present pages" instead of "managed pages", as the latter is * highly unreliable and dynamic in virtualized environments, and does not * consider boot time allocations. For example, memory ballooning adjusts the @@ -899,12 +951,12 @@ static struct zone *auto_movable_zone_for_pfn(int nid, * nobody interferes, all will be MOVABLE if possible. */ nr_pages = max_pages - online_pages; - if (!auto_movable_can_online_movable(NUMA_NO_NODE, nr_pages)) + if (!auto_movable_can_online_movable(NUMA_NO_NODE, group, nr_pages)) goto kernel_zone; #ifdef CONFIG_NUMA if (auto_movable_numa_aware && - !auto_movable_can_online_movable(nid, nr_pages)) + !auto_movable_can_online_movable(nid, group, nr_pages)) goto kernel_zone; #endif /* CONFIG_NUMA */ From dc7657c0f637ee3160a4e57fa162378341ede9c9 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 24 Aug 2021 09:59:43 +1000 Subject: [PATCH 257/365] mm/memory_hotplug: use helper zone_is_zone_device() to simplify the code Patch series "Cleanup and fixups for memory hotplug". This series contains cleanup to use helper function to simplify the code. Also we fix some potential bugs. More details can be found in the respective changelogs. This patch (of 3): Use helper zone_is_zone_device() to simplify the code and remove some explicit CONFIG_ZONE_DEVICE codes. Link: https://lkml.kernel.org/r/20210821094246.10149-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20210821094246.10149-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Naoya Horiguchi Cc: Michal Hocko Cc: Minchan Kim Cc: Chris Goldsworthy Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memory_hotplug.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 4f3cdc86248bd..9fd0be32a281e 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -477,15 +477,13 @@ void __ref remove_pfn_range_from_zone(struct zone *zone, sizeof(struct page) * cur_nr_pages); } -#ifdef CONFIG_ZONE_DEVICE /* * Zone shrinking code cannot properly deal with ZONE_DEVICE. So * we will not try to shrink the zones - which is okay as * set_zone_contiguous() cannot deal with ZONE_DEVICE either way. */ - if (zone_idx(zone) == ZONE_DEVICE) + if (zone_is_zone_device(zone)) return; -#endif clear_zone_contiguous(zone); From 4d6b69278e0457f16ac7de549b28409ccf6ed048 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 24 Aug 2021 09:59:43 +1000 Subject: [PATCH 258/365] mm/memory_hotplug: make HWPoisoned dirty swapcache pages unmovable HWPoisoned dirty swapcache pages are kept for killing owner processes. We should not offline these pages or do_swap_page() would access the offline pages and lead to bad ending. Link: https://lkml.kernel.org/r/20210821094246.10149-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Chris Goldsworthy Cc: Michal Hocko Cc: Minchan Kim Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memory_hotplug.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9fd0be32a281e..0488eed3327c0 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1664,6 +1664,12 @@ static int scan_movable_pages(unsigned long start, unsigned long end, */ if (PageOffline(page) && page_count(page)) return -EBUSY; + /* + * HWPoisoned dirty swapcache pages are definitely unmovable + * because they are kept for killing owner processes. + */ + if (PageHWPoison(page) && PageSwapCache(page)) + return -EBUSY; if (!PageHuge(page)) continue; From aa6da1c42ef3d108f5aa80237f032cd11a27ea5b Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 24 Aug 2021 09:59:43 +1000 Subject: [PATCH 259/365] mm: remove redundant compound_head() calling There is a READ_ONCE() in the macro of compound_head(), which will prevent compiler from optimizing the code when there are more than once calling of it in a function. Remove the redundant calling of compound_head() from page_to_index() and page_add_file_rmap() for better code generation. Link: https://lkml.kernel.org/r/20210811101431.83940-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: David Howells Cc: Matthew Wilcox (Oracle) Cc: William Kucharski Cc: Kirill A. Shutemov Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/pagemap.h | 7 +++---- mm/rmap.c | 6 ++++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 45f5ab7496cc6..5c74a45ff97a0 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -571,18 +571,17 @@ static inline struct page *read_mapping_page(struct address_space *mapping, */ static inline pgoff_t page_to_index(struct page *page) { - pgoff_t pgoff; + struct page *head; if (likely(!PageTransTail(page))) return page->index; + head = compound_head(page); /* * We don't initialize ->index for tail pages: calculate based on * head page */ - pgoff = compound_head(page)->index; - pgoff += page - compound_head(page); - return pgoff; + return head->index + page - head; } extern pgoff_t hugetlb_basepage_index(struct page *page); diff --git a/mm/rmap.c b/mm/rmap.c index b9eb5c12f3fe1..b2cebf35ffe7a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1230,11 +1230,13 @@ void page_add_file_rmap(struct page *page, bool compound) nr_pages); } else { if (PageTransCompound(page) && page_mapping(page)) { + struct page *head = compound_head(page); + VM_WARN_ON_ONCE(!PageLocked(page)); - SetPageDoubleMap(compound_head(page)); + SetPageDoubleMap(head); if (PageMlocked(page)) - clear_page_mlock(compound_head(page)); + clear_page_mlock(head); } if (!atomic_inc_and_test(&page->_mapcount)) goto out; From 8e0f4444f7d4e59575000210268340610c37dac1 Mon Sep 17 00:00:00 2001 From: Xiyu Yang Date: Tue, 24 Aug 2021 09:59:44 +1000 Subject: [PATCH 260/365] mm/rmap: convert from atomic_t to refcount_t on anon_vma->refcount refcount_t type and corresponding API can protect refcounters from accidental underflow and overflow and further use-after-free situations. Link: https://lkml.kernel.org/r/1626665029-49104-1-git-send-email-xiyuyang19@fudan.edu.cn Signed-off-by: Xiyu Yang Signed-off-by: Xin Tan Cc: Alistair Popple Cc: Yang Shi Cc: Shakeel Butt Cc: Hugh Dickins Cc: Xiyu Yang Cc: Miaohe Lin Cc: Cc: Xin Tan Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/rmap.h | 8 +++++--- mm/rmap.c | 14 +++++++------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index c976cc6de2574..38151efe1a593 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -12,6 +12,8 @@ #include #include +#include + /* * The anon_vma heads a list of private "related" vmas, to scan if * an anonymous page pointing to this anon_vma needs to be unmapped: @@ -36,7 +38,7 @@ struct anon_vma { * the reference is responsible for clearing up the * anon_vma if they are the last user on release */ - atomic_t refcount; + refcount_t refcount; /* * Count of child anon_vmas and VMAs which points to this anon_vma. @@ -100,14 +102,14 @@ enum ttu_flags { #ifdef CONFIG_MMU static inline void get_anon_vma(struct anon_vma *anon_vma) { - atomic_inc(&anon_vma->refcount); + refcount_inc(&anon_vma->refcount); } void __put_anon_vma(struct anon_vma *anon_vma); static inline void put_anon_vma(struct anon_vma *anon_vma) { - if (atomic_dec_and_test(&anon_vma->refcount)) + if (refcount_dec_and_test(&anon_vma->refcount)) __put_anon_vma(anon_vma); } diff --git a/mm/rmap.c b/mm/rmap.c index b2cebf35ffe7a..164f75371c1db 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -88,7 +88,7 @@ static inline struct anon_vma *anon_vma_alloc(void) anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); if (anon_vma) { - atomic_set(&anon_vma->refcount, 1); + refcount_set(&anon_vma->refcount, 1); anon_vma->degree = 1; /* Reference for first vma */ anon_vma->parent = anon_vma; /* @@ -103,7 +103,7 @@ static inline struct anon_vma *anon_vma_alloc(void) static inline void anon_vma_free(struct anon_vma *anon_vma) { - VM_BUG_ON(atomic_read(&anon_vma->refcount)); + VM_BUG_ON(refcount_read(&anon_vma->refcount)); /* * Synchronize against page_lock_anon_vma_read() such that @@ -445,7 +445,7 @@ static void anon_vma_ctor(void *data) struct anon_vma *anon_vma = data; init_rwsem(&anon_vma->rwsem); - atomic_set(&anon_vma->refcount, 0); + refcount_set(&anon_vma->refcount, 0); anon_vma->rb_root = RB_ROOT_CACHED; } @@ -495,7 +495,7 @@ struct anon_vma *page_get_anon_vma(struct page *page) goto out; anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); - if (!atomic_inc_not_zero(&anon_vma->refcount)) { + if (!refcount_inc_not_zero(&anon_vma->refcount)) { anon_vma = NULL; goto out; } @@ -554,7 +554,7 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page) } /* trylock failed, we got to sleep */ - if (!atomic_inc_not_zero(&anon_vma->refcount)) { + if (!refcount_inc_not_zero(&anon_vma->refcount)) { anon_vma = NULL; goto out; } @@ -569,7 +569,7 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page) rcu_read_unlock(); anon_vma_lock_read(anon_vma); - if (atomic_dec_and_test(&anon_vma->refcount)) { + if (refcount_dec_and_test(&anon_vma->refcount)) { /* * Oops, we held the last refcount, release the lock * and bail -- can't simply use put_anon_vma() because @@ -2223,7 +2223,7 @@ void __put_anon_vma(struct anon_vma *anon_vma) struct anon_vma *root = anon_vma->root; anon_vma_free(anon_vma); - if (root != anon_vma && atomic_dec_and_test(&root->refcount)) + if (root != anon_vma && refcount_dec_and_test(&root->refcount)) anon_vma_free(root); } From a05cc9abd9c28cc037a37ecff80e96caac3ec088 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 24 Aug 2021 09:59:44 +1000 Subject: [PATCH 261/365] mm/zsmalloc.c: close race window between zs_pool_dec_isolated() and zs_unregister_migration() There is one possible race window between zs_pool_dec_isolated() and zs_unregister_migration() because wait_for_isolated_drain() checks the isolated count without holding class->lock and there is no order inside zs_pool_dec_isolated(). Thus the below race window could be possible: zs_pool_dec_isolated zs_unregister_migration check pool->destroying != 0 pool->destroying = true; smp_mb(); wait_for_isolated_drain() wait for pool->isolated_pages == 0 atomic_long_dec(&pool->isolated_pages); atomic_long_read(&pool->isolated_pages) == 0 Since we observe the pool->destroying (false) before atomic_long_dec() for pool->isolated_pages, waking pool->migration_wait up is missed. Fix this by ensure checking pool->destroying happens after the atomic_long_dec(&pool->isolated_pages). Link: https://lkml.kernel.org/r/20210708115027.7557-1-linmiaohe@huawei.com Fixes: 701d678599d0 ("mm/zsmalloc.c: fix race condition in zs_destroy_pool") Signed-off-by: Miaohe Lin Cc: Minchan Kim Cc: Sergey Senozhatsky Cc: Henry Burns Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/zsmalloc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 68e8831068f4b..b897ce3b399a1 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1830,10 +1830,11 @@ static inline void zs_pool_dec_isolated(struct zs_pool *pool) VM_BUG_ON(atomic_long_read(&pool->isolated_pages) <= 0); atomic_long_dec(&pool->isolated_pages); /* - * There's no possibility of racing, since wait_for_isolated_drain() - * checks the isolated count under &class->lock after enqueuing - * on migration_wait. + * Checking pool->destroying must happen after atomic_long_dec() + * for pool->isolated_pages above. Paired with the smp_mb() in + * zs_unregister_migration(). */ + smp_mb__after_atomic(); if (atomic_long_read(&pool->isolated_pages) == 0 && pool->destroying) wake_up_all(&pool->migration_wait); } From babe97525a5896c9a833b46b8cef8695da2fa750 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 24 Aug 2021 09:59:44 +1000 Subject: [PATCH 262/365] mm/zsmalloc.c: combine two atomic ops in zs_pool_dec_isolated() atomic_long_dec_and_test() is equivalent to atomic_long_dec() and atomic_long_read() == 0. Use it to make code more succinct. Link: https://lkml.kernel.org/r/20210624123930.1769093-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Minchan Kim Cc: Nitin Gupta Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/zsmalloc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index b897ce3b399a1..3e713ff6261ee 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1828,14 +1828,13 @@ static void putback_zspage_deferred(struct zs_pool *pool, static inline void zs_pool_dec_isolated(struct zs_pool *pool) { VM_BUG_ON(atomic_long_read(&pool->isolated_pages) <= 0); - atomic_long_dec(&pool->isolated_pages); /* * Checking pool->destroying must happen after atomic_long_dec() * for pool->isolated_pages above. Paired with the smp_mb() in * zs_unregister_migration(). */ smp_mb__after_atomic(); - if (atomic_long_read(&pool->isolated_pages) == 0 && pool->destroying) + if (atomic_long_dec_and_test(&pool->isolated_pages) && pool->destroying) wake_up_all(&pool->migration_wait); } From 1b3b8cc86ea7d18a8eceac9dc3d8ec0f756a5f4c Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 24 Aug 2021 09:59:44 +1000 Subject: [PATCH 263/365] highmem: don't disable preemption on RT in kmap_atomic() kmap_atomic() disables preemption and pagefaults for historical reasons. The conversion to kmap_local(), which only disables migration, cannot be done wholesale because quite some call sites need to be updated to accommodate with the changed semantics. On PREEMPT_RT enabled kernels the kmap_atomic() semantics are problematic due to the implicit disabling of preemption which makes it impossible to acquire 'sleeping' spinlocks within the kmap atomic sections. PREEMPT_RT replaces the preempt_disable() with a migrate_disable() for more than a decade. It could be argued that this is a justification to do this unconditionally, but PREEMPT_RT covers only a limited number of architectures and it disables some functionality which limits the coverage further. Limit the replacement to PREEMPT_RT for now. Link: https://lkml.kernel.org/r/20210810091116.pocdmaatdcogvdso@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Acked-by: Vlastimil Babka Cc: Thomas Gleixner Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/highmem-internal.h | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h index 7902c7d8b55f9..4aa1031d3e4c3 100644 --- a/include/linux/highmem-internal.h +++ b/include/linux/highmem-internal.h @@ -90,7 +90,11 @@ static inline void __kunmap_local(void *vaddr) static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) { - preempt_disable(); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + migrate_disable(); + else + preempt_disable(); + pagefault_disable(); return __kmap_local_page_prot(page, prot); } @@ -102,7 +106,11 @@ static inline void *kmap_atomic(struct page *page) static inline void *kmap_atomic_pfn(unsigned long pfn) { - preempt_disable(); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + migrate_disable(); + else + preempt_disable(); + pagefault_disable(); return __kmap_local_pfn_prot(pfn, kmap_prot); } @@ -111,7 +119,10 @@ static inline void __kunmap_atomic(void *addr) { kunmap_local_indexed(addr); pagefault_enable(); - preempt_enable(); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + migrate_enable(); + else + preempt_enable(); } unsigned int __nr_free_highpages(void); @@ -179,7 +190,10 @@ static inline void __kunmap_local(void *addr) static inline void *kmap_atomic(struct page *page) { - preempt_disable(); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + migrate_disable(); + else + preempt_disable(); pagefault_disable(); return page_address(page); } @@ -200,7 +214,10 @@ static inline void __kunmap_atomic(void *addr) kunmap_flush_on_unmap(addr); #endif pagefault_enable(); - preempt_enable(); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + migrate_enable(); + else + preempt_enable(); } static inline unsigned int nr_free_highpages(void) { return 0; } From 773f49edcb8f0d4a4cda9ad71c9c41094dcbae21 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Tue, 24 Aug 2021 09:59:44 +1000 Subject: [PATCH 264/365] mm/highmem: Remove deprecated kmap_atomic kmap_atomic() is being deprecated in favor of kmap_local_page(). Replace the uses of kmap_atomic() within the highmem code. On profiling clear_huge_page() using ftrace an improvement of 62% was observed on the below setup. Setup:- Below data has been collected on Qualcomm's SM7250 SoC THP enabled (kernel v4.19.113) with only CPU-0(Cortex-A55) and CPU-7(Cortex-A76) switched on and set to max frequency, also DDR set to perf governor. FTRACE Data:- Base data:- Number of iterations: 48 Mean of allocation time: 349.5 us std deviation: 74.5 us v4 data:- Number of iterations: 48 Mean of allocation time: 131 us std deviation: 32.7 us The following simple userspace experiment to allocate 100MB(BUF_SZ) of pages and writing to it gave us a good insight, we observed an improvement of 42% in allocation and writing timings. ------------------------------------------------------------- Test code snippet ------------------------------------------------------------- clock_start(); buf = malloc(BUF_SZ); /* Allocate 100 MB of memory */ for(i=0; i < BUF_SZ_PAGES; i++) { *((int *)(buf + (i*PAGE_SIZE))) = 1; } clock_end(); ------------------------------------------------------------- Malloc test timings for 100MB anon allocation:- Base data:- Number of iterations: 100 Mean of allocation time: 31831 us std deviation: 4286 us v4 data:- Number of iterations: 100 Mean of allocation time: 18193 us std deviation: 4915 us Link: https://lkml.kernel.org/r/20210204073255.20769-2-prathu.baronia@oneplus.com Signed-off-by: Ira Weiny Signed-off-by: Prathu Baronia Cc: Ira Weiny Cc: Thomas Gleixner Cc: Matthew Wilcox Cc: Peter Zijlstra Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/highmem.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index b4c49f9cc379e..31ebee36f26ce 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -143,9 +143,9 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size) #ifndef clear_user_highpage static inline void clear_user_highpage(struct page *page, unsigned long vaddr) { - void *addr = kmap_atomic(page); + void *addr = kmap_local_page(page); clear_user_page(addr, vaddr, page); - kunmap_atomic(addr); + kunmap_local(addr); } #endif @@ -177,9 +177,9 @@ alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma, static inline void clear_highpage(struct page *page) { - void *kaddr = kmap_atomic(page); + void *kaddr = kmap_local_page(page); clear_page(kaddr); - kunmap_atomic(kaddr); + kunmap_local(kaddr); } #ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGE @@ -202,7 +202,7 @@ static inline void zero_user_segments(struct page *page, unsigned start1, unsigned end1, unsigned start2, unsigned end2) { - void *kaddr = kmap_atomic(page); + void *kaddr = kmap_local_page(page); unsigned int i; BUG_ON(end1 > page_size(page) || end2 > page_size(page)); @@ -213,7 +213,7 @@ static inline void zero_user_segments(struct page *page, if (end2 > start2) memset(kaddr + start2, 0, end2 - start2); - kunmap_atomic(kaddr); + kunmap_local(kaddr); for (i = 0; i < compound_nr(page); i++) flush_dcache_page(page + i); } @@ -238,11 +238,11 @@ static inline void copy_user_highpage(struct page *to, struct page *from, { char *vfrom, *vto; - vfrom = kmap_atomic(from); - vto = kmap_atomic(to); + vfrom = kmap_local_page(from); + vto = kmap_local_page(to); copy_user_page(vto, vfrom, vaddr, to); - kunmap_atomic(vto); - kunmap_atomic(vfrom); + kunmap_local(vto); + kunmap_local(vfrom); } #endif @@ -253,11 +253,11 @@ static inline void copy_highpage(struct page *to, struct page *from) { char *vfrom, *vto; - vfrom = kmap_atomic(from); - vto = kmap_atomic(to); + vfrom = kmap_local_page(from); + vto = kmap_local_page(to); copy_page(vto, vfrom); - kunmap_atomic(vto); - kunmap_atomic(vfrom); + kunmap_local(vto); + kunmap_local(vfrom); } #endif From c4c76efb546903d1db78c9114a41ecfa1bcc2f4d Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Tue, 24 Aug 2021 09:59:45 +1000 Subject: [PATCH 265/365] mm: in_irq() cleanup Replace the obsolete and ambiguos macro in_irq() with new macro in_hardirq(). Link: https://lkml.kernel.org/r/20210813145245.86070-1-changbin.du@gmail.com Signed-off-by: Changbin Du Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/highmem.c | 2 +- mm/kmemleak.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/highmem.c b/mm/highmem.c index 4fb51d735aa6a..4212ad0e4a195 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -436,7 +436,7 @@ EXPORT_SYMBOL(zero_user_segments); static inline int kmap_local_idx_push(void) { - WARN_ON_ONCE(in_irq() && !irqs_disabled()); + WARN_ON_ONCE(in_hardirq() && !irqs_disabled()); current->kmap_ctrl.idx += KM_INCR; BUG_ON(current->kmap_ctrl.idx >= KM_MAX_IDX); return current->kmap_ctrl.idx - 1; diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 73d46d16d5755..b59f1761d817d 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -598,7 +598,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, object->checksum = 0; /* task information */ - if (in_irq()) { + if (in_hardirq()) { object->pid = 0; strncpy(object->comm, "hardirq", sizeof(object->comm)); } else if (in_serving_softirq()) { From c32716ab0ebf2762331977fc1f922ca48ee8136d Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 24 Aug 2021 09:59:45 +1000 Subject: [PATCH 266/365] mm: introduce PAGEFLAGS_MASK to replace ((1UL << NR_PAGEFLAGS) - 1) Instead of hard-coding ((1UL << NR_PAGEFLAGS) - 1) everywhere, introducing PAGEFLAGS_MASK to make the code clear to get the page flags. Link: https://lkml.kernel.org/r/20210819150712.59948-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Roman Gushchin Acked-by: Johannes Weiner Reviewed-by: Shakeel Butt Cc: Michal Hocko Cc: Vladimir Davydov Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/page-flags.h | 4 +++- include/trace/events/page_ref.h | 4 ++-- lib/test_printf.c | 2 +- lib/vsprintf.c | 2 +- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 7fda4fb85bdca..62dc61db9165a 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -178,6 +178,8 @@ enum pageflags { PG_reported = PG_uptodate, }; +#define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1) + #ifndef __GENERATING_BOUNDS_H static inline unsigned long _compound_head(const struct page *page) @@ -868,7 +870,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page) * alloc-free cycle to prevent from reusing the page. */ #define PAGE_FLAGS_CHECK_AT_PREP \ - (((1UL << NR_PAGEFLAGS) - 1) & ~__PG_HWPOISON) + (PAGEFLAGS_MASK & ~__PG_HWPOISON) #define PAGE_FLAGS_PRIVATE \ (1UL << PG_private | 1UL << PG_private_2) diff --git a/include/trace/events/page_ref.h b/include/trace/events/page_ref.h index 5d2ea93956cec..8a99c1cd417b8 100644 --- a/include/trace/events/page_ref.h +++ b/include/trace/events/page_ref.h @@ -38,7 +38,7 @@ DECLARE_EVENT_CLASS(page_ref_mod_template, TP_printk("pfn=0x%lx flags=%s count=%d mapcount=%d mapping=%p mt=%d val=%d", __entry->pfn, - show_page_flags(__entry->flags & ((1UL << NR_PAGEFLAGS) - 1)), + show_page_flags(__entry->flags & PAGEFLAGS_MASK), __entry->count, __entry->mapcount, __entry->mapping, __entry->mt, __entry->val) @@ -88,7 +88,7 @@ DECLARE_EVENT_CLASS(page_ref_mod_and_test_template, TP_printk("pfn=0x%lx flags=%s count=%d mapcount=%d mapping=%p mt=%d val=%d ret=%d", __entry->pfn, - show_page_flags(__entry->flags & ((1UL << NR_PAGEFLAGS) - 1)), + show_page_flags(__entry->flags & PAGEFLAGS_MASK), __entry->count, __entry->mapcount, __entry->mapping, __entry->mt, __entry->val, __entry->ret) diff --git a/lib/test_printf.c b/lib/test_printf.c index 8ac71aee46af0..ec69953cf80c3 100644 --- a/lib/test_printf.c +++ b/lib/test_printf.c @@ -614,7 +614,7 @@ page_flags_test(int section, int node, int zone, int last_cpupid, bool append = false; int i; - flags &= BIT(NR_PAGEFLAGS) - 1; + flags &= PAGEFLAGS_MASK; if (flags) { page_flags |= flags; snprintf(cmp_buf + size, BUF_SIZE - size, "%s", name); diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 26c83943748a4..cc7bdd3ac2ee8 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -2019,7 +2019,7 @@ static const struct page_flags_fields pff[] = { static char *format_page_flags(char *buf, char *end, unsigned long flags) { - unsigned long main_flags = flags & (BIT(NR_PAGEFLAGS) - 1); + unsigned long main_flags = flags & PAGEFLAGS_MASK; bool append = false; int i; From 59f23b42c3ae176ba298be52cccdf1e69fc91b90 Mon Sep 17 00:00:00 2001 From: Jordy Zomer Date: Tue, 24 Aug 2021 09:59:45 +1000 Subject: [PATCH 267/365] mm/secretmem: use refcount_t instead of atomic_t When a secret memory region is active, memfd_secret disables hibernation. One of the goals is to keep the secret data from being written to persistent-storage. It accomplishes this by maintaining a reference count to `secretmem_users`. Once this reference is held your system can not be hibernated due to the check in `hibernation_available()`. However, because `secretmem_users` is of type `atomic_t`, reference counter overflows are possible. As you can see there's an `atomic_inc` for each `memfd` that is opened in the `memfd_secret` syscall. If a local attacker succeeds to open 2^32 memfd's, the counter will wrap around to 0. This implies that you may hibernate again, even though there are still regions of this secret memory, thereby bypassing the security check. In an attempt to fix this I have used `refcount_t` instead of `atomic_t` which prevents reference counter overflows. Link: https://lkml.kernel.org/r/20210820043339.2151352-1-jordy@pwning.systems Signed-off-by: Jordy Zomer Cc: Kees Cook , Cc: Jordy Zomer Cc: James Bottomley Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/secretmem.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mm/secretmem.c b/mm/secretmem.c index 030f02ddc7c1d..1fea68b8d5a6f 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -18,6 +18,7 @@ #include #include #include +#include #include @@ -40,11 +41,11 @@ module_param_named(enable, secretmem_enable, bool, 0400); MODULE_PARM_DESC(secretmem_enable, "Enable secretmem and memfd_secret(2) system call"); -static atomic_t secretmem_users; +static refcount_t secretmem_users; bool secretmem_active(void) { - return !!atomic_read(&secretmem_users); + return !!refcount_read(&secretmem_users); } static vm_fault_t secretmem_fault(struct vm_fault *vmf) @@ -103,7 +104,7 @@ static const struct vm_operations_struct secretmem_vm_ops = { static int secretmem_release(struct inode *inode, struct file *file) { - atomic_dec(&secretmem_users); + refcount_dec(&secretmem_users); return 0; } @@ -217,7 +218,7 @@ SYSCALL_DEFINE1(memfd_secret, unsigned int, flags) file->f_flags |= O_LARGEFILE; fd_install(fd, file); - atomic_inc(&secretmem_users); + refcount_inc(&secretmem_users); return fd; err_put_fd: From d777ad50c842bedcd02098a83a301b3841723e41 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 24 Aug 2021 09:59:45 +1000 Subject: [PATCH 268/365] kfence: show cpu and timestamp in alloc/free info Record cpu and timestamp on allocations and frees, and show them in reports. Upon an error, this can help correlate earlier messages in the kernel log via allocation and free timestamps. Link: https://lkml.kernel.org/r/20210714175312.2947941-1-elver@google.com Suggested-by: Joern Engel Signed-off-by: Marco Elver Acked-by: Alexander Potapenko Acked-by: Joern Engel Cc: Yuanyuan Zhong Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- Documentation/dev-tools/kfence.rst | 98 ++++++++++++++++-------------- mm/kfence/core.c | 3 + mm/kfence/kfence.h | 2 + mm/kfence/report.c | 19 ++++-- 4 files changed, 71 insertions(+), 51 deletions(-) diff --git a/Documentation/dev-tools/kfence.rst b/Documentation/dev-tools/kfence.rst index fdf04e741ea57..0fbe3308bf37f 100644 --- a/Documentation/dev-tools/kfence.rst +++ b/Documentation/dev-tools/kfence.rst @@ -65,25 +65,27 @@ Error reports A typical out-of-bounds access looks like this:: ================================================================== - BUG: KFENCE: out-of-bounds read in test_out_of_bounds_read+0xa3/0x22b + BUG: KFENCE: out-of-bounds read in test_out_of_bounds_read+0xa6/0x234 - Out-of-bounds read at 0xffffffffb672efff (1B left of kfence-#17): - test_out_of_bounds_read+0xa3/0x22b - kunit_try_run_case+0x51/0x85 + Out-of-bounds read at 0xffff8c3f2e291fff (1B left of kfence-#72): + test_out_of_bounds_read+0xa6/0x234 + kunit_try_run_case+0x61/0xa0 kunit_generic_run_threadfn_adapter+0x16/0x30 - kthread+0x137/0x160 + kthread+0x176/0x1b0 ret_from_fork+0x22/0x30 - kfence-#17 [0xffffffffb672f000-0xffffffffb672f01f, size=32, cache=kmalloc-32] allocated by task 507: - test_alloc+0xf3/0x25b - test_out_of_bounds_read+0x98/0x22b - kunit_try_run_case+0x51/0x85 + kfence-#72: 0xffff8c3f2e292000-0xffff8c3f2e29201f, size=32, cache=kmalloc-32 + + allocated by task 484 on cpu 0 at 32.919330s: + test_alloc+0xfe/0x738 + test_out_of_bounds_read+0x9b/0x234 + kunit_try_run_case+0x61/0xa0 kunit_generic_run_threadfn_adapter+0x16/0x30 - kthread+0x137/0x160 + kthread+0x176/0x1b0 ret_from_fork+0x22/0x30 - CPU: 4 PID: 107 Comm: kunit_try_catch Not tainted 5.8.0-rc6+ #7 - Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1 04/01/2014 + CPU: 0 PID: 484 Comm: kunit_try_catch Not tainted 5.13.0-rc3+ #7 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014 ================================================================== The header of the report provides a short summary of the function involved in @@ -96,30 +98,32 @@ Use-after-free accesses are reported as:: ================================================================== BUG: KFENCE: use-after-free read in test_use_after_free_read+0xb3/0x143 - Use-after-free read at 0xffffffffb673dfe0 (in kfence-#24): + Use-after-free read at 0xffff8c3f2e2a0000 (in kfence-#79): test_use_after_free_read+0xb3/0x143 - kunit_try_run_case+0x51/0x85 + kunit_try_run_case+0x61/0xa0 kunit_generic_run_threadfn_adapter+0x16/0x30 - kthread+0x137/0x160 + kthread+0x176/0x1b0 ret_from_fork+0x22/0x30 - kfence-#24 [0xffffffffb673dfe0-0xffffffffb673dfff, size=32, cache=kmalloc-32] allocated by task 507: - test_alloc+0xf3/0x25b + kfence-#79: 0xffff8c3f2e2a0000-0xffff8c3f2e2a001f, size=32, cache=kmalloc-32 + + allocated by task 488 on cpu 2 at 33.871326s: + test_alloc+0xfe/0x738 test_use_after_free_read+0x76/0x143 - kunit_try_run_case+0x51/0x85 + kunit_try_run_case+0x61/0xa0 kunit_generic_run_threadfn_adapter+0x16/0x30 - kthread+0x137/0x160 + kthread+0x176/0x1b0 ret_from_fork+0x22/0x30 - freed by task 507: + freed by task 488 on cpu 2 at 33.871358s: test_use_after_free_read+0xa8/0x143 - kunit_try_run_case+0x51/0x85 + kunit_try_run_case+0x61/0xa0 kunit_generic_run_threadfn_adapter+0x16/0x30 - kthread+0x137/0x160 + kthread+0x176/0x1b0 ret_from_fork+0x22/0x30 - CPU: 4 PID: 109 Comm: kunit_try_catch Tainted: G W 5.8.0-rc6+ #7 - Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1 04/01/2014 + CPU: 2 PID: 488 Comm: kunit_try_catch Tainted: G B 5.13.0-rc3+ #7 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014 ================================================================== KFENCE also reports on invalid frees, such as double-frees:: @@ -127,30 +131,32 @@ KFENCE also reports on invalid frees, such as double-frees:: ================================================================== BUG: KFENCE: invalid free in test_double_free+0xdc/0x171 - Invalid free of 0xffffffffb6741000: + Invalid free of 0xffff8c3f2e2a4000 (in kfence-#81): test_double_free+0xdc/0x171 - kunit_try_run_case+0x51/0x85 + kunit_try_run_case+0x61/0xa0 kunit_generic_run_threadfn_adapter+0x16/0x30 - kthread+0x137/0x160 + kthread+0x176/0x1b0 ret_from_fork+0x22/0x30 - kfence-#26 [0xffffffffb6741000-0xffffffffb674101f, size=32, cache=kmalloc-32] allocated by task 507: - test_alloc+0xf3/0x25b + kfence-#81: 0xffff8c3f2e2a4000-0xffff8c3f2e2a401f, size=32, cache=kmalloc-32 + + allocated by task 490 on cpu 1 at 34.175321s: + test_alloc+0xfe/0x738 test_double_free+0x76/0x171 - kunit_try_run_case+0x51/0x85 + kunit_try_run_case+0x61/0xa0 kunit_generic_run_threadfn_adapter+0x16/0x30 - kthread+0x137/0x160 + kthread+0x176/0x1b0 ret_from_fork+0x22/0x30 - freed by task 507: + freed by task 490 on cpu 1 at 34.175348s: test_double_free+0xa8/0x171 - kunit_try_run_case+0x51/0x85 + kunit_try_run_case+0x61/0xa0 kunit_generic_run_threadfn_adapter+0x16/0x30 - kthread+0x137/0x160 + kthread+0x176/0x1b0 ret_from_fork+0x22/0x30 - CPU: 4 PID: 111 Comm: kunit_try_catch Tainted: G W 5.8.0-rc6+ #7 - Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1 04/01/2014 + CPU: 1 PID: 490 Comm: kunit_try_catch Tainted: G B 5.13.0-rc3+ #7 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014 ================================================================== KFENCE also uses pattern-based redzones on the other side of an object's guard @@ -160,23 +166,25 @@ These are reported on frees:: ================================================================== BUG: KFENCE: memory corruption in test_kmalloc_aligned_oob_write+0xef/0x184 - Corrupted memory at 0xffffffffb6797ff9 [ 0xac . . . . . . ] (in kfence-#69): + Corrupted memory at 0xffff8c3f2e33aff9 [ 0xac . . . . . . ] (in kfence-#156): test_kmalloc_aligned_oob_write+0xef/0x184 - kunit_try_run_case+0x51/0x85 + kunit_try_run_case+0x61/0xa0 kunit_generic_run_threadfn_adapter+0x16/0x30 - kthread+0x137/0x160 + kthread+0x176/0x1b0 ret_from_fork+0x22/0x30 - kfence-#69 [0xffffffffb6797fb0-0xffffffffb6797ff8, size=73, cache=kmalloc-96] allocated by task 507: - test_alloc+0xf3/0x25b + kfence-#156: 0xffff8c3f2e33afb0-0xffff8c3f2e33aff8, size=73, cache=kmalloc-96 + + allocated by task 502 on cpu 7 at 42.159302s: + test_alloc+0xfe/0x738 test_kmalloc_aligned_oob_write+0x57/0x184 - kunit_try_run_case+0x51/0x85 + kunit_try_run_case+0x61/0xa0 kunit_generic_run_threadfn_adapter+0x16/0x30 - kthread+0x137/0x160 + kthread+0x176/0x1b0 ret_from_fork+0x22/0x30 - CPU: 4 PID: 120 Comm: kunit_try_catch Tainted: G W 5.8.0-rc6+ #7 - Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1 04/01/2014 + CPU: 7 PID: 502 Comm: kunit_try_catch Tainted: G B 5.13.0-rc3+ #7 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014 ================================================================== For such errors, the address where the corruption occurred as well as the diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 575c685aa6422..7a97db8bc8e75 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -196,6 +197,8 @@ static noinline void metadata_update_state(struct kfence_metadata *meta, */ track->num_stack_entries = stack_trace_save(track->stack_entries, KFENCE_STACK_DEPTH, 1); track->pid = task_pid_nr(current); + track->cpu = raw_smp_processor_id(); + track->ts_nsec = local_clock(); /* Same source as printk timestamps. */ /* * Pairs with READ_ONCE() in diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h index 24065321ff8a7..c1f23c61e5f91 100644 --- a/mm/kfence/kfence.h +++ b/mm/kfence/kfence.h @@ -36,6 +36,8 @@ enum kfence_object_state { /* Alloc/free tracking information. */ struct kfence_track { pid_t pid; + int cpu; + u64 ts_nsec; int num_stack_entries; unsigned long stack_entries[KFENCE_STACK_DEPTH]; }; diff --git a/mm/kfence/report.c b/mm/kfence/report.c index 2a319c21c939a..cbdd8d442d0bc 100644 --- a/mm/kfence/report.c +++ b/mm/kfence/report.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -100,6 +101,13 @@ static void kfence_print_stack(struct seq_file *seq, const struct kfence_metadat bool show_alloc) { const struct kfence_track *track = show_alloc ? &meta->alloc_track : &meta->free_track; + u64 ts_sec = track->ts_nsec; + unsigned long rem_nsec = do_div(ts_sec, NSEC_PER_SEC); + + /* Timestamp matches printk timestamp format. */ + seq_con_printf(seq, "%s by task %d on cpu %d at %lu.%06lus:\n", + show_alloc ? "allocated" : "freed", track->pid, + track->cpu, (unsigned long)ts_sec, rem_nsec / 1000); if (track->num_stack_entries) { /* Skip allocation/free internals stack. */ @@ -126,15 +134,14 @@ void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *met return; } - seq_con_printf(seq, - "kfence-#%td [0x%p-0x%p" - ", size=%d, cache=%s] allocated by task %d:\n", - meta - kfence_metadata, (void *)start, (void *)(start + size - 1), size, - (cache && cache->name) ? cache->name : "", meta->alloc_track.pid); + seq_con_printf(seq, "kfence-#%td: 0x%p-0x%p, size=%d, cache=%s\n\n", + meta - kfence_metadata, (void *)start, (void *)(start + size - 1), + size, (cache && cache->name) ? cache->name : ""); + kfence_print_stack(seq, meta, true); if (meta->state == KFENCE_OBJECT_FREED) { - seq_con_printf(seq, "\nfreed by task %d:\n", meta->free_track.pid); + seq_con_printf(seq, "\n"); kfence_print_stack(seq, meta, false); } } From 7a728f2e534e3f4e59353890ae781adac3990cf9 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 24 Aug 2021 09:59:45 +1000 Subject: [PATCH 269/365] mm: introduce Data Access MONitor (DAMON) Patch series "Introduce Data Access MONitor (DAMON)", v34. Introduction ============ DAMON is a data access monitoring framework for the Linux kernel. The core mechanisms of DAMON called 'region based sampling' and 'adaptive regions adjustment' (refer to 'mechanisms.rst' in the 11th patch of this patchset for the detail) make it - accurate (The monitored information is useful for DRAM level memory management. It might not appropriate for Cache-level accuracy, though.), - light-weight (The monitoring overhead is low enough to be applied online while making no impact on the performance of the target workloads.), and - scalable (the upper-bound of the instrumentation overhead is controllable regardless of the size of target workloads.). Using this framework, therefore, several memory management mechanisms such as reclamation and THP can be optimized to aware real data access patterns. Experimental access pattern aware memory management optimization works that incurring high instrumentation overhead will be able to have another try. Though DAMON is for kernel subsystems, it can be easily exposed to the user space by writing a DAMON-wrapper kernel subsystem. Then, user space users who have some special workloads will be able to write personalized tools or applications for deeper understanding and specialized optimizations of their systems. DAMON is also merged in two public Amazon Linux kernel trees that based on v5.4.y[1] and v5.10.y[2]. [1] https://github.com/amazonlinux/linux/tree/amazon-5.4.y/master/mm/damon [2] https://github.com/amazonlinux/linux/tree/amazon-5.10.y/master/mm/damon The userspace tool[1] is available, released under GPLv2, and actively being maintained. I am also planning to implement another basic user interface in perf[2]. Also, the basic test suite for DAMON is available under GPLv2[3]. [1] https://github.com/awslabs/damo [2] https://lore.kernel.org/linux-mm/20210107120729.22328-1-sjpark@amazon.com/ [3] https://github.com/awslabs/damon-tests Long-term Plan -------------- DAMON is a part of a project called Data Access-aware Operating System (DAOS). As the name implies, I want to improve the performance and efficiency of systems using fine-grained data access patterns. The optimizations are for both kernel and user spaces. I will therefore modify or create kernel subsystems, export some of those to user space and implement user space library / tools. Below shows the layers and components for the project. --------------------------------------------------------------------------- Primitives: PTE Accessed bit, PG_idle, rmap, (Intel CMT), ... Framework: DAMON Features: DAMOS, virtual addr, physical addr, ... Applications: DAMON-debugfs, (DARC), ... ^^^^^^^^^^^^^^^^^^^^^^^ KERNEL SPACE ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Raw Interface: debugfs, (sysfs), (damonfs), tracepoints, (sys_damon), ... vvvvvvvvvvvvvvvvvvvvvvv USER SPACE vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv Library: (libdamon), ... Tools: DAMO, (perf), ... --------------------------------------------------------------------------- The components in parentheses or marked as '...' are not implemented yet but in the future plan. IOW, those are the TODO tasks of DAOS project. For more detail, please refer to the plans: https://lore.kernel.org/linux-mm/20201202082731.24828-1-sjpark@amazon.com/ Evaluations =========== We evaluated DAMON's overhead, monitoring quality and usefulness using 24 realistic workloads on my QEMU/KVM based virtual machine running a kernel that v24 DAMON patchset is applied. DAMON is lightweight. It increases system memory usage by 0.39% and slows target workloads down by 1.16%. DAMON is accurate and useful for memory management optimizations. An experimental DAMON-based operation scheme for THP, namely 'ethp', removes 76.15% of THP memory overheads while preserving 51.25% of THP speedup. Another experimental DAMON-based 'proactive reclamation' implementation, 'prcl', reduces 93.38% of residential sets and 23.63% of system memory footprint while incurring only 1.22% runtime overhead in the best case (parsec3/freqmine). NOTE that the experimental THP optimization and proactive reclamation are not for production but only for proof of concepts. Please refer to the official document[1] or "Documentation/admin-guide/mm: Add a document for DAMON" patch in this patchset for detailed evaluation setup and results. [1] https://damonitor.github.io/doc/html/latest-damon/admin-guide/mm/damon/eval.html Real-world User Story ===================== In summary, DAMON has used on production systems and proved its usefulness. DAMON as a profiler ------------------- We analyzed characteristics of a large scale production systems of our customers using DAMON. The systems utilize 70GB DRAM and 36 CPUs. From this, we were able to find interesting things below. There were obviously different access pattern under idle workload and active workload. Under the idle workload, it accessed large memory regions with low frequency, while the active workload accessed small memory regions with high freuqnecy. DAMON found a 7GB memory region that showing obviously high access frequency under the active workload. We believe this is the performance-effective working set and need to be protected. There was a 4KB memory region that showing highest access frequency under not only active but also idle workloads. We think this must be a hottest code section like thing that should never be paged out. For this analysis, DAMON used only 0.3-1% of single CPU time. Because we used recording-based analysis, it consumed about 3-12 MB of disk space per 20 minutes. This is only small amount of disk space, but we can further reduce the disk usage by using non-recording-based DAMON features. I'd like to argue that only DAMON can do such detailed analysis (finding 4KB highest region in 70GB memory) with the light overhead. DAMON as a system optimization tool ----------------------------------- We also found below potential performance problems on the systems and made DAMON-based solutions. The system doesn't want to make the workload suffer from the page reclamation and thus it utilizes enough DRAM but no swap device. However, we found the system is actively reclaiming file-backed pages, because the system has intensive file IO. The file IO turned out to be not performance critical for the workload, but the customer wanted to ensure performance critical file-backed pages like code section to not mistakenly be evicted. Using direct IO should or `mlock()` would be a straightforward solution, but modifying the user space code is not easy for the customer. Alternatively, we could use DAMON-based operation scheme[1]. By using it, we can ask DAMON to track access frequency of each region and make 'process_madvise(MADV_WILLNEED)[2]' call for regions having specific size and access frequency for a time interval. We also found the system is having high number of TLB misses. We tried 'always' THP enabled policy and it greatly reduced TLB misses, but the page reclamation also been more frequent due to the THP internal fragmentation caused memory bloat. We could try another DAMON-based operation scheme that applies 'MADV_HUGEPAGE' to memory regions having >=2MB size and high access frequency, while applying 'MADV_NOHUGEPAGE' to regions having <2MB size and low access frequency. We do not own the systems so we only reported the analysis results and possible optimization solutions to the customers. The customers satisfied about the analysis results and promised to try the optimization guides. [1] https://lore.kernel.org/linux-mm/20201006123931.5847-1-sjpark@amazon.com/ [2] https://lore.kernel.org/linux-api/20200622192900.22757-4-minchan@kernel.org/ Comparison with Idle Page Tracking ================================== Idle Page Tracking allows users to set and read idleness of pages using a bitmap file which represents each page with each bit of the file. One recommended usage of it is working set size detection. Users can do that by 1. find PFN of each page for workloads in interest, 2. set all the pages as idle by doing writes to the bitmap file, 3. wait until the workload accesses its working set, and 4. read the idleness of the pages again and count pages became not idle. NOTE: While Idle Page Tracking is for user space users, DAMON is primarily designed for kernel subsystems though it can easily exposed to the user space. Hence, this section only assumes such user space use of DAMON. For what use cases Idle Page Tracking would be better? ------------------------------------------------------ 1. Flexible usecases other than hotness monitoring. Because Idle Page Tracking allows users to control the primitive (Page idleness) by themselves, Idle Page Tracking users can do anything they want. Meanwhile, DAMON is primarily designed to monitor the hotness of each memory region. For this, DAMON asks users to provide sampling interval and aggregation interval. For the reason, there could be some use case that using Idle Page Tracking is simpler. 2. Physical memory monitoring. Idle Page Tracking receives PFN range as input, so natively supports physical memory monitoring. DAMON is designed to be extensible for multiple address spaces and use cases by implementing and using primitives for the given use case. Therefore, by theory, DAMON has no limitation in the type of target address space as long as primitives for the given address space exists. However, the default primitives introduced by this patchset supports only virtual address spaces. Therefore, for physical memory monitoring, you should implement your own primitives and use it, or simply use Idle Page Tracking. Nonetheless, RFC patchsets[1] for the physical memory address space primitives is already available. It also supports user memory same to Idle Page Tracking. [1] https://lore.kernel.org/linux-mm/20200831104730.28970-1-sjpark@amazon.com/ For what use cases DAMON is better? ----------------------------------- 1. Hotness Monitoring. Idle Page Tracking let users know only if a page frame is accessed or not. For hotness check, the user should write more code and use more memory. DAMON do that by itself. 2. Low Monitoring Overhead DAMON receives user's monitoring request with one step and then provide the results. So, roughly speaking, DAMON require only O(1) user/kernel context switches. In case of Idle Page Tracking, however, because the interface receives contiguous page frames, the number of user/kernel context switches increases as the monitoring target becomes complex and huge. As a result, the context switch overhead could be not negligible. Moreover, DAMON is born to handle with the monitoring overhead. Because the core mechanism is pure logical, Idle Page Tracking users might be able to implement the mechanism on their own, but it would be time consuming and the user/kernel context switching will still more frequent than that of DAMON. Also, the kernel subsystems cannot use the logic in this case. 3. Page granularity working set size detection. Until v22 of this patchset, this was categorized as the thing Idle Page Tracking could do better, because DAMON basically maintains additional metadata for each of the monitoring target regions. So, in the page granularity working set size detection use case, DAMON would incur (number of monitoring target pages * size of metadata) memory overhead. Size of the single metadata item is about 54 bytes, so assuming 4KB pages, about 1.3% of monitoring target pages will be additionally used. All essential metadata for Idle Page Tracking are embedded in 'struct page' and page table entries. Therefore, in this use case, only one counter variable for working set size accounting is required if Idle Page Tracking is used. There are more details to consider, but roughly speaking, this is true in most cases. However, the situation changed from v23. Now DAMON supports arbitrary types of monitoring targets, which don't use the metadata. Using that, DAMON can do the working set size detection with no additional space overhead but less user-kernel context switch. A first draft for the implementation of monitoring primitives for this usage is available in a DAMON development tree[1]. An RFC patchset for it based on this patchset will also be available soon. Since v24, the arbitrary type support is dropped from this patchset because this patchset doesn't introduce real use of the type. You can still get it from the DAMON development tree[2], though. [1] https://github.com/sjp38/linux/tree/damon/pgidle_hack [2] https://github.com/sjp38/linux/tree/damon/master 4. More future usecases While Idle Page Tracking has tight coupling with base primitives (PG_Idle and page table Accessed bits), DAMON is designed to be extensible for many use cases and address spaces. If you need some special address type or want to use special h/w access check primitives, you can write your own primitives for that and configure DAMON to use those. Therefore, if your use case could be changed a lot in future, using DAMON could be better. Can I use both Idle Page Tracking and DAMON? -------------------------------------------- Yes, though using them concurrently for overlapping memory regions could result in interference to each other. Nevertheless, such use case would be rare or makes no sense at all. Even in the case, the noise would bot be really significant. So, you can choose whatever you want depending on the characteristics of your use cases. More Information ================ We prepared a showcase web site[1] that you can get more information. There are - the official documentations[2], - the heatmap format dynamic access pattern of various realistic workloads for heap area[3], mmap()-ed area[4], and stack[5] area, - the dynamic working set size distribution[6] and chronological working set size changes[7], and - the latest performance test results[8]. [1] https://damonitor.github.io/_index [2] https://damonitor.github.io/doc/html/latest-damon [3] https://damonitor.github.io/test/result/visual/latest/rec.heatmap.0.png.html [4] https://damonitor.github.io/test/result/visual/latest/rec.heatmap.1.png.html [5] https://damonitor.github.io/test/result/visual/latest/rec.heatmap.2.png.html [6] https://damonitor.github.io/test/result/visual/latest/rec.wss_sz.png.html [7] https://damonitor.github.io/test/result/visual/latest/rec.wss_time.png.html [8] https://damonitor.github.io/test/result/perf/latest/html/index.html Baseline and Complete Git Trees =============================== The patches are based on the latest -mm tree, specifically v5.14-rc1-mmots-2021-07-15-18-47 of https://github.com/hnaz/linux-mm. You can also clone the complete git tree: $ git clone git://github.com/sjp38/linux -b damon/patches/v34 The web is also available: https://github.com/sjp38/linux/releases/tag/damon/patches/v34 Development Trees ----------------- There are a couple of trees for entire DAMON patchset series and features for future release. - For latest release: https://github.com/sjp38/linux/tree/damon/master - For next release: https://github.com/sjp38/linux/tree/damon/next Long-term Support Trees ----------------------- For people who want to test DAMON but using LTS kernels, there are another couple of trees based on two latest LTS kernels respectively and containing the 'damon/master' backports. - For v5.4.y: https://github.com/sjp38/linux/tree/damon/for-v5.4.y - For v5.10.y: https://github.com/sjp38/linux/tree/damon/for-v5.10.y Amazon Linux Kernel Trees ------------------------- DAMON is also merged in two public Amazon Linux kernel trees that based on v5.4.y[1] and v5.10.y[2]. [1] https://github.com/amazonlinux/linux/tree/amazon-5.4.y/master/mm/damon [2] https://github.com/amazonlinux/linux/tree/amazon-5.10.y/master/mm/damon Git Tree for Diff of Patches ============================ For easy review of diff between different versions of each patch, I prepared a git tree containing all versions of the DAMON patchset series: https://github.com/sjp38/damon-patches You can clone it and use 'diff' for easy review of changes between different versions of the patchset. For example: $ git clone https://github.com/sjp38/damon-patches && cd damon-patches $ diff -u damon/v33 damon/v34 Sequence Of Patches =================== First three patches implement the core logics of DAMON. The 1st patch introduces basic sampling based hotness monitoring for arbitrary types of targets. Following two patches implement the core mechanisms for control of overhead and accuracy, namely regions based sampling (patch 2) and adaptive regions adjustment (patch 3). Now the essential parts of DAMON is complete, but it cannot work unless someone provides monitoring primitives for a specific use case. The following two patches make it just work for virtual address spaces monitoring. The 4th patch makes 'PG_idle' can be used by DAMON and the 5th patch implements the virtual memory address space specific monitoring primitives using page table Accessed bits and the 'PG_idle' page flag. Now DAMON just works for virtual address space monitoring via the kernel space api. To let the user space users can use DAMON, following four patches add interfaces for them. The 6th patch adds a tracepoint for monitoring results. The 7th patch implements a DAMON application kernel module, namely damon-dbgfs, that simply wraps DAMON and exposes DAMON interface to the user space via the debugfs interface. The 8th patch further exports pid of monitoring thread (kdamond) to user space for easier cpu usage accounting, and the 9th patch makes the debugfs interface to support multiple contexts. Three patches for maintainability follows. The 10th patch adds documentations for both the user space and the kernel space. The 11th patch provides unit tests (based on the kunit) while the 12th patch adds user space tests (based on the kselftest). Finally, the last patch (13th) updates the MAINTAINERS file. This patch (of 13): DAMON is a data access monitoring framework for the Linux kernel. The core mechanisms of DAMON make it - accurate (the monitoring output is useful enough for DRAM level performance-centric memory management; It might be inappropriate for CPU cache levels, though), - light-weight (the monitoring overhead is normally low enough to be applied online), and - scalable (the upper-bound of the overhead is in constant range regardless of the size of target workloads). Using this framework, hence, we can easily write efficient kernel space data access monitoring applications. For example, the kernel's memory management mechanisms can make advanced decisions using this. Experimental data access aware optimization works that incurring high access monitoring overhead could again be implemented on top of this. Due to its simple and flexible interface, providing user space interface would be also easy. Then, user space users who have some special workloads can write personalized applications for better understanding and optimizations of their workloads and systems. === Nevertheless, this commit is defining and implementing only basic access check part without the overhead-accuracy handling core logic. The basic access check is as below. The output of DAMON says what memory regions are how frequently accessed for a given duration. The resolution of the access frequency is controlled by setting ``sampling interval`` and ``aggregation interval``. In detail, DAMON checks access to each page per ``sampling interval`` and aggregates the results. In other words, counts the number of the accesses to each region. After each ``aggregation interval`` passes, DAMON calls callback functions that previously registered by users so that users can read the aggregated results and then clears the results. This can be described in below simple pseudo-code:: init() while monitoring_on: for page in monitoring_target: if accessed(page): nr_accesses[page] += 1 if time() % aggregation_interval == 0: for callback in user_registered_callbacks: callback(monitoring_target, nr_accesses) for page in monitoring_target: nr_accesses[page] = 0 if time() % update_interval == 0: update() sleep(sampling interval) The target regions constructed at the beginning of the monitoring and updated after each ``regions_update_interval``, because the target regions could be dynamically changed (e.g., mmap() or memory hotplug). The monitoring overhead of this mechanism will arbitrarily increase as the size of the target workload grows. The basic monitoring primitives for actual access check and dynamic target regions construction aren't in the core part of DAMON. Instead, it allows users to implement their own primitives that are optimized for their use case and configure DAMON to use those. In other words, users cannot use current version of DAMON without some additional works. Following commits will implement the core mechanisms for the overhead-accuracy control and default primitives implementations. Link: https://lkml.kernel.org/r/20210716081449.22187-1-sj38.park@gmail.com Link: https://lkml.kernel.org/r/20210716081449.22187-2-sj38.park@gmail.com Signed-off-by: SeongJae Park Reviewed-by: Leonard Foerster Reviewed-by: Fernand Sieber Acked-by: Shakeel Butt Cc: Jonathan Cameron Cc: Alexander Shishkin Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Jonathan Corbet Cc: David Hildenbrand Cc: David Woodhouse Cc: Marco Elver Cc: Fan Du Cc: Greg Kroah-Hartman Cc: Greg Thelen Cc: Joe Perches Cc: Mel Gorman Cc: Maximilian Heyne Cc: Minchan Kim Cc: Ingo Molnar Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rik van Riel Cc: David Rientjes Cc: Steven Rostedt (VMware) Cc: Shuah Khan Cc: Vlastimil Babka Cc: Vladimir Davydov Cc: Brendan Higgins Cc: Markus Boehme Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/damon.h | 167 ++++++++++++++++++++++ mm/Kconfig | 2 + mm/Makefile | 1 + mm/damon/Kconfig | 15 ++ mm/damon/Makefile | 3 + mm/damon/core.c | 320 ++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 508 insertions(+) create mode 100644 include/linux/damon.h create mode 100644 mm/damon/Kconfig create mode 100644 mm/damon/Makefile create mode 100644 mm/damon/core.c diff --git a/include/linux/damon.h b/include/linux/damon.h new file mode 100644 index 0000000000000..2f652602b1eaa --- /dev/null +++ b/include/linux/damon.h @@ -0,0 +1,167 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * DAMON api + * + * Author: SeongJae Park + */ + +#ifndef _DAMON_H_ +#define _DAMON_H_ + +#include +#include +#include + +struct damon_ctx; + +/** + * struct damon_primitive Monitoring primitives for given use cases. + * + * @init: Initialize primitive-internal data structures. + * @update: Update primitive-internal data structures. + * @prepare_access_checks: Prepare next access check of target regions. + * @check_accesses: Check the accesses to target regions. + * @reset_aggregated: Reset aggregated accesses monitoring results. + * @target_valid: Determine if the target is valid. + * @cleanup: Clean up the context. + * + * DAMON can be extended for various address spaces and usages. For this, + * users should register the low level primitives for their target address + * space and usecase via the &damon_ctx.primitive. Then, the monitoring thread + * (&damon_ctx.kdamond) calls @init and @prepare_access_checks before starting + * the monitoring, @update after each &damon_ctx.primitive_update_interval, and + * @check_accesses, @target_valid and @prepare_access_checks after each + * &damon_ctx.sample_interval. Finally, @reset_aggregated is called after each + * &damon_ctx.aggr_interval. + * + * @init should initialize primitive-internal data structures. For example, + * this could be used to construct proper monitoring target regions and link + * those to @damon_ctx.target. + * @update should update the primitive-internal data structures. For example, + * this could be used to update monitoring target regions for current status. + * @prepare_access_checks should manipulate the monitoring regions to be + * prepared for the next access check. + * @check_accesses should check the accesses to each region that made after the + * last preparation and update the number of observed accesses of each region. + * @reset_aggregated should reset the access monitoring results that aggregated + * by @check_accesses. + * @target_valid should check whether the target is still valid for the + * monitoring. + * @cleanup is called from @kdamond just before its termination. + */ +struct damon_primitive { + void (*init)(struct damon_ctx *context); + void (*update)(struct damon_ctx *context); + void (*prepare_access_checks)(struct damon_ctx *context); + void (*check_accesses)(struct damon_ctx *context); + void (*reset_aggregated)(struct damon_ctx *context); + bool (*target_valid)(void *target); + void (*cleanup)(struct damon_ctx *context); +}; + +/* + * struct damon_callback Monitoring events notification callbacks. + * + * @before_start: Called before starting the monitoring. + * @after_sampling: Called after each sampling. + * @after_aggregation: Called after each aggregation. + * @before_terminate: Called before terminating the monitoring. + * @private: User private data. + * + * The monitoring thread (&damon_ctx.kdamond) calls @before_start and + * @before_terminate just before starting and finishing the monitoring, + * respectively. Therefore, those are good places for installing and cleaning + * @private. + * + * The monitoring thread calls @after_sampling and @after_aggregation for each + * of the sampling intervals and aggregation intervals, respectively. + * Therefore, users can safely access the monitoring results without additional + * protection. For the reason, users are recommended to use these callback for + * the accesses to the results. + * + * If any callback returns non-zero, monitoring stops. + */ +struct damon_callback { + void *private; + + int (*before_start)(struct damon_ctx *context); + int (*after_sampling)(struct damon_ctx *context); + int (*after_aggregation)(struct damon_ctx *context); + int (*before_terminate)(struct damon_ctx *context); +}; + +/** + * struct damon_ctx - Represents a context for each monitoring. This is the + * main interface that allows users to set the attributes and get the results + * of the monitoring. + * + * @sample_interval: The time between access samplings. + * @aggr_interval: The time between monitor results aggregations. + * @primitive_update_interval: The time between monitoring primitive updates. + * + * For each @sample_interval, DAMON checks whether each region is accessed or + * not. It aggregates and keeps the access information (number of accesses to + * each region) for @aggr_interval time. DAMON also checks whether the target + * memory regions need update (e.g., by ``mmap()`` calls from the application, + * in case of virtual memory monitoring) and applies the changes for each + * @primitive_update_interval. All time intervals are in micro-seconds. + * Please refer to &struct damon_primitive and &struct damon_callback for more + * detail. + * + * @kdamond: Kernel thread who does the monitoring. + * @kdamond_stop: Notifies whether kdamond should stop. + * @kdamond_lock: Mutex for the synchronizations with @kdamond. + * + * For each monitoring context, one kernel thread for the monitoring is + * created. The pointer to the thread is stored in @kdamond. + * + * Once started, the monitoring thread runs until explicitly required to be + * terminated or every monitoring target is invalid. The validity of the + * targets is checked via the &damon_primitive.target_valid of @primitive. The + * termination can also be explicitly requested by writing non-zero to + * @kdamond_stop. The thread sets @kdamond to NULL when it terminates. + * Therefore, users can know whether the monitoring is ongoing or terminated by + * reading @kdamond. Reads and writes to @kdamond and @kdamond_stop from + * outside of the monitoring thread must be protected by @kdamond_lock. + * + * Note that the monitoring thread protects only @kdamond and @kdamond_stop via + * @kdamond_lock. Accesses to other fields must be protected by themselves. + * + * @primitive: Set of monitoring primitives for given use cases. + * @callback: Set of callbacks for monitoring events notifications. + * + * @target: Pointer to the user-defined monitoring target. + */ +struct damon_ctx { + unsigned long sample_interval; + unsigned long aggr_interval; + unsigned long primitive_update_interval; + +/* private: internal use only */ + struct timespec64 last_aggregation; + struct timespec64 last_primitive_update; + +/* public: */ + struct task_struct *kdamond; + bool kdamond_stop; + struct mutex kdamond_lock; + + struct damon_primitive primitive; + struct damon_callback callback; + + void *target; +}; + +#ifdef CONFIG_DAMON + +struct damon_ctx *damon_new_ctx(void); +void damon_destroy_ctx(struct damon_ctx *ctx); +int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, + unsigned long aggr_int, unsigned long primitive_upd_int); + +int damon_start(struct damon_ctx **ctxs, int nr_ctxs); +int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); + +#endif /* CONFIG_DAMON */ + +#endif /* _DAMON_H */ diff --git a/mm/Kconfig b/mm/Kconfig index 1f9bd3371765b..504336de9a1e0 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -886,4 +886,6 @@ config IO_MAPPING config SECRETMEM def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED +source "mm/damon/Kconfig" + endmenu diff --git a/mm/Makefile b/mm/Makefile index e3436741d5391..709674b13497e 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -128,3 +128,4 @@ obj-$(CONFIG_PTDUMP_CORE) += ptdump.o obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o obj-$(CONFIG_IO_MAPPING) += io-mapping.o obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o +obj-$(CONFIG_DAMON) += damon/ diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig new file mode 100644 index 0000000000000..d00e99ac1a154 --- /dev/null +++ b/mm/damon/Kconfig @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0-only + +menu "Data Access Monitoring" + +config DAMON + bool "DAMON: Data Access Monitoring Framework" + help + This builds a framework that allows kernel subsystems to monitor + access frequency of each memory region. The information can be useful + for performance-centric DRAM level memory management. + + See https://damonitor.github.io/doc/html/latest-damon/index.html for + more information. + +endmenu diff --git a/mm/damon/Makefile b/mm/damon/Makefile new file mode 100644 index 0000000000000..4fd2edb4becfb --- /dev/null +++ b/mm/damon/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_DAMON) := core.o diff --git a/mm/damon/core.c b/mm/damon/core.c new file mode 100644 index 0000000000000..651590bf49b17 --- /dev/null +++ b/mm/damon/core.c @@ -0,0 +1,320 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Data Access Monitor + * + * Author: SeongJae Park + */ + +#define pr_fmt(fmt) "damon: " fmt + +#include +#include +#include +#include + +static DEFINE_MUTEX(damon_lock); +static int nr_running_ctxs; + +struct damon_ctx *damon_new_ctx(void) +{ + struct damon_ctx *ctx; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return NULL; + + ctx->sample_interval = 5 * 1000; + ctx->aggr_interval = 100 * 1000; + ctx->primitive_update_interval = 60 * 1000 * 1000; + + ktime_get_coarse_ts64(&ctx->last_aggregation); + ctx->last_primitive_update = ctx->last_aggregation; + + mutex_init(&ctx->kdamond_lock); + + ctx->target = NULL; + + return ctx; +} + +void damon_destroy_ctx(struct damon_ctx *ctx) +{ + if (ctx->primitive.cleanup) + ctx->primitive.cleanup(ctx); + kfree(ctx); +} + +/** + * damon_set_attrs() - Set attributes for the monitoring. + * @ctx: monitoring context + * @sample_int: time interval between samplings + * @aggr_int: time interval between aggregations + * @primitive_upd_int: time interval between monitoring primitive updates + * + * This function should not be called while the kdamond is running. + * Every time interval is in micro-seconds. + * + * Return: 0 on success, negative error code otherwise. + */ +int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, + unsigned long aggr_int, unsigned long primitive_upd_int) +{ + ctx->sample_interval = sample_int; + ctx->aggr_interval = aggr_int; + ctx->primitive_update_interval = primitive_upd_int; + + return 0; +} + +static bool damon_kdamond_running(struct damon_ctx *ctx) +{ + bool running; + + mutex_lock(&ctx->kdamond_lock); + running = ctx->kdamond != NULL; + mutex_unlock(&ctx->kdamond_lock); + + return running; +} + +static int kdamond_fn(void *data); + +/* + * __damon_start() - Starts monitoring with given context. + * @ctx: monitoring context + * + * This function should be called while damon_lock is hold. + * + * Return: 0 on success, negative error code otherwise. + */ +static int __damon_start(struct damon_ctx *ctx) +{ + int err = -EBUSY; + + mutex_lock(&ctx->kdamond_lock); + if (!ctx->kdamond) { + err = 0; + ctx->kdamond_stop = false; + ctx->kdamond = kthread_run(kdamond_fn, ctx, "kdamond.%d", + nr_running_ctxs); + if (IS_ERR(ctx->kdamond)) { + err = PTR_ERR(ctx->kdamond); + ctx->kdamond = 0; + } + } + mutex_unlock(&ctx->kdamond_lock); + + return err; +} + +/** + * damon_start() - Starts the monitorings for a given group of contexts. + * @ctxs: an array of the pointers for contexts to start monitoring + * @nr_ctxs: size of @ctxs + * + * This function starts a group of monitoring threads for a group of monitoring + * contexts. One thread per each context is created and run in parallel. The + * caller should handle synchronization between the threads by itself. If a + * group of threads that created by other 'damon_start()' call is currently + * running, this function does nothing but returns -EBUSY. + * + * Return: 0 on success, negative error code otherwise. + */ +int damon_start(struct damon_ctx **ctxs, int nr_ctxs) +{ + int i; + int err = 0; + + mutex_lock(&damon_lock); + if (nr_running_ctxs) { + mutex_unlock(&damon_lock); + return -EBUSY; + } + + for (i = 0; i < nr_ctxs; i++) { + err = __damon_start(ctxs[i]); + if (err) + break; + nr_running_ctxs++; + } + mutex_unlock(&damon_lock); + + return err; +} + +/* + * __damon_stop() - Stops monitoring of given context. + * @ctx: monitoring context + * + * Return: 0 on success, negative error code otherwise. + */ +static int __damon_stop(struct damon_ctx *ctx) +{ + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) { + ctx->kdamond_stop = true; + mutex_unlock(&ctx->kdamond_lock); + while (damon_kdamond_running(ctx)) + usleep_range(ctx->sample_interval, + ctx->sample_interval * 2); + return 0; + } + mutex_unlock(&ctx->kdamond_lock); + + return -EPERM; +} + +/** + * damon_stop() - Stops the monitorings for a given group of contexts. + * @ctxs: an array of the pointers for contexts to stop monitoring + * @nr_ctxs: size of @ctxs + * + * Return: 0 on success, negative error code otherwise. + */ +int damon_stop(struct damon_ctx **ctxs, int nr_ctxs) +{ + int i, err = 0; + + for (i = 0; i < nr_ctxs; i++) { + /* nr_running_ctxs is decremented in kdamond_fn */ + err = __damon_stop(ctxs[i]); + if (err) + return err; + } + + return err; +} + +/* + * damon_check_reset_time_interval() - Check if a time interval is elapsed. + * @baseline: the time to check whether the interval has elapsed since + * @interval: the time interval (microseconds) + * + * See whether the given time interval has passed since the given baseline + * time. If so, it also updates the baseline to current time for next check. + * + * Return: true if the time interval has passed, or false otherwise. + */ +static bool damon_check_reset_time_interval(struct timespec64 *baseline, + unsigned long interval) +{ + struct timespec64 now; + + ktime_get_coarse_ts64(&now); + if ((timespec64_to_ns(&now) - timespec64_to_ns(baseline)) < + interval * 1000) + return false; + *baseline = now; + return true; +} + +/* + * Check whether it is time to flush the aggregated information + */ +static bool kdamond_aggregate_interval_passed(struct damon_ctx *ctx) +{ + return damon_check_reset_time_interval(&ctx->last_aggregation, + ctx->aggr_interval); +} + +/* + * Check whether it is time to check and apply the target monitoring regions + * + * Returns true if it is. + */ +static bool kdamond_need_update_primitive(struct damon_ctx *ctx) +{ + return damon_check_reset_time_interval(&ctx->last_primitive_update, + ctx->primitive_update_interval); +} + +/* + * Check whether current monitoring should be stopped + * + * The monitoring is stopped when either the user requested to stop, or all + * monitoring targets are invalid. + * + * Returns true if need to stop current monitoring. + */ +static bool kdamond_need_stop(struct damon_ctx *ctx) +{ + bool stop; + + mutex_lock(&ctx->kdamond_lock); + stop = ctx->kdamond_stop; + mutex_unlock(&ctx->kdamond_lock); + if (stop) + return true; + + if (!ctx->primitive.target_valid) + return false; + + return !ctx->primitive.target_valid(ctx->target); +} + +static void set_kdamond_stop(struct damon_ctx *ctx) +{ + mutex_lock(&ctx->kdamond_lock); + ctx->kdamond_stop = true; + mutex_unlock(&ctx->kdamond_lock); +} + +/* + * The monitoring daemon that runs as a kernel thread + */ +static int kdamond_fn(void *data) +{ + struct damon_ctx *ctx = (struct damon_ctx *)data; + + mutex_lock(&ctx->kdamond_lock); + pr_info("kdamond (%d) starts\n", ctx->kdamond->pid); + mutex_unlock(&ctx->kdamond_lock); + + if (ctx->primitive.init) + ctx->primitive.init(ctx); + if (ctx->callback.before_start && ctx->callback.before_start(ctx)) + set_kdamond_stop(ctx); + + while (!kdamond_need_stop(ctx)) { + if (ctx->primitive.prepare_access_checks) + ctx->primitive.prepare_access_checks(ctx); + if (ctx->callback.after_sampling && + ctx->callback.after_sampling(ctx)) + set_kdamond_stop(ctx); + + usleep_range(ctx->sample_interval, ctx->sample_interval + 1); + + if (ctx->primitive.check_accesses) + ctx->primitive.check_accesses(ctx); + + if (kdamond_aggregate_interval_passed(ctx)) { + if (ctx->callback.after_aggregation && + ctx->callback.after_aggregation(ctx)) + set_kdamond_stop(ctx); + if (ctx->primitive.reset_aggregated) + ctx->primitive.reset_aggregated(ctx); + } + + if (kdamond_need_update_primitive(ctx)) { + if (ctx->primitive.update) + ctx->primitive.update(ctx); + } + } + + if (ctx->callback.before_terminate && + ctx->callback.before_terminate(ctx)) + set_kdamond_stop(ctx); + if (ctx->primitive.cleanup) + ctx->primitive.cleanup(ctx); + + pr_debug("kdamond (%d) finishes\n", ctx->kdamond->pid); + mutex_lock(&ctx->kdamond_lock); + ctx->kdamond = NULL; + mutex_unlock(&ctx->kdamond_lock); + + mutex_lock(&damon_lock); + nr_running_ctxs--; + mutex_unlock(&damon_lock); + + do_exit(0); +} From 0fed6776414ec586f04f4ea5da5f47e1e51a8be1 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 24 Aug 2021 09:59:46 +1000 Subject: [PATCH 270/365] mm/damon/core: implement region-based sampling To avoid the unbounded increase of the overhead, DAMON groups adjacent pages that are assumed to have the same access frequencies into a region. As long as the assumption (pages in a region have the same access frequencies) is kept, only one page in the region is required to be checked. Thus, for each ``sampling interval``, 1. the 'prepare_access_checks' primitive picks one page in each region, 2. waits for one ``sampling interval``, 3. checks whether the page is accessed meanwhile, and 4. increases the access count of the region if so. Therefore, the monitoring overhead is controllable by adjusting the number of regions. DAMON allows both the underlying primitives and user callbacks to adjust regions for the trade-off. In other words, this commit makes DAMON to use not only time-based sampling but also space-based sampling. This scheme, however, cannot preserve the quality of the output if the assumption is not guaranteed. Next commit will address this problem. Link: https://lkml.kernel.org/r/20210716081449.22187-3-sj38.park@gmail.com Signed-off-by: SeongJae Park Reviewed-by: Leonard Foerster Reviewed-by: Fernand Sieber Acked-by: Shakeel Butt Cc: Alexander Shishkin Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Fan Du Cc: Greg Kroah-Hartman Cc: Greg Thelen Cc: Ingo Molnar Cc: Joe Perches Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Marco Elver Cc: Markus Boehme Cc: Maximilian Heyne Cc: Mel Gorman Cc: Minchan Kim Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shuah Khan Cc: Steven Rostedt (VMware) Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/damon.h | 77 ++++++++++++++++++++++- mm/damon/core.c | 143 ++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 213 insertions(+), 7 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 2f652602b1eaa..67db309ad61be 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -12,6 +12,48 @@ #include #include +/** + * struct damon_addr_range - Represents an address region of [@start, @end). + * @start: Start address of the region (inclusive). + * @end: End address of the region (exclusive). + */ +struct damon_addr_range { + unsigned long start; + unsigned long end; +}; + +/** + * struct damon_region - Represents a monitoring target region. + * @ar: The address range of the region. + * @sampling_addr: Address of the sample for the next access check. + * @nr_accesses: Access frequency of this region. + * @list: List head for siblings. + */ +struct damon_region { + struct damon_addr_range ar; + unsigned long sampling_addr; + unsigned int nr_accesses; + struct list_head list; +}; + +/** + * struct damon_target - Represents a monitoring target. + * @id: Unique identifier for this target. + * @regions_list: Head of the monitoring target regions of this target. + * @list: List head for siblings. + * + * Each monitoring context could have multiple targets. For example, a context + * for virtual memory address spaces could have multiple target processes. The + * @id of each target should be unique among the targets of the context. For + * example, in the virtual address monitoring context, it could be a pidfd or + * an address of an mm_struct. + */ +struct damon_target { + unsigned long id; + struct list_head regions_list; + struct list_head list; +}; + struct damon_ctx; /** @@ -36,7 +78,7 @@ struct damon_ctx; * * @init should initialize primitive-internal data structures. For example, * this could be used to construct proper monitoring target regions and link - * those to @damon_ctx.target. + * those to @damon_ctx.adaptive_targets. * @update should update the primitive-internal data structures. For example, * this could be used to update monitoring target regions for current status. * @prepare_access_checks should manipulate the monitoring regions to be @@ -130,7 +172,7 @@ struct damon_callback { * @primitive: Set of monitoring primitives for given use cases. * @callback: Set of callbacks for monitoring events notifications. * - * @target: Pointer to the user-defined monitoring target. + * @region_targets: Head of monitoring targets (&damon_target) list. */ struct damon_ctx { unsigned long sample_interval; @@ -149,11 +191,40 @@ struct damon_ctx { struct damon_primitive primitive; struct damon_callback callback; - void *target; + struct list_head region_targets; }; +#define damon_next_region(r) \ + (container_of(r->list.next, struct damon_region, list)) + +#define damon_prev_region(r) \ + (container_of(r->list.prev, struct damon_region, list)) + +#define damon_for_each_region(r, t) \ + list_for_each_entry(r, &t->regions_list, list) + +#define damon_for_each_region_safe(r, next, t) \ + list_for_each_entry_safe(r, next, &t->regions_list, list) + +#define damon_for_each_target(t, ctx) \ + list_for_each_entry(t, &(ctx)->region_targets, list) + +#define damon_for_each_target_safe(t, next, ctx) \ + list_for_each_entry_safe(t, next, &(ctx)->region_targets, list) + #ifdef CONFIG_DAMON +struct damon_region *damon_new_region(unsigned long start, unsigned long end); +inline void damon_insert_region(struct damon_region *r, + struct damon_region *prev, struct damon_region *next); +void damon_add_region(struct damon_region *r, struct damon_target *t); +void damon_destroy_region(struct damon_region *r); + +struct damon_target *damon_new_target(unsigned long id); +void damon_add_target(struct damon_ctx *ctx, struct damon_target *t); +void damon_free_target(struct damon_target *t); +void damon_destroy_target(struct damon_target *t); + struct damon_ctx *damon_new_ctx(void); void damon_destroy_ctx(struct damon_ctx *ctx); int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, diff --git a/mm/damon/core.c b/mm/damon/core.c index 651590bf49b17..947486a150ce2 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -15,6 +15,101 @@ static DEFINE_MUTEX(damon_lock); static int nr_running_ctxs; +/* + * Construct a damon_region struct + * + * Returns the pointer to the new struct if success, or NULL otherwise + */ +struct damon_region *damon_new_region(unsigned long start, unsigned long end) +{ + struct damon_region *region; + + region = kmalloc(sizeof(*region), GFP_KERNEL); + if (!region) + return NULL; + + region->ar.start = start; + region->ar.end = end; + region->nr_accesses = 0; + INIT_LIST_HEAD(®ion->list); + + return region; +} + +/* + * Add a region between two other regions + */ +inline void damon_insert_region(struct damon_region *r, + struct damon_region *prev, struct damon_region *next) +{ + __list_add(&r->list, &prev->list, &next->list); +} + +void damon_add_region(struct damon_region *r, struct damon_target *t) +{ + list_add_tail(&r->list, &t->regions_list); +} + +static void damon_del_region(struct damon_region *r) +{ + list_del(&r->list); +} + +static void damon_free_region(struct damon_region *r) +{ + kfree(r); +} + +void damon_destroy_region(struct damon_region *r) +{ + damon_del_region(r); + damon_free_region(r); +} + +/* + * Construct a damon_target struct + * + * Returns the pointer to the new struct if success, or NULL otherwise + */ +struct damon_target *damon_new_target(unsigned long id) +{ + struct damon_target *t; + + t = kmalloc(sizeof(*t), GFP_KERNEL); + if (!t) + return NULL; + + t->id = id; + INIT_LIST_HEAD(&t->regions_list); + + return t; +} + +void damon_add_target(struct damon_ctx *ctx, struct damon_target *t) +{ + list_add_tail(&t->list, &ctx->region_targets); +} + +static void damon_del_target(struct damon_target *t) +{ + list_del(&t->list); +} + +void damon_free_target(struct damon_target *t) +{ + struct damon_region *r, *next; + + damon_for_each_region_safe(r, next, t) + damon_free_region(r); + kfree(t); +} + +void damon_destroy_target(struct damon_target *t) +{ + damon_del_target(t); + damon_free_target(t); +} + struct damon_ctx *damon_new_ctx(void) { struct damon_ctx *ctx; @@ -32,15 +127,27 @@ struct damon_ctx *damon_new_ctx(void) mutex_init(&ctx->kdamond_lock); - ctx->target = NULL; + INIT_LIST_HEAD(&ctx->region_targets); return ctx; } -void damon_destroy_ctx(struct damon_ctx *ctx) +static void damon_destroy_targets(struct damon_ctx *ctx) { - if (ctx->primitive.cleanup) + struct damon_target *t, *next_t; + + if (ctx->primitive.cleanup) { ctx->primitive.cleanup(ctx); + return; + } + + damon_for_each_target_safe(t, next_t, ctx) + damon_destroy_target(t); +} + +void damon_destroy_ctx(struct damon_ctx *ctx) +{ + damon_destroy_targets(ctx); kfree(ctx); } @@ -217,6 +324,21 @@ static bool kdamond_aggregate_interval_passed(struct damon_ctx *ctx) ctx->aggr_interval); } +/* + * Reset the aggregated monitoring results ('nr_accesses' of each region). + */ +static void kdamond_reset_aggregated(struct damon_ctx *c) +{ + struct damon_target *t; + + damon_for_each_target(t, c) { + struct damon_region *r; + + damon_for_each_region(r, t) + r->nr_accesses = 0; + } +} + /* * Check whether it is time to check and apply the target monitoring regions * @@ -238,6 +360,7 @@ static bool kdamond_need_update_primitive(struct damon_ctx *ctx) */ static bool kdamond_need_stop(struct damon_ctx *ctx) { + struct damon_target *t; bool stop; mutex_lock(&ctx->kdamond_lock); @@ -249,7 +372,12 @@ static bool kdamond_need_stop(struct damon_ctx *ctx) if (!ctx->primitive.target_valid) return false; - return !ctx->primitive.target_valid(ctx->target); + damon_for_each_target(t, ctx) { + if (ctx->primitive.target_valid(t)) + return false; + } + + return true; } static void set_kdamond_stop(struct damon_ctx *ctx) @@ -265,6 +393,8 @@ static void set_kdamond_stop(struct damon_ctx *ctx) static int kdamond_fn(void *data) { struct damon_ctx *ctx = (struct damon_ctx *)data; + struct damon_target *t; + struct damon_region *r, *next; mutex_lock(&ctx->kdamond_lock); pr_info("kdamond (%d) starts\n", ctx->kdamond->pid); @@ -291,6 +421,7 @@ static int kdamond_fn(void *data) if (ctx->callback.after_aggregation && ctx->callback.after_aggregation(ctx)) set_kdamond_stop(ctx); + kdamond_reset_aggregated(ctx); if (ctx->primitive.reset_aggregated) ctx->primitive.reset_aggregated(ctx); } @@ -300,6 +431,10 @@ static int kdamond_fn(void *data) ctx->primitive.update(ctx); } } + damon_for_each_target(t, ctx) { + damon_for_each_region_safe(r, next, t) + damon_destroy_region(r); + } if (ctx->callback.before_terminate && ctx->callback.before_terminate(ctx)) From 1650c4e6e5d36a997ee62f7f4bf13577c1e5744f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 24 Aug 2021 09:59:46 +1000 Subject: [PATCH 271/365] mm/damon: adaptively adjust regions Even somehow the initial monitoring target regions are well constructed to fulfill the assumption (pages in same region have similar access frequencies), the data access pattern can be dynamically changed. This will result in low monitoring quality. To keep the assumption as much as possible, DAMON adaptively merges and splits each region based on their access frequency. For each ``aggregation interval``, it compares the access frequencies of adjacent regions and merges those if the frequency difference is small. Then, after it reports and clears the aggregated access frequency of each region, it splits each region into two or three regions if the total number of regions will not exceed the user-specified maximum number of regions after the split. In this way, DAMON provides its best-effort quality and minimal overhead while keeping the upper-bound overhead that users set. Link: https://lkml.kernel.org/r/20210716081449.22187-4-sj38.park@gmail.com Signed-off-by: SeongJae Park Reviewed-by: Leonard Foerster Reviewed-by: Fernand Sieber Acked-by: Shakeel Butt Cc: Alexander Shishkin Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Fan Du Cc: Greg Kroah-Hartman Cc: Greg Thelen Cc: Ingo Molnar Cc: Joe Perches Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Marco Elver Cc: Markus Boehme Cc: Maximilian Heyne Cc: Mel Gorman Cc: Minchan Kim Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shuah Khan Cc: Steven Rostedt (VMware) Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/damon.h | 30 ++++-- mm/damon/core.c | 224 ++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 237 insertions(+), 17 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 67db309ad61be..ce2a84b26cd74 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -12,6 +12,9 @@ #include #include +/* Minimal region size. Every damon_region is aligned by this. */ +#define DAMON_MIN_REGION PAGE_SIZE + /** * struct damon_addr_range - Represents an address region of [@start, @end). * @start: Start address of the region (inclusive). @@ -39,6 +42,7 @@ struct damon_region { /** * struct damon_target - Represents a monitoring target. * @id: Unique identifier for this target. + * @nr_regions: Number of monitoring target regions of this target. * @regions_list: Head of the monitoring target regions of this target. * @list: List head for siblings. * @@ -50,6 +54,7 @@ struct damon_region { */ struct damon_target { unsigned long id; + unsigned int nr_regions; struct list_head regions_list; struct list_head list; }; @@ -85,6 +90,8 @@ struct damon_ctx; * prepared for the next access check. * @check_accesses should check the accesses to each region that made after the * last preparation and update the number of observed accesses of each region. + * It should also return max number of observed accesses that made as a result + * of its update. The value will be used for regions adjustment threshold. * @reset_aggregated should reset the access monitoring results that aggregated * by @check_accesses. * @target_valid should check whether the target is still valid for the @@ -95,7 +102,7 @@ struct damon_primitive { void (*init)(struct damon_ctx *context); void (*update)(struct damon_ctx *context); void (*prepare_access_checks)(struct damon_ctx *context); - void (*check_accesses)(struct damon_ctx *context); + unsigned int (*check_accesses)(struct damon_ctx *context); void (*reset_aggregated)(struct damon_ctx *context); bool (*target_valid)(void *target); void (*cleanup)(struct damon_ctx *context); @@ -172,7 +179,9 @@ struct damon_callback { * @primitive: Set of monitoring primitives for given use cases. * @callback: Set of callbacks for monitoring events notifications. * - * @region_targets: Head of monitoring targets (&damon_target) list. + * @min_nr_regions: The minimum number of adaptive monitoring regions. + * @max_nr_regions: The maximum number of adaptive monitoring regions. + * @adaptive_targets: Head of monitoring targets (&damon_target) list. */ struct damon_ctx { unsigned long sample_interval; @@ -191,7 +200,9 @@ struct damon_ctx { struct damon_primitive primitive; struct damon_callback callback; - struct list_head region_targets; + unsigned long min_nr_regions; + unsigned long max_nr_regions; + struct list_head adaptive_targets; }; #define damon_next_region(r) \ @@ -207,28 +218,31 @@ struct damon_ctx { list_for_each_entry_safe(r, next, &t->regions_list, list) #define damon_for_each_target(t, ctx) \ - list_for_each_entry(t, &(ctx)->region_targets, list) + list_for_each_entry(t, &(ctx)->adaptive_targets, list) #define damon_for_each_target_safe(t, next, ctx) \ - list_for_each_entry_safe(t, next, &(ctx)->region_targets, list) + list_for_each_entry_safe(t, next, &(ctx)->adaptive_targets, list) #ifdef CONFIG_DAMON struct damon_region *damon_new_region(unsigned long start, unsigned long end); inline void damon_insert_region(struct damon_region *r, - struct damon_region *prev, struct damon_region *next); + struct damon_region *prev, struct damon_region *next, + struct damon_target *t); void damon_add_region(struct damon_region *r, struct damon_target *t); -void damon_destroy_region(struct damon_region *r); +void damon_destroy_region(struct damon_region *r, struct damon_target *t); struct damon_target *damon_new_target(unsigned long id); void damon_add_target(struct damon_ctx *ctx, struct damon_target *t); void damon_free_target(struct damon_target *t); void damon_destroy_target(struct damon_target *t); +unsigned int damon_nr_regions(struct damon_target *t); struct damon_ctx *damon_new_ctx(void); void damon_destroy_ctx(struct damon_ctx *ctx); int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, - unsigned long aggr_int, unsigned long primitive_upd_int); + unsigned long aggr_int, unsigned long primitive_upd_int, + unsigned long min_nr_reg, unsigned long max_nr_reg); int damon_start(struct damon_ctx **ctxs, int nr_ctxs); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); diff --git a/mm/damon/core.c b/mm/damon/core.c index 947486a150ce2..28a2c78914faa 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -10,8 +10,12 @@ #include #include #include +#include #include +/* Get a random number in [l, r) */ +#define damon_rand(l, r) (l + prandom_u32_max(r - l)) + static DEFINE_MUTEX(damon_lock); static int nr_running_ctxs; @@ -40,19 +44,23 @@ struct damon_region *damon_new_region(unsigned long start, unsigned long end) * Add a region between two other regions */ inline void damon_insert_region(struct damon_region *r, - struct damon_region *prev, struct damon_region *next) + struct damon_region *prev, struct damon_region *next, + struct damon_target *t) { __list_add(&r->list, &prev->list, &next->list); + t->nr_regions++; } void damon_add_region(struct damon_region *r, struct damon_target *t) { list_add_tail(&r->list, &t->regions_list); + t->nr_regions++; } -static void damon_del_region(struct damon_region *r) +static void damon_del_region(struct damon_region *r, struct damon_target *t) { list_del(&r->list); + t->nr_regions--; } static void damon_free_region(struct damon_region *r) @@ -60,9 +68,9 @@ static void damon_free_region(struct damon_region *r) kfree(r); } -void damon_destroy_region(struct damon_region *r) +void damon_destroy_region(struct damon_region *r, struct damon_target *t) { - damon_del_region(r); + damon_del_region(r, t); damon_free_region(r); } @@ -80,6 +88,7 @@ struct damon_target *damon_new_target(unsigned long id) return NULL; t->id = id; + t->nr_regions = 0; INIT_LIST_HEAD(&t->regions_list); return t; @@ -87,7 +96,7 @@ struct damon_target *damon_new_target(unsigned long id) void damon_add_target(struct damon_ctx *ctx, struct damon_target *t) { - list_add_tail(&t->list, &ctx->region_targets); + list_add_tail(&t->list, &ctx->adaptive_targets); } static void damon_del_target(struct damon_target *t) @@ -110,6 +119,11 @@ void damon_destroy_target(struct damon_target *t) damon_free_target(t); } +unsigned int damon_nr_regions(struct damon_target *t) +{ + return t->nr_regions; +} + struct damon_ctx *damon_new_ctx(void) { struct damon_ctx *ctx; @@ -127,7 +141,10 @@ struct damon_ctx *damon_new_ctx(void) mutex_init(&ctx->kdamond_lock); - INIT_LIST_HEAD(&ctx->region_targets); + ctx->min_nr_regions = 10; + ctx->max_nr_regions = 1000; + + INIT_LIST_HEAD(&ctx->adaptive_targets); return ctx; } @@ -157,6 +174,8 @@ void damon_destroy_ctx(struct damon_ctx *ctx) * @sample_int: time interval between samplings * @aggr_int: time interval between aggregations * @primitive_upd_int: time interval between monitoring primitive updates + * @min_nr_reg: minimal number of regions + * @max_nr_reg: maximum number of regions * * This function should not be called while the kdamond is running. * Every time interval is in micro-seconds. @@ -164,15 +183,49 @@ void damon_destroy_ctx(struct damon_ctx *ctx) * Return: 0 on success, negative error code otherwise. */ int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, - unsigned long aggr_int, unsigned long primitive_upd_int) + unsigned long aggr_int, unsigned long primitive_upd_int, + unsigned long min_nr_reg, unsigned long max_nr_reg) { + if (min_nr_reg < 3) { + pr_err("min_nr_regions (%lu) must be at least 3\n", + min_nr_reg); + return -EINVAL; + } + if (min_nr_reg > max_nr_reg) { + pr_err("invalid nr_regions. min (%lu) > max (%lu)\n", + min_nr_reg, max_nr_reg); + return -EINVAL; + } + ctx->sample_interval = sample_int; ctx->aggr_interval = aggr_int; ctx->primitive_update_interval = primitive_upd_int; + ctx->min_nr_regions = min_nr_reg; + ctx->max_nr_regions = max_nr_reg; return 0; } +/* Returns the size upper limit for each monitoring region */ +static unsigned long damon_region_sz_limit(struct damon_ctx *ctx) +{ + struct damon_target *t; + struct damon_region *r; + unsigned long sz = 0; + + damon_for_each_target(t, ctx) { + damon_for_each_region(r, t) + sz += r->ar.end - r->ar.start; + } + + if (ctx->min_nr_regions) + sz /= ctx->min_nr_regions; + if (sz < DAMON_MIN_REGION) + sz = DAMON_MIN_REGION; + + return sz; +} + static bool damon_kdamond_running(struct damon_ctx *ctx) { bool running; @@ -339,6 +392,150 @@ static void kdamond_reset_aggregated(struct damon_ctx *c) } } +#define sz_damon_region(r) (r->ar.end - r->ar.start) + +/* + * Merge two adjacent regions into one region + */ +static void damon_merge_two_regions(struct damon_target *t, + struct damon_region *l, struct damon_region *r) +{ + unsigned long sz_l = sz_damon_region(l), sz_r = sz_damon_region(r); + + l->nr_accesses = (l->nr_accesses * sz_l + r->nr_accesses * sz_r) / + (sz_l + sz_r); + l->ar.end = r->ar.end; + damon_destroy_region(r, t); +} + +#define diff_of(a, b) (a > b ? a - b : b - a) + +/* + * Merge adjacent regions having similar access frequencies + * + * t target affected by this merge operation + * thres '->nr_accesses' diff threshold for the merge + * sz_limit size upper limit of each region + */ +static void damon_merge_regions_of(struct damon_target *t, unsigned int thres, + unsigned long sz_limit) +{ + struct damon_region *r, *prev = NULL, *next; + + damon_for_each_region_safe(r, next, t) { + if (prev && prev->ar.end == r->ar.start && + diff_of(prev->nr_accesses, r->nr_accesses) <= thres && + sz_damon_region(prev) + sz_damon_region(r) <= sz_limit) + damon_merge_two_regions(t, prev, r); + else + prev = r; + } +} + +/* + * Merge adjacent regions having similar access frequencies + * + * threshold '->nr_accesses' diff threshold for the merge + * sz_limit size upper limit of each region + * + * This function merges monitoring target regions which are adjacent and their + * access frequencies are similar. This is for minimizing the monitoring + * overhead under the dynamically changeable access pattern. If a merge was + * unnecessarily made, later 'kdamond_split_regions()' will revert it. + */ +static void kdamond_merge_regions(struct damon_ctx *c, unsigned int threshold, + unsigned long sz_limit) +{ + struct damon_target *t; + + damon_for_each_target(t, c) + damon_merge_regions_of(t, threshold, sz_limit); +} + +/* + * Split a region in two + * + * r the region to be split + * sz_r size of the first sub-region that will be made + */ +static void damon_split_region_at(struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + unsigned long sz_r) +{ + struct damon_region *new; + + new = damon_new_region(r->ar.start + sz_r, r->ar.end); + if (!new) + return; + + r->ar.end = new->ar.start; + + damon_insert_region(new, r, damon_next_region(r), t); +} + +/* Split every region in the given target into 'nr_subs' regions */ +static void damon_split_regions_of(struct damon_ctx *ctx, + struct damon_target *t, int nr_subs) +{ + struct damon_region *r, *next; + unsigned long sz_region, sz_sub = 0; + int i; + + damon_for_each_region_safe(r, next, t) { + sz_region = r->ar.end - r->ar.start; + + for (i = 0; i < nr_subs - 1 && + sz_region > 2 * DAMON_MIN_REGION; i++) { + /* + * Randomly select size of left sub-region to be at + * least 10 percent and at most 90% of original region + */ + sz_sub = ALIGN_DOWN(damon_rand(1, 10) * + sz_region / 10, DAMON_MIN_REGION); + /* Do not allow blank region */ + if (sz_sub == 0 || sz_sub >= sz_region) + continue; + + damon_split_region_at(ctx, t, r, sz_sub); + sz_region = sz_sub; + } + } +} + +/* + * Split every target region into randomly-sized small regions + * + * This function splits every target region into random-sized small regions if + * current total number of the regions is equal or smaller than half of the + * user-specified maximum number of regions. This is for maximizing the + * monitoring accuracy under the dynamically changeable access patterns. If a + * split was unnecessarily made, later 'kdamond_merge_regions()' will revert + * it. + */ +static void kdamond_split_regions(struct damon_ctx *ctx) +{ + struct damon_target *t; + unsigned int nr_regions = 0; + static unsigned int last_nr_regions; + int nr_subregions = 2; + + damon_for_each_target(t, ctx) + nr_regions += damon_nr_regions(t); + + if (nr_regions > ctx->max_nr_regions / 2) + return; + + /* Maybe the middle of the region has different access frequency */ + if (last_nr_regions == nr_regions && + nr_regions < ctx->max_nr_regions / 3) + nr_subregions = 3; + + damon_for_each_target(t, ctx) + damon_split_regions_of(ctx, t, nr_subregions); + + last_nr_regions = nr_regions; +} + /* * Check whether it is time to check and apply the target monitoring regions * @@ -395,6 +592,8 @@ static int kdamond_fn(void *data) struct damon_ctx *ctx = (struct damon_ctx *)data; struct damon_target *t; struct damon_region *r, *next; + unsigned int max_nr_accesses = 0; + unsigned long sz_limit = 0; mutex_lock(&ctx->kdamond_lock); pr_info("kdamond (%d) starts\n", ctx->kdamond->pid); @@ -405,6 +604,8 @@ static int kdamond_fn(void *data) if (ctx->callback.before_start && ctx->callback.before_start(ctx)) set_kdamond_stop(ctx); + sz_limit = damon_region_sz_limit(ctx); + while (!kdamond_need_stop(ctx)) { if (ctx->primitive.prepare_access_checks) ctx->primitive.prepare_access_checks(ctx); @@ -415,13 +616,17 @@ static int kdamond_fn(void *data) usleep_range(ctx->sample_interval, ctx->sample_interval + 1); if (ctx->primitive.check_accesses) - ctx->primitive.check_accesses(ctx); + max_nr_accesses = ctx->primitive.check_accesses(ctx); if (kdamond_aggregate_interval_passed(ctx)) { + kdamond_merge_regions(ctx, + max_nr_accesses / 10, + sz_limit); if (ctx->callback.after_aggregation && ctx->callback.after_aggregation(ctx)) set_kdamond_stop(ctx); kdamond_reset_aggregated(ctx); + kdamond_split_regions(ctx); if (ctx->primitive.reset_aggregated) ctx->primitive.reset_aggregated(ctx); } @@ -429,11 +634,12 @@ static int kdamond_fn(void *data) if (kdamond_need_update_primitive(ctx)) { if (ctx->primitive.update) ctx->primitive.update(ctx); + sz_limit = damon_region_sz_limit(ctx); } } damon_for_each_target(t, ctx) { damon_for_each_region_safe(r, next, t) - damon_destroy_region(r); + damon_destroy_region(r, t); } if (ctx->callback.before_terminate && From 317bb7c778abe8d40bf81039e60df3bbcd416e8d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 24 Aug 2021 09:59:46 +1000 Subject: [PATCH 272/365] mm/idle_page_tracking: make PG_idle reusable PG_idle and PG_young allow the two PTE Accessed bit users, Idle Page Tracking and the reclaim logic concurrently work while not interfering with each other. That is, when they need to clear the Accessed bit, they set PG_young to represent the previous state of the bit, respectively. And when they need to read the bit, if the bit is cleared, they further read the PG_young to know whether the other has cleared the bit meanwhile or not. For yet another user of the PTE Accessed bit, we could add another page flag, or extend the mechanism to use the flags. For the DAMON usecase, however, we don't need to do that just yet. IDLE_PAGE_TRACKING and DAMON are mutually exclusive, so there's only ever going to be one user of the current set of flags. In this commit, we split out the CONFIG options to allow for the use of PG_young and PG_idle outside of idle page tracking. In the next commit, DAMON's reference implementation of the virtual memory address space monitoring primitives will use it. Link: https://lkml.kernel.org/r/20210716081449.22187-5-sj38.park@gmail.com Signed-off-by: SeongJae Park Reviewed-by: Shakeel Butt Reviewed-by: Fernand Sieber Cc: Alexander Shishkin Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Fan Du Cc: Greg Kroah-Hartman Cc: Greg Thelen Cc: Ingo Molnar Cc: Joe Perches Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Maximilian Heyne Cc: Mel Gorman Cc: Minchan Kim Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shuah Khan Cc: Steven Rostedt (VMware) Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/page-flags.h | 4 ++-- include/linux/page_ext.h | 2 +- include/linux/page_idle.h | 6 +++--- include/trace/events/mmflags.h | 2 +- mm/Kconfig | 8 ++++++++ mm/page_ext.c | 12 +++++++++++- mm/page_idle.c | 10 ---------- 7 files changed, 26 insertions(+), 18 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 62dc61db9165a..b34486156926e 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -131,7 +131,7 @@ enum pageflags { #ifdef CONFIG_MEMORY_FAILURE PG_hwpoison, /* hardware poisoned page. Don't touch */ #endif -#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) +#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT) PG_young, PG_idle, #endif @@ -441,7 +441,7 @@ PAGEFLAG_FALSE(HWPoison) #define __PG_HWPOISON 0 #endif -#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) +#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT) TESTPAGEFLAG(Young, young, PF_ANY) SETPAGEFLAG(Young, young, PF_ANY) TESTCLEARFLAG(Young, young, PF_ANY) diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index aff81ba31bd8b..fabb2e1e087f4 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -19,7 +19,7 @@ struct page_ext_operations { enum page_ext_flags { PAGE_EXT_OWNER, PAGE_EXT_OWNER_ALLOCATED, -#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT) +#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) PAGE_EXT_YOUNG, PAGE_EXT_IDLE, #endif diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h index 1e894d34bdceb..d8a6aecf99cb9 100644 --- a/include/linux/page_idle.h +++ b/include/linux/page_idle.h @@ -6,7 +6,7 @@ #include #include -#ifdef CONFIG_IDLE_PAGE_TRACKING +#ifdef CONFIG_PAGE_IDLE_FLAG #ifdef CONFIG_64BIT static inline bool page_is_young(struct page *page) @@ -106,7 +106,7 @@ static inline void clear_page_idle(struct page *page) } #endif /* CONFIG_64BIT */ -#else /* !CONFIG_IDLE_PAGE_TRACKING */ +#else /* !CONFIG_PAGE_IDLE_FLAG */ static inline bool page_is_young(struct page *page) { @@ -135,6 +135,6 @@ static inline void clear_page_idle(struct page *page) { } -#endif /* CONFIG_IDLE_PAGE_TRACKING */ +#endif /* CONFIG_PAGE_IDLE_FLAG */ #endif /* _LINUX_MM_PAGE_IDLE_H */ diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index f160484afc5ce..a26dbefdf294b 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -75,7 +75,7 @@ #define IF_HAVE_PG_HWPOISON(flag,string) #endif -#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) +#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT) #define IF_HAVE_PG_IDLE(flag,string) ,{1UL << flag, string} #else #define IF_HAVE_PG_IDLE(flag,string) diff --git a/mm/Kconfig b/mm/Kconfig index 504336de9a1e0..d0b85dc124291 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -739,10 +739,18 @@ config DEFERRED_STRUCT_PAGE_INIT lifetime of the system until these kthreads finish the initialisation. +config PAGE_IDLE_FLAG + bool "Add PG_idle and PG_young flags" + help + This feature adds PG_idle and PG_young flags in 'struct page'. PTE + Accessed bit writers can set the state of the bit in the flags to let + other PTE Accessed bit readers don't disturbed. + config IDLE_PAGE_TRACKING bool "Enable idle page tracking" depends on SYSFS && MMU && BROKEN select PAGE_EXTENSION if !64BIT + select PAGE_IDLE_FLAG help This feature allows to estimate the amount of user pages that have not been touched during a given period of time. This information can diff --git a/mm/page_ext.c b/mm/page_ext.c index 293b2685fc489..dfb91653d359e 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -58,11 +58,21 @@ * can utilize this callback to initialize the state of it correctly. */ +#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) +static bool need_page_idle(void) +{ + return true; +} +struct page_ext_operations page_idle_ops = { + .need = need_page_idle, +}; +#endif + static struct page_ext_operations *page_ext_ops[] = { #ifdef CONFIG_PAGE_OWNER &page_owner_ops, #endif -#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT) +#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) &page_idle_ops, #endif }; diff --git a/mm/page_idle.c b/mm/page_idle.c index 64e5344a992ca..edead6a8a5f91 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -207,16 +207,6 @@ static const struct attribute_group page_idle_attr_group = { .name = "page_idle", }; -#ifndef CONFIG_64BIT -static bool need_page_idle(void) -{ - return true; -} -struct page_ext_operations page_idle_ops = { - .need = need_page_idle, -}; -#endif - static int __init page_idle_init(void) { int err; From 1ba27cffb7ee69632e74e813156e315947a09211 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 24 Aug 2021 09:59:46 +1000 Subject: [PATCH 273/365] mm/PAGE_IDLE_FLAG: Set PAGE_EXTENSION for none-64BIT Commit 128fd80c4c07 ("mm/idle_page_tracking: Make PG_idle reusable") of linux-mm[1] allows PAGE_IDLE_FLAG to be set without PAGE_EXTENSION while 64BIT is not set. This makes 'enum page_ext_flags' undefined, so build fails as below for the config (!64BIT, !PAGE_EXTENSION, and IDLE_PAGE_FLAG). $ make ARCH=i386 allnoconfig $ echo 'CONFIG_PAGE_IDLE_FLAG=y' >> .config $ make olddefconfig $ make ARCH=i386 [...] ../include/linux/page_idle.h: In function `folio_test_young': ../include/linux/page_idle.h:25:18: error: `PAGE_EXT_YOUNG' undeclared (first use in this function); did you mean `PAGEOUTRUN'? return test_bit(PAGE_EXT_YOUNG, &page_ext->flags); [...] This commit fixes this issue by making PAGE_EXTENSION to be set when 64BIT is not set and PAGE_IDLE_FLAG is set. [1] https://github.com/hnaz/linux-mm/commit/128fd80c4c07 Link: https://lkml.kernel.org/r/20210806095153.6444-1-sj38.park@gmail.com Fixes: 128fd80c4c07 ("mm/idle_page_tracking: Make PG_idle reusable") Signed-off-by: SeongJae Park Reported-by: Randy Dunlap Acked-by: Randy Dunlap Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/Kconfig b/mm/Kconfig index d0b85dc124291..50ca602edeb61 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -741,6 +741,7 @@ config DEFERRED_STRUCT_PAGE_INIT config PAGE_IDLE_FLAG bool "Add PG_idle and PG_young flags" + select PAGE_EXTENSION if !64BIT help This feature adds PG_idle and PG_young flags in 'struct page'. PTE Accessed bit writers can set the state of the bit in the flags to let @@ -749,7 +750,6 @@ config PAGE_IDLE_FLAG config IDLE_PAGE_TRACKING bool "Enable idle page tracking" depends on SYSFS && MMU && BROKEN - select PAGE_EXTENSION if !64BIT select PAGE_IDLE_FLAG help This feature allows to estimate the amount of user pages that have From 4c1f13fd41112e00ff3b17d1cd72c3dcd68f9f62 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 24 Aug 2021 09:59:46 +1000 Subject: [PATCH 274/365] mm-idle_page_tracking-make-pg_idle-reusable-fix-fix tweak Kconfig text Cc: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/Kconfig | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/Kconfig b/mm/Kconfig index 50ca602edeb61..8d3a005edea5d 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -743,9 +743,9 @@ config PAGE_IDLE_FLAG bool "Add PG_idle and PG_young flags" select PAGE_EXTENSION if !64BIT help - This feature adds PG_idle and PG_young flags in 'struct page'. PTE - Accessed bit writers can set the state of the bit in the flags to let - other PTE Accessed bit readers don't disturbed. + This adds PG_idle and PG_young flags to 'struct page'. PTE Accessed + bit writers can set the state of the bit in the flags so that PTE + Accessed bit readers may avoid disturbance. config IDLE_PAGE_TRACKING bool "Enable idle page tracking" From 78c85147b72d7626e312c0115b0f407de1fdf83a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 24 Aug 2021 09:59:46 +1000 Subject: [PATCH 275/365] mm/damon/Kconfig: Hide PAGE_IDLE_FLAG from users Commit 2a058a1a9914 ("mm/idle_page_tracking: make PG_idle reusable") of linux-mm[1] makes CONFIG_PAGE_IDLE_FLAG option to be presented to the user. However, the option is hard to be understood by users. Also, it is not intended to be set by users but other kernel subsystems like DAMON or IDLE_PAGE_TRACKING. To avoid confusions, this commit removes the prompt so that the option is not presented to the user. [1] https://github.com/hnaz/linux-mm/commit/2a058a1a9914 Link: https://lkml.kernel.org/r/20210813081238.34705-1-sj38.park@gmail.com Signed-off-by: SeongJae Park Suggested-by: Andrew Morton Acked-by: Valdis Kletnieks Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/Kconfig b/mm/Kconfig index 8d3a005edea5d..be726f61ef9bc 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -740,7 +740,7 @@ config DEFERRED_STRUCT_PAGE_INIT initialisation. config PAGE_IDLE_FLAG - bool "Add PG_idle and PG_young flags" + bool select PAGE_EXTENSION if !64BIT help This adds PG_idle and PG_young flags to 'struct page'. PTE Accessed From bc3f00f696805f924a19446f711730f389fb30a3 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 24 Aug 2021 09:59:47 +1000 Subject: [PATCH 276/365] mm/damon: implement primitives for the virtual memory address spaces This commit introduces a reference implementation of the address space specific low level primitives for the virtual address space, so that users of DAMON can easily monitor the data accesses on virtual address spaces of specific processes by simply configuring the implementation to be used by DAMON. The low level primitives for the fundamental access monitoring are defined in two parts: 1. Identification of the monitoring target address range for the address space. 2. Access check of specific address range in the target space. The reference implementation for the virtual address space does the works as below. PTE Accessed-bit Based Access Check ----------------------------------- The implementation uses PTE Accessed-bit for basic access checks. That is, it clears the bit for the next sampling target page and checks whether it is set again after one sampling period. This could disturb the reclaim logic. DAMON uses ``PG_idle`` and ``PG_young`` page flags to solve the conflict, as Idle page tracking does. VMA-based Target Address Range Construction ------------------------------------------- Only small parts in the super-huge virtual address space of the processes are mapped to physical memory and accessed. Thus, tracking the unmapped address regions is just wasteful. However, because DAMON can deal with some level of noise using the adaptive regions adjustment mechanism, tracking every mapping is not strictly required but could even incur a high overhead in some cases. That said, too huge unmapped areas inside the monitoring target should be removed to not take the time for the adaptive mechanism. For the reason, this implementation converts the complex mappings to three distinct regions that cover every mapped area of the address space. Also, the two gaps between the three regions are the two biggest unmapped areas in the given address space. The two biggest unmapped areas would be the gap between the heap and the uppermost mmap()-ed region, and the gap between the lowermost mmap()-ed region and the stack in most of the cases. Because these gaps are exceptionally huge in usual address spaces, excluding these will be sufficient to make a reasonable trade-off. Below shows this in detail:: (small mmap()-ed regions and munmap()-ed regions) Link: https://lkml.kernel.org/r/20210716081449.22187-6-sj38.park@gmail.com Signed-off-by: SeongJae Park Reviewed-by: Leonard Foerster Reviewed-by: Fernand Sieber Acked-by: Shakeel Butt Cc: Alexander Shishkin Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Fan Du Cc: Greg Kroah-Hartman Cc: Greg Thelen Cc: Ingo Molnar Cc: Joe Perches Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Marco Elver Cc: Markus Boehme Cc: Maximilian Heyne Cc: Mel Gorman Cc: Minchan Kim Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shuah Khan Cc: Steven Rostedt (VMware) Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/damon.h | 13 + mm/damon/Kconfig | 9 + mm/damon/Makefile | 1 + mm/damon/vaddr.c | 606 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 629 insertions(+) create mode 100644 mm/damon/vaddr.c diff --git a/include/linux/damon.h b/include/linux/damon.h index ce2a84b26cd74..edb350e52b934 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -249,4 +249,17 @@ int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); #endif /* CONFIG_DAMON */ +#ifdef CONFIG_DAMON_VADDR + +/* Monitoring primitives for virtual memory address spaces */ +void damon_va_init(struct damon_ctx *ctx); +void damon_va_update(struct damon_ctx *ctx); +void damon_va_prepare_access_checks(struct damon_ctx *ctx); +unsigned int damon_va_check_accesses(struct damon_ctx *ctx); +bool damon_va_target_valid(void *t); +void damon_va_cleanup(struct damon_ctx *ctx); +void damon_va_set_primitives(struct damon_ctx *ctx); + +#endif /* CONFIG_DAMON_VADDR */ + #endif /* _DAMON_H */ diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index d00e99ac1a154..8ae080c52950e 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -12,4 +12,13 @@ config DAMON See https://damonitor.github.io/doc/html/latest-damon/index.html for more information. +config DAMON_VADDR + bool "Data access monitoring primitives for virtual address spaces" + depends on DAMON && MMU + select PAGE_EXTENSION if !64BIT + select PAGE_IDLE_FLAG + help + This builds the default data access monitoring primitives for DAMON + that works for virtual address spaces. + endmenu diff --git a/mm/damon/Makefile b/mm/damon/Makefile index 4fd2edb4becfb..6ebbd08aed673 100644 --- a/mm/damon/Makefile +++ b/mm/damon/Makefile @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_DAMON) := core.o +obj-$(CONFIG_DAMON_VADDR) += vaddr.o diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c new file mode 100644 index 0000000000000..91d4a95cedf74 --- /dev/null +++ b/mm/damon/vaddr.c @@ -0,0 +1,606 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DAMON Primitives for Virtual Address Spaces + * + * Author: SeongJae Park + */ + +#define pr_fmt(fmt) "damon-va: " fmt + +#include +#include +#include +#include +#include +#include +#include + +/* Get a random number in [l, r) */ +#define damon_rand(l, r) (l + prandom_u32_max(r - l)) + +/* + * 't->id' should be the pointer to the relevant 'struct pid' having reference + * count. Caller must put the returned task, unless it is NULL. + */ +#define damon_get_task_struct(t) \ + (get_pid_task((struct pid *)t->id, PIDTYPE_PID)) + +/* + * Get the mm_struct of the given target + * + * Caller _must_ put the mm_struct after use, unless it is NULL. + * + * Returns the mm_struct of the target on success, NULL on failure + */ +static struct mm_struct *damon_get_mm(struct damon_target *t) +{ + struct task_struct *task; + struct mm_struct *mm; + + task = damon_get_task_struct(t); + if (!task) + return NULL; + + mm = get_task_mm(task); + put_task_struct(task); + return mm; +} + +/* + * Functions for the initial monitoring target regions construction + */ + +/* + * Size-evenly split a region into 'nr_pieces' small regions + * + * Returns 0 on success, or negative error code otherwise. + */ +static int damon_va_evenly_split_region(struct damon_target *t, + struct damon_region *r, unsigned int nr_pieces) +{ + unsigned long sz_orig, sz_piece, orig_end; + struct damon_region *n = NULL, *next; + unsigned long start; + + if (!r || !nr_pieces) + return -EINVAL; + + orig_end = r->ar.end; + sz_orig = r->ar.end - r->ar.start; + sz_piece = ALIGN_DOWN(sz_orig / nr_pieces, DAMON_MIN_REGION); + + if (!sz_piece) + return -EINVAL; + + r->ar.end = r->ar.start + sz_piece; + next = damon_next_region(r); + for (start = r->ar.end; start + sz_piece <= orig_end; + start += sz_piece) { + n = damon_new_region(start, start + sz_piece); + if (!n) + return -ENOMEM; + damon_insert_region(n, r, next, t); + r = n; + } + /* complement last region for possible rounding error */ + if (n) + n->ar.end = orig_end; + + return 0; +} + +static unsigned long sz_range(struct damon_addr_range *r) +{ + return r->end - r->start; +} + +static void swap_ranges(struct damon_addr_range *r1, + struct damon_addr_range *r2) +{ + struct damon_addr_range tmp; + + tmp = *r1; + *r1 = *r2; + *r2 = tmp; +} + +/* + * Find three regions separated by two biggest unmapped regions + * + * vma the head vma of the target address space + * regions an array of three address ranges that results will be saved + * + * This function receives an address space and finds three regions in it which + * separated by the two biggest unmapped regions in the space. Please refer to + * below comments of '__damon_va_init_regions()' function to know why this is + * necessary. + * + * Returns 0 if success, or negative error code otherwise. + */ +static int __damon_va_three_regions(struct vm_area_struct *vma, + struct damon_addr_range regions[3]) +{ + struct damon_addr_range gap = {0}, first_gap = {0}, second_gap = {0}; + struct vm_area_struct *last_vma = NULL; + unsigned long start = 0; + struct rb_root rbroot; + + /* Find two biggest gaps so that first_gap > second_gap > others */ + for (; vma; vma = vma->vm_next) { + if (!last_vma) { + start = vma->vm_start; + goto next; + } + + if (vma->rb_subtree_gap <= sz_range(&second_gap)) { + rbroot.rb_node = &vma->vm_rb; + vma = rb_entry(rb_last(&rbroot), + struct vm_area_struct, vm_rb); + goto next; + } + + gap.start = last_vma->vm_end; + gap.end = vma->vm_start; + if (sz_range(&gap) > sz_range(&second_gap)) { + swap_ranges(&gap, &second_gap); + if (sz_range(&second_gap) > sz_range(&first_gap)) + swap_ranges(&second_gap, &first_gap); + } +next: + last_vma = vma; + } + + if (!sz_range(&second_gap) || !sz_range(&first_gap)) + return -EINVAL; + + /* Sort the two biggest gaps by address */ + if (first_gap.start > second_gap.start) + swap_ranges(&first_gap, &second_gap); + + /* Store the result */ + regions[0].start = ALIGN(start, DAMON_MIN_REGION); + regions[0].end = ALIGN(first_gap.start, DAMON_MIN_REGION); + regions[1].start = ALIGN(first_gap.end, DAMON_MIN_REGION); + regions[1].end = ALIGN(second_gap.start, DAMON_MIN_REGION); + regions[2].start = ALIGN(second_gap.end, DAMON_MIN_REGION); + regions[2].end = ALIGN(last_vma->vm_end, DAMON_MIN_REGION); + + return 0; +} + +/* + * Get the three regions in the given target (task) + * + * Returns 0 on success, negative error code otherwise. + */ +static int damon_va_three_regions(struct damon_target *t, + struct damon_addr_range regions[3]) +{ + struct mm_struct *mm; + int rc; + + mm = damon_get_mm(t); + if (!mm) + return -EINVAL; + + mmap_read_lock(mm); + rc = __damon_va_three_regions(mm->mmap, regions); + mmap_read_unlock(mm); + + mmput(mm); + return rc; +} + +/* + * Initialize the monitoring target regions for the given target (task) + * + * t the given target + * + * Because only a number of small portions of the entire address space + * is actually mapped to the memory and accessed, monitoring the unmapped + * regions is wasteful. That said, because we can deal with small noises, + * tracking every mapping is not strictly required but could even incur a high + * overhead if the mapping frequently changes or the number of mappings is + * high. The adaptive regions adjustment mechanism will further help to deal + * with the noise by simply identifying the unmapped areas as a region that + * has no access. Moreover, applying the real mappings that would have many + * unmapped areas inside will make the adaptive mechanism quite complex. That + * said, too huge unmapped areas inside the monitoring target should be removed + * to not take the time for the adaptive mechanism. + * + * For the reason, we convert the complex mappings to three distinct regions + * that cover every mapped area of the address space. Also the two gaps + * between the three regions are the two biggest unmapped areas in the given + * address space. In detail, this function first identifies the start and the + * end of the mappings and the two biggest unmapped areas of the address space. + * Then, it constructs the three regions as below: + * + * [mappings[0]->start, big_two_unmapped_areas[0]->start) + * [big_two_unmapped_areas[0]->end, big_two_unmapped_areas[1]->start) + * [big_two_unmapped_areas[1]->end, mappings[nr_mappings - 1]->end) + * + * As usual memory map of processes is as below, the gap between the heap and + * the uppermost mmap()-ed region, and the gap between the lowermost mmap()-ed + * region and the stack will be two biggest unmapped regions. Because these + * gaps are exceptionally huge areas in usual address space, excluding these + * two biggest unmapped regions will be sufficient to make a trade-off. + * + * + * + * + * (other mmap()-ed regions and small unmapped regions) + * + * + * + */ +static void __damon_va_init_regions(struct damon_ctx *ctx, + struct damon_target *t) +{ + struct damon_region *r; + struct damon_addr_range regions[3]; + unsigned long sz = 0, nr_pieces; + int i; + + if (damon_va_three_regions(t, regions)) { + pr_err("Failed to get three regions of target %lu\n", t->id); + return; + } + + for (i = 0; i < 3; i++) + sz += regions[i].end - regions[i].start; + if (ctx->min_nr_regions) + sz /= ctx->min_nr_regions; + if (sz < DAMON_MIN_REGION) + sz = DAMON_MIN_REGION; + + /* Set the initial three regions of the target */ + for (i = 0; i < 3; i++) { + r = damon_new_region(regions[i].start, regions[i].end); + if (!r) { + pr_err("%d'th init region creation failed\n", i); + return; + } + damon_add_region(r, t); + + nr_pieces = (regions[i].end - regions[i].start) / sz; + damon_va_evenly_split_region(t, r, nr_pieces); + } +} + +/* Initialize '->regions_list' of every target (task) */ +void damon_va_init(struct damon_ctx *ctx) +{ + struct damon_target *t; + + damon_for_each_target(t, ctx) { + /* the user may set the target regions as they want */ + if (!damon_nr_regions(t)) + __damon_va_init_regions(ctx, t); + } +} + +/* + * Functions for the dynamic monitoring target regions update + */ + +/* + * Check whether a region is intersecting an address range + * + * Returns true if it is. + */ +static bool damon_intersect(struct damon_region *r, struct damon_addr_range *re) +{ + return !(r->ar.end <= re->start || re->end <= r->ar.start); +} + +/* + * Update damon regions for the three big regions of the given target + * + * t the given target + * bregions the three big regions of the target + */ +static void damon_va_apply_three_regions(struct damon_target *t, + struct damon_addr_range bregions[3]) +{ + struct damon_region *r, *next; + unsigned int i = 0; + + /* Remove regions which are not in the three big regions now */ + damon_for_each_region_safe(r, next, t) { + for (i = 0; i < 3; i++) { + if (damon_intersect(r, &bregions[i])) + break; + } + if (i == 3) + damon_destroy_region(r, t); + } + + /* Adjust intersecting regions to fit with the three big regions */ + for (i = 0; i < 3; i++) { + struct damon_region *first = NULL, *last; + struct damon_region *newr; + struct damon_addr_range *br; + + br = &bregions[i]; + /* Get the first and last regions which intersects with br */ + damon_for_each_region(r, t) { + if (damon_intersect(r, br)) { + if (!first) + first = r; + last = r; + } + if (r->ar.start >= br->end) + break; + } + if (!first) { + /* no damon_region intersects with this big region */ + newr = damon_new_region( + ALIGN_DOWN(br->start, + DAMON_MIN_REGION), + ALIGN(br->end, DAMON_MIN_REGION)); + if (!newr) + continue; + damon_insert_region(newr, damon_prev_region(r), r, t); + } else { + first->ar.start = ALIGN_DOWN(br->start, + DAMON_MIN_REGION); + last->ar.end = ALIGN(br->end, DAMON_MIN_REGION); + } + } +} + +/* + * Update regions for current memory mappings + */ +void damon_va_update(struct damon_ctx *ctx) +{ + struct damon_addr_range three_regions[3]; + struct damon_target *t; + + damon_for_each_target(t, ctx) { + if (damon_va_three_regions(t, three_regions)) + continue; + damon_va_apply_three_regions(t, three_regions); + } +} + +/* + * Get an online page for a pfn if it's in the LRU list. Otherwise, returns + * NULL. + * + * The body of this function is stolen from the 'page_idle_get_page()'. We + * steal rather than reuse it because the code is quite simple. + */ +static struct page *damon_get_page(unsigned long pfn) +{ + struct page *page = pfn_to_online_page(pfn); + + if (!page || !PageLRU(page) || !get_page_unless_zero(page)) + return NULL; + + if (unlikely(!PageLRU(page))) { + put_page(page); + page = NULL; + } + return page; +} + +static void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, + unsigned long addr) +{ + bool referenced = false; + struct page *page = damon_get_page(pte_pfn(*pte)); + + if (!page) + return; + + if (pte_young(*pte)) { + referenced = true; + *pte = pte_mkold(*pte); + } + +#ifdef CONFIG_MMU_NOTIFIER + if (mmu_notifier_clear_young(mm, addr, addr + PAGE_SIZE)) + referenced = true; +#endif /* CONFIG_MMU_NOTIFIER */ + + if (referenced) + set_page_young(page); + + set_page_idle(page); + put_page(page); +} + +static void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, + unsigned long addr) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + bool referenced = false; + struct page *page = damon_get_page(pmd_pfn(*pmd)); + + if (!page) + return; + + if (pmd_young(*pmd)) { + referenced = true; + *pmd = pmd_mkold(*pmd); + } + +#ifdef CONFIG_MMU_NOTIFIER + if (mmu_notifier_clear_young(mm, addr, + addr + ((1UL) << HPAGE_PMD_SHIFT))) + referenced = true; +#endif /* CONFIG_MMU_NOTIFIER */ + + if (referenced) + set_page_young(page); + + set_page_idle(page); + put_page(page); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +} + +static void damon_va_mkold(struct mm_struct *mm, unsigned long addr) +{ + pte_t *pte = NULL; + pmd_t *pmd = NULL; + spinlock_t *ptl; + + if (follow_invalidate_pte(mm, addr, NULL, &pte, &pmd, &ptl)) + return; + + if (pte) { + damon_ptep_mkold(pte, mm, addr); + pte_unmap_unlock(pte, ptl); + } else { + damon_pmdp_mkold(pmd, mm, addr); + spin_unlock(ptl); + } +} + +/* + * Functions for the access checking of the regions + */ + +static void damon_va_prepare_access_check(struct damon_ctx *ctx, + struct mm_struct *mm, struct damon_region *r) +{ + r->sampling_addr = damon_rand(r->ar.start, r->ar.end); + + damon_va_mkold(mm, r->sampling_addr); +} + +void damon_va_prepare_access_checks(struct damon_ctx *ctx) +{ + struct damon_target *t; + struct mm_struct *mm; + struct damon_region *r; + + damon_for_each_target(t, ctx) { + mm = damon_get_mm(t); + if (!mm) + continue; + damon_for_each_region(r, t) + damon_va_prepare_access_check(ctx, mm, r); + mmput(mm); + } +} + +static bool damon_va_young(struct mm_struct *mm, unsigned long addr, + unsigned long *page_sz) +{ + pte_t *pte = NULL; + pmd_t *pmd = NULL; + spinlock_t *ptl; + struct page *page; + bool young = false; + + if (follow_invalidate_pte(mm, addr, NULL, &pte, &pmd, &ptl)) + return false; + + *page_sz = PAGE_SIZE; + if (pte) { + page = damon_get_page(pte_pfn(*pte)); + if (page && (pte_young(*pte) || !page_is_idle(page) || + mmu_notifier_test_young(mm, addr))) + young = true; + if (page) + put_page(page); + pte_unmap_unlock(pte, ptl); + return young; + } + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + page = damon_get_page(pmd_pfn(*pmd)); + if (page && (pmd_young(*pmd) || !page_is_idle(page) || + mmu_notifier_test_young(mm, addr))) + young = true; + if (page) + put_page(page); + + spin_unlock(ptl); + *page_sz = ((1UL) << HPAGE_PMD_SHIFT); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + + return young; +} + +/* + * Check whether the region was accessed after the last preparation + * + * mm 'mm_struct' for the given virtual address space + * r the region to be checked + */ +static void damon_va_check_access(struct damon_ctx *ctx, + struct mm_struct *mm, struct damon_region *r) +{ + static struct mm_struct *last_mm; + static unsigned long last_addr; + static unsigned long last_page_sz = PAGE_SIZE; + static bool last_accessed; + + /* If the region is in the last checked page, reuse the result */ + if (mm == last_mm && (ALIGN_DOWN(last_addr, last_page_sz) == + ALIGN_DOWN(r->sampling_addr, last_page_sz))) { + if (last_accessed) + r->nr_accesses++; + return; + } + + last_accessed = damon_va_young(mm, r->sampling_addr, &last_page_sz); + if (last_accessed) + r->nr_accesses++; + + last_mm = mm; + last_addr = r->sampling_addr; +} + +unsigned int damon_va_check_accesses(struct damon_ctx *ctx) +{ + struct damon_target *t; + struct mm_struct *mm; + struct damon_region *r; + unsigned int max_nr_accesses = 0; + + damon_for_each_target(t, ctx) { + mm = damon_get_mm(t); + if (!mm) + continue; + damon_for_each_region(r, t) { + damon_va_check_access(ctx, mm, r); + max_nr_accesses = max(r->nr_accesses, max_nr_accesses); + } + mmput(mm); + } + + return max_nr_accesses; +} + +/* + * Functions for the target validity check and cleanup + */ + +bool damon_va_target_valid(void *target) +{ + struct damon_target *t = target; + struct task_struct *task; + + task = damon_get_task_struct(t); + if (task) { + put_task_struct(task); + return true; + } + + return false; +} + +void damon_va_set_primitives(struct damon_ctx *ctx) +{ + ctx->primitive.init = damon_va_init; + ctx->primitive.update = damon_va_update; + ctx->primitive.prepare_access_checks = damon_va_prepare_access_checks; + ctx->primitive.check_accesses = damon_va_check_accesses; + ctx->primitive.reset_aggregated = NULL; + ctx->primitive.target_valid = damon_va_target_valid; + ctx->primitive.cleanup = NULL; +} From 7291c1e5c2469737525bde3f663be214232111ef Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 24 Aug 2021 09:59:47 +1000 Subject: [PATCH 277/365] mm-damon-implement-primitives-for-the-virtual-memory-address-spaces-fix mm/damon/vaddr.c needs highmem.h for kunmap_atomic() Cc: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/damon/vaddr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 91d4a95cedf74..e61211152615a 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include From 9bcc195536deaf4c038ae78186a474578d57a9ec Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 24 Aug 2021 09:59:47 +1000 Subject: [PATCH 278/365] mm/damon/Kconfig: Remove unnecessary PAGE_EXTENSION setup Commit 13d49dbd0123 ("mm/damon: implement primitives for the virtual memory address spaces") of linux-mm[1] makes DAMON_VADDR to set PAGE_IDLE_FLAG. PAGE_IDLE_FLAG sets PAGE_EXTENSION if !64BIT. However, DAMON_VADDR do the same work again. This commit removes the unnecessary duplicate. [1] https://github.com/hnaz/linux-mm/commit/13d49dbd0123 Link: https://lkml.kernel.org/r/20210806095153.6444-2-sj38.park@gmail.com Fixes: 13d49dbd0123 ("mm/damon: implement primitives for the virtual memory address spaces") Signed-off-by: SeongJae Park Reported-by: Randy Dunlap Acked-by: Randy Dunlap Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/damon/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index 8ae080c52950e..5cbb5db541587 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -15,7 +15,6 @@ config DAMON config DAMON_VADDR bool "Data access monitoring primitives for virtual address spaces" depends on DAMON && MMU - select PAGE_EXTENSION if !64BIT select PAGE_IDLE_FLAG help This builds the default data access monitoring primitives for DAMON From e64e46b1688df8e05dd0eeaa07479067073b21e3 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 24 Aug 2021 09:59:47 +1000 Subject: [PATCH 279/365] mm/damon: add a tracepoint This commit adds a tracepoint for DAMON. It traces the monitoring results of each region for each aggregation interval. Using this, DAMON can easily integrated with tracepoints supporting tools such as perf. Link: https://lkml.kernel.org/r/20210716081449.22187-7-sj38.park@gmail.com Signed-off-by: SeongJae Park Reviewed-by: Leonard Foerster Reviewed-by: Steven Rostedt (VMware) Reviewed-by: Fernand Sieber Acked-by: Shakeel Butt Cc: Alexander Shishkin Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Fan Du Cc: Greg Kroah-Hartman Cc: Greg Thelen Cc: Ingo Molnar Cc: Joe Perches Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Marco Elver Cc: Markus Boehme Cc: Maximilian Heyne Cc: Mel Gorman Cc: Minchan Kim Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shuah Khan Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/trace/events/damon.h | 43 ++++++++++++++++++++++++++++++++++++ mm/damon/core.c | 7 +++++- 2 files changed, 49 insertions(+), 1 deletion(-) create mode 100644 include/trace/events/damon.h diff --git a/include/trace/events/damon.h b/include/trace/events/damon.h new file mode 100644 index 0000000000000..2f422f4f1fb9e --- /dev/null +++ b/include/trace/events/damon.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM damon + +#if !defined(_TRACE_DAMON_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_DAMON_H + +#include +#include +#include + +TRACE_EVENT(damon_aggregated, + + TP_PROTO(struct damon_target *t, struct damon_region *r, + unsigned int nr_regions), + + TP_ARGS(t, r, nr_regions), + + TP_STRUCT__entry( + __field(unsigned long, target_id) + __field(unsigned int, nr_regions) + __field(unsigned long, start) + __field(unsigned long, end) + __field(unsigned int, nr_accesses) + ), + + TP_fast_assign( + __entry->target_id = t->id; + __entry->nr_regions = nr_regions; + __entry->start = r->ar.start; + __entry->end = r->ar.end; + __entry->nr_accesses = r->nr_accesses; + ), + + TP_printk("target_id=%lu nr_regions=%u %lu-%lu: %u", + __entry->target_id, __entry->nr_regions, + __entry->start, __entry->end, __entry->nr_accesses) +); + +#endif /* _TRACE_DAMON_H */ + +/* This part must be outside protection */ +#include diff --git a/mm/damon/core.c b/mm/damon/core.c index 28a2c78914faa..ee24d64e8019e 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -13,6 +13,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include + /* Get a random number in [l, r) */ #define damon_rand(l, r) (l + prandom_u32_max(r - l)) @@ -387,8 +390,10 @@ static void kdamond_reset_aggregated(struct damon_ctx *c) damon_for_each_target(t, c) { struct damon_region *r; - damon_for_each_region(r, t) + damon_for_each_region(r, t) { + trace_damon_aggregated(t, r, damon_nr_regions(t)); r->nr_accesses = 0; + } } } From 55cc7f81a6d02d8f515133947ec00f340fd4334e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 24 Aug 2021 09:59:47 +1000 Subject: [PATCH 280/365] mm/damon: implement a debugfs-based user space interface DAMON is designed to be used by kernel space code such as the memory management subsystems, and therefore it provides only kernel space API. That said, letting the user space control DAMON could provide some benefits to them. For example, it will allow user space to analyze their specific workloads and make their own special optimizations. For such cases, this commit implements a simple DAMON application kernel module, namely 'damon-dbgfs', which merely wraps the DAMON api and exports those to the user space via the debugfs. 'damon-dbgfs' exports three files, ``attrs``, ``target_ids``, and ``monitor_on`` under its debugfs directory, ``/damon/``. Attributes ---------- Users can read and write the ``sampling interval``, ``aggregation interval``, ``regions update interval``, and min/max number of monitoring target regions by reading from and writing to the ``attrs`` file. For example, below commands set those values to 5 ms, 100 ms, 1,000 ms, 10, 1000 and check it again:: # cd /damon # echo 5000 100000 1000000 10 1000 > attrs # cat attrs 5000 100000 1000000 10 1000 Target IDs ---------- Some types of address spaces supports multiple monitoring target. For example, the virtual memory address spaces monitoring can have multiple processes as the monitoring targets. Users can set the targets by writing relevant id values of the targets to, and get the ids of the current targets by reading from the ``target_ids`` file. In case of the virtual address spaces monitoring, the values should be pids of the monitoring target processes. For example, below commands set processes having pids 42 and 4242 as the monitoring targets and check it again:: # cd /damon # echo 42 4242 > target_ids # cat target_ids 42 4242 Note that setting the target ids doesn't start the monitoring. Turning On/Off -------------- Setting the files as described above doesn't incur effect unless you explicitly start the monitoring. You can start, stop, and check the current status of the monitoring by writing to and reading from the ``monitor_on`` file. Writing ``on`` to the file starts the monitoring of the targets with the attributes. Writing ``off`` to the file stops those. DAMON also stops if every targets are invalidated (in case of the virtual memory monitoring, target processes are invalidated when terminated). Below example commands turn on, off, and check the status of DAMON:: # cd /damon # echo on > monitor_on # echo off > monitor_on # cat monitor_on off Please note that you cannot write to the above-mentioned debugfs files while the monitoring is turned on. If you write to the files while DAMON is running, an error code such as ``-EBUSY`` will be returned. Link: https://lkml.kernel.org/r/20210716081449.22187-8-sj38.park@gmail.com Signed-off-by: SeongJae Park Reviewed-by: Leonard Foerster Reviewed-by: Fernand Sieber Cc: Alexander Shishkin Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Fan Du Cc: Greg Kroah-Hartman Cc: Greg Thelen Cc: Ingo Molnar Cc: Joe Perches Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Marco Elver Cc: Markus Boehme Cc: Maximilian Heyne Cc: Mel Gorman Cc: Minchan Kim Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shakeel Butt Cc: Shuah Khan Cc: Steven Rostedt (VMware) Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/damon.h | 3 + mm/damon/Kconfig | 9 + mm/damon/Makefile | 1 + mm/damon/core.c | 47 +++++ mm/damon/dbgfs.c | 398 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 458 insertions(+) create mode 100644 mm/damon/dbgfs.c diff --git a/include/linux/damon.h b/include/linux/damon.h index edb350e52b934..d68b67b8d458d 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -240,9 +240,12 @@ unsigned int damon_nr_regions(struct damon_target *t); struct damon_ctx *damon_new_ctx(void); void damon_destroy_ctx(struct damon_ctx *ctx); +int damon_set_targets(struct damon_ctx *ctx, + unsigned long *ids, ssize_t nr_ids); int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, unsigned long aggr_int, unsigned long primitive_upd_int, unsigned long min_nr_reg, unsigned long max_nr_reg); +int damon_nr_running_ctxs(void); int damon_start(struct damon_ctx **ctxs, int nr_ctxs); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index 5cbb5db541587..c8e3dba6fb4cf 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -20,4 +20,13 @@ config DAMON_VADDR This builds the default data access monitoring primitives for DAMON that works for virtual address spaces. +config DAMON_DBGFS + bool "DAMON debugfs interface" + depends on DAMON_VADDR && DEBUG_FS + help + This builds the debugfs interface for DAMON. The user space admins + can use the interface for arbitrary data access monitoring. + + If unsure, say N. + endmenu diff --git a/mm/damon/Makefile b/mm/damon/Makefile index 6ebbd08aed673..fed4be3bace3e 100644 --- a/mm/damon/Makefile +++ b/mm/damon/Makefile @@ -2,3 +2,4 @@ obj-$(CONFIG_DAMON) := core.o obj-$(CONFIG_DAMON_VADDR) += vaddr.o +obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o diff --git a/mm/damon/core.c b/mm/damon/core.c index ee24d64e8019e..59033488402e8 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -171,6 +171,39 @@ void damon_destroy_ctx(struct damon_ctx *ctx) kfree(ctx); } +/** + * damon_set_targets() - Set monitoring targets. + * @ctx: monitoring context + * @ids: array of target ids + * @nr_ids: number of entries in @ids + * + * This function should not be called while the kdamond is running. + * + * Return: 0 on success, negative error code otherwise. + */ +int damon_set_targets(struct damon_ctx *ctx, + unsigned long *ids, ssize_t nr_ids) +{ + ssize_t i; + struct damon_target *t, *next; + + damon_destroy_targets(ctx); + + for (i = 0; i < nr_ids; i++) { + t = damon_new_target(ids[i]); + if (!t) { + pr_err("Failed to alloc damon_target\n"); + /* The caller should do cleanup of the ids itself */ + damon_for_each_target_safe(t, next, ctx) + damon_destroy_target(t); + return -ENOMEM; + } + damon_add_target(ctx, t); + } + + return 0; +} + /** * damon_set_attrs() - Set attributes for the monitoring. * @ctx: monitoring context @@ -209,6 +242,20 @@ int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, return 0; } +/** + * damon_nr_running_ctxs() - Return number of currently running contexts. + */ +int damon_nr_running_ctxs(void) +{ + int nr_ctxs; + + mutex_lock(&damon_lock); + nr_ctxs = nr_running_ctxs; + mutex_unlock(&damon_lock); + + return nr_ctxs; +} + /* Returns the size upper limit for each monitoring region */ static unsigned long damon_region_sz_limit(struct damon_ctx *ctx) { diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c new file mode 100644 index 0000000000000..322f624710d7c --- /dev/null +++ b/mm/damon/dbgfs.c @@ -0,0 +1,398 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DAMON Debugfs Interface + * + * Author: SeongJae Park + */ + +#define pr_fmt(fmt) "damon-dbgfs: " fmt + +#include +#include +#include +#include +#include +#include +#include + +static struct damon_ctx **dbgfs_ctxs; +static int dbgfs_nr_ctxs; +static struct dentry **dbgfs_dirs; + +/* + * Returns non-empty string on success, negative error code otherwise. + */ +static char *user_input_str(const char __user *buf, size_t count, loff_t *ppos) +{ + char *kbuf; + ssize_t ret; + + /* We do not accept continuous write */ + if (*ppos) + return ERR_PTR(-EINVAL); + + kbuf = kmalloc(count + 1, GFP_KERNEL); + if (!kbuf) + return ERR_PTR(-ENOMEM); + + ret = simple_write_to_buffer(kbuf, count + 1, ppos, buf, count); + if (ret != count) { + kfree(kbuf); + return ERR_PTR(-EIO); + } + kbuf[ret] = '\0'; + + return kbuf; +} + +static ssize_t dbgfs_attrs_read(struct file *file, + char __user *buf, size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + char kbuf[128]; + int ret; + + mutex_lock(&ctx->kdamond_lock); + ret = scnprintf(kbuf, ARRAY_SIZE(kbuf), "%lu %lu %lu %lu %lu\n", + ctx->sample_interval, ctx->aggr_interval, + ctx->primitive_update_interval, ctx->min_nr_regions, + ctx->max_nr_regions); + mutex_unlock(&ctx->kdamond_lock); + + return simple_read_from_buffer(buf, count, ppos, kbuf, ret); +} + +static ssize_t dbgfs_attrs_write(struct file *file, + const char __user *buf, size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + unsigned long s, a, r, minr, maxr; + char *kbuf; + ssize_t ret = count; + int err; + + kbuf = user_input_str(buf, count, ppos); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + if (sscanf(kbuf, "%lu %lu %lu %lu %lu", + &s, &a, &r, &minr, &maxr) != 5) { + ret = -EINVAL; + goto out; + } + + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) { + ret = -EBUSY; + goto unlock_out; + } + + err = damon_set_attrs(ctx, s, a, r, minr, maxr); + if (err) + ret = err; +unlock_out: + mutex_unlock(&ctx->kdamond_lock); +out: + kfree(kbuf); + return ret; +} + +#define targetid_is_pid(ctx) \ + (ctx->primitive.target_valid == damon_va_target_valid) + +static ssize_t sprint_target_ids(struct damon_ctx *ctx, char *buf, ssize_t len) +{ + struct damon_target *t; + unsigned long id; + int written = 0; + int rc; + + damon_for_each_target(t, ctx) { + id = t->id; + if (targetid_is_pid(ctx)) + /* Show pid numbers to debugfs users */ + id = (unsigned long)pid_vnr((struct pid *)id); + + rc = scnprintf(&buf[written], len - written, "%lu ", id); + if (!rc) + return -ENOMEM; + written += rc; + } + if (written) + written -= 1; + written += scnprintf(&buf[written], len - written, "\n"); + return written; +} + +static ssize_t dbgfs_target_ids_read(struct file *file, + char __user *buf, size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + ssize_t len; + char ids_buf[320]; + + mutex_lock(&ctx->kdamond_lock); + len = sprint_target_ids(ctx, ids_buf, 320); + mutex_unlock(&ctx->kdamond_lock); + if (len < 0) + return len; + + return simple_read_from_buffer(buf, count, ppos, ids_buf, len); +} + +/* + * Converts a string into an array of unsigned long integers + * + * Returns an array of unsigned long integers if the conversion success, or + * NULL otherwise. + */ +static unsigned long *str_to_target_ids(const char *str, ssize_t len, + ssize_t *nr_ids) +{ + unsigned long *ids; + const int max_nr_ids = 32; + unsigned long id; + int pos = 0, parsed, ret; + + *nr_ids = 0; + ids = kmalloc_array(max_nr_ids, sizeof(id), GFP_KERNEL); + if (!ids) + return NULL; + while (*nr_ids < max_nr_ids && pos < len) { + ret = sscanf(&str[pos], "%lu%n", &id, &parsed); + pos += parsed; + if (ret != 1) + break; + ids[*nr_ids] = id; + *nr_ids += 1; + } + + return ids; +} + +static void dbgfs_put_pids(unsigned long *ids, int nr_ids) +{ + int i; + + for (i = 0; i < nr_ids; i++) + put_pid((struct pid *)ids[i]); +} + +static ssize_t dbgfs_target_ids_write(struct file *file, + const char __user *buf, size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + char *kbuf, *nrs; + unsigned long *targets; + ssize_t nr_targets; + ssize_t ret = count; + int i; + int err; + + kbuf = user_input_str(buf, count, ppos); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + nrs = kbuf; + + targets = str_to_target_ids(nrs, ret, &nr_targets); + if (!targets) { + ret = -ENOMEM; + goto out; + } + + if (targetid_is_pid(ctx)) { + for (i = 0; i < nr_targets; i++) { + targets[i] = (unsigned long)find_get_pid( + (int)targets[i]); + if (!targets[i]) { + dbgfs_put_pids(targets, i); + ret = -EINVAL; + goto free_targets_out; + } + } + } + + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) { + if (targetid_is_pid(ctx)) + dbgfs_put_pids(targets, nr_targets); + ret = -EBUSY; + goto unlock_out; + } + + err = damon_set_targets(ctx, targets, nr_targets); + if (err) { + if (targetid_is_pid(ctx)) + dbgfs_put_pids(targets, nr_targets); + ret = err; + } + +unlock_out: + mutex_unlock(&ctx->kdamond_lock); +free_targets_out: + kfree(targets); +out: + kfree(kbuf); + return ret; +} + +static int damon_dbgfs_open(struct inode *inode, struct file *file) +{ + file->private_data = inode->i_private; + + return nonseekable_open(inode, file); +} + +static const struct file_operations attrs_fops = { + .open = damon_dbgfs_open, + .read = dbgfs_attrs_read, + .write = dbgfs_attrs_write, +}; + +static const struct file_operations target_ids_fops = { + .open = damon_dbgfs_open, + .read = dbgfs_target_ids_read, + .write = dbgfs_target_ids_write, +}; + +static void dbgfs_fill_ctx_dir(struct dentry *dir, struct damon_ctx *ctx) +{ + const char * const file_names[] = {"attrs", "target_ids"}; + const struct file_operations *fops[] = {&attrs_fops, &target_ids_fops}; + int i; + + for (i = 0; i < ARRAY_SIZE(file_names); i++) + debugfs_create_file(file_names[i], 0600, dir, ctx, fops[i]); +} + +static int dbgfs_before_terminate(struct damon_ctx *ctx) +{ + struct damon_target *t, *next; + + if (!targetid_is_pid(ctx)) + return 0; + + damon_for_each_target_safe(t, next, ctx) { + put_pid((struct pid *)t->id); + damon_destroy_target(t); + } + return 0; +} + +static struct damon_ctx *dbgfs_new_ctx(void) +{ + struct damon_ctx *ctx; + + ctx = damon_new_ctx(); + if (!ctx) + return NULL; + + damon_va_set_primitives(ctx); + ctx->callback.before_terminate = dbgfs_before_terminate; + return ctx; +} + +static ssize_t dbgfs_monitor_on_read(struct file *file, + char __user *buf, size_t count, loff_t *ppos) +{ + char monitor_on_buf[5]; + bool monitor_on = damon_nr_running_ctxs() != 0; + int len; + + len = scnprintf(monitor_on_buf, 5, monitor_on ? "on\n" : "off\n"); + + return simple_read_from_buffer(buf, count, ppos, monitor_on_buf, len); +} + +static ssize_t dbgfs_monitor_on_write(struct file *file, + const char __user *buf, size_t count, loff_t *ppos) +{ + ssize_t ret = count; + char *kbuf; + int err; + + kbuf = user_input_str(buf, count, ppos); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + /* Remove white space */ + if (sscanf(kbuf, "%s", kbuf) != 1) { + kfree(kbuf); + return -EINVAL; + } + + if (!strncmp(kbuf, "on", count)) + err = damon_start(dbgfs_ctxs, dbgfs_nr_ctxs); + else if (!strncmp(kbuf, "off", count)) + err = damon_stop(dbgfs_ctxs, dbgfs_nr_ctxs); + else + err = -EINVAL; + + if (err) + ret = err; + kfree(kbuf); + return ret; +} + +static const struct file_operations monitor_on_fops = { + .read = dbgfs_monitor_on_read, + .write = dbgfs_monitor_on_write, +}; + +static int __init __damon_dbgfs_init(void) +{ + struct dentry *dbgfs_root; + const char * const file_names[] = {"monitor_on"}; + const struct file_operations *fops[] = {&monitor_on_fops}; + int i; + + dbgfs_root = debugfs_create_dir("damon", NULL); + + for (i = 0; i < ARRAY_SIZE(file_names); i++) + debugfs_create_file(file_names[i], 0600, dbgfs_root, NULL, + fops[i]); + dbgfs_fill_ctx_dir(dbgfs_root, dbgfs_ctxs[0]); + + dbgfs_dirs = kmalloc_array(1, sizeof(dbgfs_root), GFP_KERNEL); + if (!dbgfs_dirs) { + debugfs_remove(dbgfs_root); + return -ENOMEM; + } + dbgfs_dirs[0] = dbgfs_root; + + return 0; +} + +/* + * Functions for the initialization + */ + +static int __init damon_dbgfs_init(void) +{ + int rc; + + dbgfs_ctxs = kmalloc(sizeof(*dbgfs_ctxs), GFP_KERNEL); + if (!dbgfs_ctxs) { + pr_err("%s: dbgfs ctxs alloc failed\n", __func__); + return -ENOMEM; + } + dbgfs_ctxs[0] = dbgfs_new_ctx(); + if (!dbgfs_ctxs[0]) { + kfree(dbgfs_ctxs); + pr_err("%s: dbgfs ctx alloc failed\n", __func__); + return -ENOMEM; + } + dbgfs_nr_ctxs = 1; + + rc = __damon_dbgfs_init(); + if (rc) { + kfree(dbgfs_ctxs[0]); + kfree(dbgfs_ctxs); + pr_err("%s: dbgfs init failed\n", __func__); + } + + return rc; +} + +module_init(damon_dbgfs_init); From 729cd45732571206c92ab31b1eb6fbf3fc1eac51 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 24 Aug 2021 09:59:48 +1000 Subject: [PATCH 281/365] mm-damon-implement-a-debugfs-based-user-space-interface-fix remove unneeded "alloc failed" printks Cc: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/damon/dbgfs.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 322f624710d7c..5e2c9b5e7eace 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -373,14 +373,11 @@ static int __init damon_dbgfs_init(void) int rc; dbgfs_ctxs = kmalloc(sizeof(*dbgfs_ctxs), GFP_KERNEL); - if (!dbgfs_ctxs) { - pr_err("%s: dbgfs ctxs alloc failed\n", __func__); + if (!dbgfs_ctxs) return -ENOMEM; - } dbgfs_ctxs[0] = dbgfs_new_ctx(); if (!dbgfs_ctxs[0]) { kfree(dbgfs_ctxs); - pr_err("%s: dbgfs ctx alloc failed\n", __func__); return -ENOMEM; } dbgfs_nr_ctxs = 1; From b3044aa54c441809f72ef2d6b3963b51f7689aee Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 24 Aug 2021 09:59:48 +1000 Subject: [PATCH 282/365] mm-damon-implement-a-debugfs-based-user-space-interface-fix-fix replace macro with static inline Cc: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/damon/dbgfs.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 5e2c9b5e7eace..d2e0a547eb3f4 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -97,8 +97,10 @@ static ssize_t dbgfs_attrs_write(struct file *file, return ret; } -#define targetid_is_pid(ctx) \ - (ctx->primitive.target_valid == damon_va_target_valid) +static inline bool targetid_is_pid(const struct damon_ctx *ctx) +{ + return ctx->primitive.target_valid == damon_va_target_valid; +} static ssize_t sprint_target_ids(struct damon_ctx *ctx, char *buf, ssize_t len) { From c81dfd17373023fcd885dfc2095ade1646aa21b8 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 24 Aug 2021 09:59:48 +1000 Subject: [PATCH 283/365] mm/damon/dbgfs: export kdamond pid to the user space For CPU usage accounting, knowing pid of the monitoring thread could be helpful. For example, users could use cpuaccount cgroups with the pid. This commit therefore exports the pid of currently running monitoring thread to the user space via 'kdamond_pid' file in the debugfs directory. Link: https://lkml.kernel.org/r/20210716081449.22187-9-sj38.park@gmail.com Signed-off-by: SeongJae Park Reviewed-by: Fernand Sieber Cc: Alexander Shishkin Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Fan Du Cc: Greg Kroah-Hartman Cc: Greg Thelen Cc: Ingo Molnar Cc: Joe Perches Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Maximilian Heyne Cc: Mel Gorman Cc: Minchan Kim Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shakeel Butt Cc: Shuah Khan Cc: Steven Rostedt (VMware) Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/damon/dbgfs.c | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index d2e0a547eb3f4..e850be4077f55 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -239,6 +239,32 @@ static ssize_t dbgfs_target_ids_write(struct file *file, return ret; } +static ssize_t dbgfs_kdamond_pid_read(struct file *file, + char __user *buf, size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + char *kbuf; + ssize_t len; + + kbuf = kmalloc(count, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) + len = scnprintf(kbuf, count, "%d\n", ctx->kdamond->pid); + else + len = scnprintf(kbuf, count, "none\n"); + mutex_unlock(&ctx->kdamond_lock); + if (!len) + goto out; + len = simple_read_from_buffer(buf, count, ppos, kbuf, len); + +out: + kfree(kbuf); + return len; +} + static int damon_dbgfs_open(struct inode *inode, struct file *file) { file->private_data = inode->i_private; @@ -258,10 +284,17 @@ static const struct file_operations target_ids_fops = { .write = dbgfs_target_ids_write, }; +static const struct file_operations kdamond_pid_fops = { + .open = damon_dbgfs_open, + .read = dbgfs_kdamond_pid_read, +}; + static void dbgfs_fill_ctx_dir(struct dentry *dir, struct damon_ctx *ctx) { - const char * const file_names[] = {"attrs", "target_ids"}; - const struct file_operations *fops[] = {&attrs_fops, &target_ids_fops}; + const char * const file_names[] = {"attrs", "target_ids", + "kdamond_pid"}; + const struct file_operations *fops[] = {&attrs_fops, &target_ids_fops, + &kdamond_pid_fops}; int i; for (i = 0; i < ARRAY_SIZE(file_names); i++) From f904bccdf143d5aee2803066c9acafa538c9761e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 24 Aug 2021 09:59:48 +1000 Subject: [PATCH 284/365] mm/damon/dbgfs: support multiple contexts In some use cases, users would want to run multiple monitoring context. For example, if a user wants a high precision monitoring and dedicating multiple CPUs for the job is ok, because DAMON creates one monitoring thread per one context, the user can split the monitoring target regions into multiple small regions and create one context for each region. Or, someone might want to simultaneously monitor different address spaces, e.g., both virtual address space and physical address space. The DAMON's API allows such usage, but 'damon-dbgfs' does not. Therefore, only kernel space DAMON users can do multiple contexts monitoring. This commit allows the user space DAMON users to use multiple contexts monitoring by introducing two new 'damon-dbgfs' debugfs files, 'mk_context' and 'rm_context'. Users can create a new monitoring context by writing the desired name of the new context to 'mk_context'. Then, a new directory with the name and having the files for setting of the context ('attrs', 'target_ids' and 'record') will be created under the debugfs directory. Writing the name of the context to remove to 'rm_context' will remove the related context and directory. Link: https://lkml.kernel.org/r/20210716081449.22187-10-sj38.park@gmail.com Signed-off-by: SeongJae Park Reviewed-by: Fernand Sieber Cc: Alexander Shishkin Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Fan Du Cc: Greg Kroah-Hartman Cc: Greg Thelen Cc: Ingo Molnar Cc: Joe Perches Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Maximilian Heyne Cc: Mel Gorman Cc: Minchan Kim Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shakeel Butt Cc: Shuah Khan Cc: Steven Rostedt (VMware) Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/damon/dbgfs.c | 195 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 193 insertions(+), 2 deletions(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index e850be4077f55..31ad550ecba2d 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -18,6 +18,7 @@ static struct damon_ctx **dbgfs_ctxs; static int dbgfs_nr_ctxs; static struct dentry **dbgfs_dirs; +static DEFINE_MUTEX(damon_dbgfs_lock); /* * Returns non-empty string on success, negative error code otherwise. @@ -328,6 +329,186 @@ static struct damon_ctx *dbgfs_new_ctx(void) return ctx; } +static void dbgfs_destroy_ctx(struct damon_ctx *ctx) +{ + damon_destroy_ctx(ctx); +} + +/* + * Make a context of @name and create a debugfs directory for it. + * + * This function should be called while holding damon_dbgfs_lock. + * + * Returns 0 on success, negative error code otherwise. + */ +static int dbgfs_mk_context(char *name) +{ + struct dentry *root, **new_dirs, *new_dir; + struct damon_ctx **new_ctxs, *new_ctx; + + if (damon_nr_running_ctxs()) + return -EBUSY; + + new_ctxs = krealloc(dbgfs_ctxs, sizeof(*dbgfs_ctxs) * + (dbgfs_nr_ctxs + 1), GFP_KERNEL); + if (!new_ctxs) + return -ENOMEM; + dbgfs_ctxs = new_ctxs; + + new_dirs = krealloc(dbgfs_dirs, sizeof(*dbgfs_dirs) * + (dbgfs_nr_ctxs + 1), GFP_KERNEL); + if (!new_dirs) + return -ENOMEM; + dbgfs_dirs = new_dirs; + + root = dbgfs_dirs[0]; + if (!root) + return -ENOENT; + + new_dir = debugfs_create_dir(name, root); + dbgfs_dirs[dbgfs_nr_ctxs] = new_dir; + + new_ctx = dbgfs_new_ctx(); + if (!new_ctx) { + debugfs_remove(new_dir); + dbgfs_dirs[dbgfs_nr_ctxs] = NULL; + return -ENOMEM; + } + + dbgfs_ctxs[dbgfs_nr_ctxs] = new_ctx; + dbgfs_fill_ctx_dir(dbgfs_dirs[dbgfs_nr_ctxs], + dbgfs_ctxs[dbgfs_nr_ctxs]); + dbgfs_nr_ctxs++; + + return 0; +} + +static ssize_t dbgfs_mk_context_write(struct file *file, + const char __user *buf, size_t count, loff_t *ppos) +{ + char *kbuf; + char *ctx_name; + ssize_t ret = count; + int err; + + kbuf = user_input_str(buf, count, ppos); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + ctx_name = kmalloc(count + 1, GFP_KERNEL); + if (!ctx_name) { + kfree(kbuf); + return -ENOMEM; + } + + /* Trim white space */ + if (sscanf(kbuf, "%s", ctx_name) != 1) { + ret = -EINVAL; + goto out; + } + + mutex_lock(&damon_dbgfs_lock); + err = dbgfs_mk_context(ctx_name); + if (err) + ret = err; + mutex_unlock(&damon_dbgfs_lock); + +out: + kfree(kbuf); + kfree(ctx_name); + return ret; +} + +/* + * Remove a context of @name and its debugfs directory. + * + * This function should be called while holding damon_dbgfs_lock. + * + * Return 0 on success, negative error code otherwise. + */ +static int dbgfs_rm_context(char *name) +{ + struct dentry *root, *dir, **new_dirs; + struct damon_ctx **new_ctxs; + int i, j; + + if (damon_nr_running_ctxs()) + return -EBUSY; + + root = dbgfs_dirs[0]; + if (!root) + return -ENOENT; + + dir = debugfs_lookup(name, root); + if (!dir) + return -ENOENT; + + new_dirs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_dirs), + GFP_KERNEL); + if (!new_dirs) + return -ENOMEM; + + new_ctxs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_ctxs), + GFP_KERNEL); + if (!new_ctxs) { + kfree(new_dirs); + return -ENOMEM; + } + + for (i = 0, j = 0; i < dbgfs_nr_ctxs; i++) { + if (dbgfs_dirs[i] == dir) { + debugfs_remove(dbgfs_dirs[i]); + dbgfs_destroy_ctx(dbgfs_ctxs[i]); + continue; + } + new_dirs[j] = dbgfs_dirs[i]; + new_ctxs[j++] = dbgfs_ctxs[i]; + } + + kfree(dbgfs_dirs); + kfree(dbgfs_ctxs); + + dbgfs_dirs = new_dirs; + dbgfs_ctxs = new_ctxs; + dbgfs_nr_ctxs--; + + return 0; +} + +static ssize_t dbgfs_rm_context_write(struct file *file, + const char __user *buf, size_t count, loff_t *ppos) +{ + char *kbuf; + ssize_t ret = count; + int err; + char *ctx_name; + + kbuf = user_input_str(buf, count, ppos); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + ctx_name = kmalloc(count + 1, GFP_KERNEL); + if (!ctx_name) { + kfree(kbuf); + return -ENOMEM; + } + + /* Trim white space */ + if (sscanf(kbuf, "%s", ctx_name) != 1) { + ret = -EINVAL; + goto out; + } + + mutex_lock(&damon_dbgfs_lock); + err = dbgfs_rm_context(ctx_name); + if (err) + ret = err; + mutex_unlock(&damon_dbgfs_lock); + +out: + kfree(kbuf); + kfree(ctx_name); + return ret; +} + static ssize_t dbgfs_monitor_on_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -370,6 +551,14 @@ static ssize_t dbgfs_monitor_on_write(struct file *file, return ret; } +static const struct file_operations mk_contexts_fops = { + .write = dbgfs_mk_context_write, +}; + +static const struct file_operations rm_contexts_fops = { + .write = dbgfs_rm_context_write, +}; + static const struct file_operations monitor_on_fops = { .read = dbgfs_monitor_on_read, .write = dbgfs_monitor_on_write, @@ -378,8 +567,10 @@ static const struct file_operations monitor_on_fops = { static int __init __damon_dbgfs_init(void) { struct dentry *dbgfs_root; - const char * const file_names[] = {"monitor_on"}; - const struct file_operations *fops[] = {&monitor_on_fops}; + const char * const file_names[] = {"mk_contexts", "rm_contexts", + "monitor_on"}; + const struct file_operations *fops[] = {&mk_contexts_fops, + &rm_contexts_fops, &monitor_on_fops}; int i; dbgfs_root = debugfs_create_dir("damon", NULL); From 9ddc3621ff50faf5c4a7b83fd896ede8010c53a9 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 24 Aug 2021 09:59:48 +1000 Subject: [PATCH 285/365] Documentation: add documents for DAMON This commit adds documents for DAMON under `Documentation/admin-guide/mm/damon/` and `Documentation/vm/damon/`. Link: https://lkml.kernel.org/r/20210716081449.22187-11-sj38.park@gmail.com Signed-off-by: SeongJae Park Reviewed-by: Fernand Sieber Reviewed-by: Markus Boehme Cc: Alexander Shishkin Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Fan Du Cc: Greg Kroah-Hartman Cc: Greg Thelen Cc: Ingo Molnar Cc: Joe Perches Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Maximilian Heyne Cc: Mel Gorman Cc: Minchan Kim Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shakeel Butt Cc: Shuah Khan Cc: Steven Rostedt (VMware) Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- Documentation/admin-guide/mm/damon/index.rst | 15 ++ Documentation/admin-guide/mm/damon/start.rst | 114 +++++++++++++ Documentation/admin-guide/mm/damon/usage.rst | 112 +++++++++++++ Documentation/admin-guide/mm/index.rst | 1 + Documentation/vm/damon/api.rst | 20 +++ Documentation/vm/damon/design.rst | 166 +++++++++++++++++++ Documentation/vm/damon/faq.rst | 51 ++++++ Documentation/vm/damon/index.rst | 30 ++++ Documentation/vm/index.rst | 1 + 9 files changed, 510 insertions(+) create mode 100644 Documentation/admin-guide/mm/damon/index.rst create mode 100644 Documentation/admin-guide/mm/damon/start.rst create mode 100644 Documentation/admin-guide/mm/damon/usage.rst create mode 100644 Documentation/vm/damon/api.rst create mode 100644 Documentation/vm/damon/design.rst create mode 100644 Documentation/vm/damon/faq.rst create mode 100644 Documentation/vm/damon/index.rst diff --git a/Documentation/admin-guide/mm/damon/index.rst b/Documentation/admin-guide/mm/damon/index.rst new file mode 100644 index 0000000000000..8c5dde3a57544 --- /dev/null +++ b/Documentation/admin-guide/mm/damon/index.rst @@ -0,0 +1,15 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======================== +Monitoring Data Accesses +======================== + +:doc:`DAMON ` allows light-weight data access monitoring. +Using DAMON, users can analyze the memory access patterns of their systems and +optimize those. + +.. toctree:: + :maxdepth: 2 + + start + usage diff --git a/Documentation/admin-guide/mm/damon/start.rst b/Documentation/admin-guide/mm/damon/start.rst new file mode 100644 index 0000000000000..d5eb89a8fc386 --- /dev/null +++ b/Documentation/admin-guide/mm/damon/start.rst @@ -0,0 +1,114 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============== +Getting Started +=============== + +This document briefly describes how you can use DAMON by demonstrating its +default user space tool. Please note that this document describes only a part +of its features for brevity. Please refer to :doc:`usage` for more details. + + +TL; DR +====== + +Follow the commands below to monitor and visualize the memory access pattern of +your workload. :: + + # # build the kernel with CONFIG_DAMON_*=y, install it, and reboot + # mount -t debugfs none /sys/kernel/debug/ + # git clone https://github.com/awslabs/damo + # ./damo/damo record $(pidof ) + # ./damo/damo report heat --plot_ascii + +The final command draws the access heatmap of ````. The heatmap +shows which memory region (x-axis) is accessed when (y-axis) and how frequently +(number; the higher the more accesses have been observed). :: + + 111111111111111111111111111111111111111111111111111111110000 + 111121111111111111111111111111211111111111111111111111110000 + 000000000000000000000000000000000000000000000000001555552000 + 000000000000000000000000000000000000000000000222223555552000 + 000000000000000000000000000000000000000011111677775000000000 + 000000000000000000000000000000000000000488888000000000000000 + 000000000000000000000000000000000177888400000000000000000000 + 000000000000000000000000000046666522222100000000000000000000 + 000000000000000000000014444344444300000000000000000000000000 + 000000000000000002222245555510000000000000000000000000000000 + # access_frequency: 0 1 2 3 4 5 6 7 8 9 + # x-axis: space (140286319947776-140286426374096: 101.496 MiB) + # y-axis: time (605442256436361-605479951866441: 37.695430s) + # resolution: 60x10 (1.692 MiB and 3.770s for each character) + + +Prerequisites +============= + +Kernel +------ + +You should first ensure your system is running on a kernel built with +``CONFIG_DAMON_*=y``. + + +User Space Tool +--------------- + +For the demonstration, we will use the default user space tool for DAMON, +called DAMON Operator (DAMO). It is available at +https://github.com/awslabs/damo. The examples below assume that ``damo`` is on +your ``$PATH``. It's not mandatory, though. + +Because DAMO is using the debugfs interface (refer to :doc:`usage` for the +detail) of DAMON, you should ensure debugfs is mounted. Mount it manually as +below:: + + # mount -t debugfs none /sys/kernel/debug/ + +or append the following line to your ``/etc/fstab`` file so that your system +can automatically mount debugfs upon booting:: + + debugfs /sys/kernel/debug debugfs defaults 0 0 + + +Recording Data Access Patterns +============================== + +The commands below record the memory access patterns of a program and save the +monitoring results to a file. :: + + $ git clone https://github.com/sjp38/masim + $ cd masim; make; ./masim ./configs/zigzag.cfg & + $ sudo damo record -o damon.data $(pidof masim) + +The first two lines of the commands download an artificial memory access +generator program and run it in the background. The generator will repeatedly +access two 100 MiB sized memory regions one by one. You can substitute this +with your real workload. The last line asks ``damo`` to record the access +pattern in the ``damon.data`` file. + + +Visualizing Recorded Patterns +============================= + +The following three commands visualize the recorded access patterns and save +the results as separate image files. :: + + $ damo report heats --heatmap access_pattern_heatmap.png + $ damo report wss --range 0 101 1 --plot wss_dist.png + $ damo report wss --range 0 101 1 --sortby time --plot wss_chron_change.png + +- ``access_pattern_heatmap.png`` will visualize the data access pattern in a + heatmap, showing which memory region (y-axis) got accessed when (x-axis) + and how frequently (color). +- ``wss_dist.png`` will show the distribution of the working set size. +- ``wss_chron_change.png`` will show how the working set size has + chronologically changed. + +You can view the visualizations of this example workload at [1]_. +Visualizations of other realistic workloads are available at [2]_ [3]_ [4]_. + +.. [1] https://damonitor.github.io/doc/html/v17/admin-guide/mm/damon/start.html#visualizing-recorded-patterns +.. [2] https://damonitor.github.io/test/result/visual/latest/rec.heatmap.1.png.html +.. [3] https://damonitor.github.io/test/result/visual/latest/rec.wss_sz.png.html +.. [4] https://damonitor.github.io/test/result/visual/latest/rec.wss_time.png.html diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst new file mode 100644 index 0000000000000..a72cda374abac --- /dev/null +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -0,0 +1,112 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============== +Detailed Usages +=============== + +DAMON provides below three interfaces for different users. + +- *DAMON user space tool.* + This is for privileged people such as system administrators who want a + just-working human-friendly interface. Using this, users can use the DAMON’s + major features in a human-friendly way. It may not be highly tuned for + special cases, though. It supports only virtual address spaces monitoring. +- *debugfs interface.* + This is for privileged user space programmers who want more optimized use of + DAMON. Using this, users can use DAMON’s major features by reading + from and writing to special debugfs files. Therefore, you can write and use + your personalized DAMON debugfs wrapper programs that reads/writes the + debugfs files instead of you. The DAMON user space tool is also a reference + implementation of such programs. It supports only virtual address spaces + monitoring. +- *Kernel Space Programming Interface.* + This is for kernel space programmers. Using this, users can utilize every + feature of DAMON most flexibly and efficiently by writing kernel space + DAMON application programs for you. You can even extend DAMON for various + address spaces. + +Nevertheless, you could write your own user space tool using the debugfs +interface. A reference implementation is available at +https://github.com/awslabs/damo. If you are a kernel programmer, you could +refer to :doc:`/vm/damon/api` for the kernel space programming interface. For +the reason, this document describes only the debugfs interface + +debugfs Interface +================= + +DAMON exports three files, ``attrs``, ``target_ids``, and ``monitor_on`` under +its debugfs directory, ``/damon/``. + + +Attributes +---------- + +Users can get and set the ``sampling interval``, ``aggregation interval``, +``regions update interval``, and min/max number of monitoring target regions by +reading from and writing to the ``attrs`` file. To know about the monitoring +attributes in detail, please refer to the :doc:`/vm/damon/design`. For +example, below commands set those values to 5 ms, 100 ms, 1,000 ms, 10 and +1000, and then check it again:: + + # cd /damon + # echo 5000 100000 1000000 10 1000 > attrs + # cat attrs + 5000 100000 1000000 10 1000 + + +Target IDs +---------- + +Some types of address spaces supports multiple monitoring target. For example, +the virtual memory address spaces monitoring can have multiple processes as the +monitoring targets. Users can set the targets by writing relevant id values of +the targets to, and get the ids of the current targets by reading from the +``target_ids`` file. In case of the virtual address spaces monitoring, the +values should be pids of the monitoring target processes. For example, below +commands set processes having pids 42 and 4242 as the monitoring targets and +check it again:: + + # cd /damon + # echo 42 4242 > target_ids + # cat target_ids + 42 4242 + +Note that setting the target ids doesn't start the monitoring. + + +Turning On/Off +-------------- + +Setting the files as described above doesn't incur effect unless you explicitly +start the monitoring. You can start, stop, and check the current status of the +monitoring by writing to and reading from the ``monitor_on`` file. Writing +``on`` to the file starts the monitoring of the targets with the attributes. +Writing ``off`` to the file stops those. DAMON also stops if every target +process is terminated. Below example commands turn on, off, and check the +status of DAMON:: + + # cd /damon + # echo on > monitor_on + # echo off > monitor_on + # cat monitor_on + off + +Please note that you cannot write to the above-mentioned debugfs files while +the monitoring is turned on. If you write to the files while DAMON is running, +an error code such as ``-EBUSY`` will be returned. + + +Tracepoint for Monitoring Results +================================= + +DAMON provides the monitoring results via a tracepoint, +``damon:damon_aggregated``. While the monitoring is turned on, you could +record the tracepoint events and show results using tracepoint supporting tools +like ``perf``. For example:: + + # echo on > monitor_on + # perf record -e damon:damon_aggregated & + # sleep 5 + # kill 9 $(pidof perf) + # echo off > monitor_on + # perf script diff --git a/Documentation/admin-guide/mm/index.rst b/Documentation/admin-guide/mm/index.rst index 4b14d8b50e9e8..cbd19d5e625f3 100644 --- a/Documentation/admin-guide/mm/index.rst +++ b/Documentation/admin-guide/mm/index.rst @@ -27,6 +27,7 @@ the Linux memory management. concepts cma_debugfs + damon/index hugetlbpage idle_page_tracking ksm diff --git a/Documentation/vm/damon/api.rst b/Documentation/vm/damon/api.rst new file mode 100644 index 0000000000000..08f34df45523a --- /dev/null +++ b/Documentation/vm/damon/api.rst @@ -0,0 +1,20 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============= +API Reference +============= + +Kernel space programs can use every feature of DAMON using below APIs. All you +need to do is including ``damon.h``, which is located in ``include/linux/`` of +the source tree. + +Structures +========== + +.. kernel-doc:: include/linux/damon.h + + +Functions +========= + +.. kernel-doc:: mm/damon/core.c diff --git a/Documentation/vm/damon/design.rst b/Documentation/vm/damon/design.rst new file mode 100644 index 0000000000000..b05159c295f4d --- /dev/null +++ b/Documentation/vm/damon/design.rst @@ -0,0 +1,166 @@ +.. SPDX-License-Identifier: GPL-2.0 + +====== +Design +====== + +Configurable Layers +=================== + +DAMON provides data access monitoring functionality while making the accuracy +and the overhead controllable. The fundamental access monitorings require +primitives that dependent on and optimized for the target address space. On +the other hand, the accuracy and overhead tradeoff mechanism, which is the core +of DAMON, is in the pure logic space. DAMON separates the two parts in +different layers and defines its interface to allow various low level +primitives implementations configurable with the core logic. + +Due to this separated design and the configurable interface, users can extend +DAMON for any address space by configuring the core logics with appropriate low +level primitive implementations. If appropriate one is not provided, users can +implement the primitives on their own. + +For example, physical memory, virtual memory, swap space, those for specific +processes, NUMA nodes, files, and backing memory devices would be supportable. +Also, if some architectures or devices support special optimized access check +primitives, those will be easily configurable. + + +Reference Implementations of Address Space Specific Primitives +============================================================== + +The low level primitives for the fundamental access monitoring are defined in +two parts: + +1. Identification of the monitoring target address range for the address space. +2. Access check of specific address range in the target space. + +DAMON currently provides the implementation of the primitives for only the +virtual address spaces. Below two subsections describe how it works. + + +VMA-based Target Address Range Construction +------------------------------------------- + +Only small parts in the super-huge virtual address space of the processes are +mapped to the physical memory and accessed. Thus, tracking the unmapped +address regions is just wasteful. However, because DAMON can deal with some +level of noise using the adaptive regions adjustment mechanism, tracking every +mapping is not strictly required but could even incur a high overhead in some +cases. That said, too huge unmapped areas inside the monitoring target should +be removed to not take the time for the adaptive mechanism. + +For the reason, this implementation converts the complex mappings to three +distinct regions that cover every mapped area of the address space. The two +gaps between the three regions are the two biggest unmapped areas in the given +address space. The two biggest unmapped areas would be the gap between the +heap and the uppermost mmap()-ed region, and the gap between the lowermost +mmap()-ed region and the stack in most of the cases. Because these gaps are +exceptionally huge in usual address spaces, excluding these will be sufficient +to make a reasonable trade-off. Below shows this in detail:: + + + + + (small mmap()-ed regions and munmap()-ed regions) + + + + + +PTE Accessed-bit Based Access Check +----------------------------------- + +The implementation for the virtual address space uses PTE Accessed-bit for +basic access checks. It finds the relevant PTE Accessed bit from the address +by walking the page table for the target task of the address. In this way, the +implementation finds and clears the bit for next sampling target address and +checks whether the bit set again after one sampling period. This could disturb +other kernel subsystems using the Accessed bits, namely Idle page tracking and +the reclaim logic. To avoid such disturbances, DAMON makes it mutually +exclusive with Idle page tracking and uses ``PG_idle`` and ``PG_young`` page +flags to solve the conflict with the reclaim logic, as Idle page tracking does. + + +Address Space Independent Core Mechanisms +========================================= + +Below four sections describe each of the DAMON core mechanisms and the five +monitoring attributes, ``sampling interval``, ``aggregation interval``, +``regions update interval``, ``minimum number of regions``, and ``maximum +number of regions``. + + +Access Frequency Monitoring +--------------------------- + +The output of DAMON says what pages are how frequently accessed for a given +duration. The resolution of the access frequency is controlled by setting +``sampling interval`` and ``aggregation interval``. In detail, DAMON checks +access to each page per ``sampling interval`` and aggregates the results. In +other words, counts the number of the accesses to each page. After each +``aggregation interval`` passes, DAMON calls callback functions that previously +registered by users so that users can read the aggregated results and then +clears the results. This can be described in below simple pseudo-code:: + + while monitoring_on: + for page in monitoring_target: + if accessed(page): + nr_accesses[page] += 1 + if time() % aggregation_interval == 0: + for callback in user_registered_callbacks: + callback(monitoring_target, nr_accesses) + for page in monitoring_target: + nr_accesses[page] = 0 + sleep(sampling interval) + +The monitoring overhead of this mechanism will arbitrarily increase as the +size of the target workload grows. + + +Region Based Sampling +--------------------- + +To avoid the unbounded increase of the overhead, DAMON groups adjacent pages +that assumed to have the same access frequencies into a region. As long as the +assumption (pages in a region have the same access frequencies) is kept, only +one page in the region is required to be checked. Thus, for each ``sampling +interval``, DAMON randomly picks one page in each region, waits for one +``sampling interval``, checks whether the page is accessed meanwhile, and +increases the access frequency of the region if so. Therefore, the monitoring +overhead is controllable by setting the number of regions. DAMON allows users +to set the minimum and the maximum number of regions for the trade-off. + +This scheme, however, cannot preserve the quality of the output if the +assumption is not guaranteed. + + +Adaptive Regions Adjustment +--------------------------- + +Even somehow the initial monitoring target regions are well constructed to +fulfill the assumption (pages in same region have similar access frequencies), +the data access pattern can be dynamically changed. This will result in low +monitoring quality. To keep the assumption as much as possible, DAMON +adaptively merges and splits each region based on their access frequency. + +For each ``aggregation interval``, it compares the access frequencies of +adjacent regions and merges those if the frequency difference is small. Then, +after it reports and clears the aggregated access frequency of each region, it +splits each region into two or three regions if the total number of regions +will not exceed the user-specified maximum number of regions after the split. + +In this way, DAMON provides its best-effort quality and minimal overhead while +keeping the bounds users set for their trade-off. + + +Dynamic Target Space Updates Handling +------------------------------------- + +The monitoring target address range could dynamically changed. For example, +virtual memory could be dynamically mapped and unmapped. Physical memory could +be hot-plugged. + +As the changes could be quite frequent in some cases, DAMON checks the dynamic +memory mapping changes and applies it to the abstracted target area only for +each of a user-specified time interval (``regions update interval``). diff --git a/Documentation/vm/damon/faq.rst b/Documentation/vm/damon/faq.rst new file mode 100644 index 0000000000000..cb3d8b585a8b3 --- /dev/null +++ b/Documentation/vm/damon/faq.rst @@ -0,0 +1,51 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================== +Frequently Asked Questions +========================== + +Why a new subsystem, instead of extending perf or other user space tools? +========================================================================= + +First, because it needs to be lightweight as much as possible so that it can be +used online, any unnecessary overhead such as kernel - user space context +switching cost should be avoided. Second, DAMON aims to be used by other +programs including the kernel. Therefore, having a dependency on specific +tools like perf is not desirable. These are the two biggest reasons why DAMON +is implemented in the kernel space. + + +Can 'idle pages tracking' or 'perf mem' substitute DAMON? +========================================================= + +Idle page tracking is a low level primitive for access check of the physical +address space. 'perf mem' is similar, though it can use sampling to minimize +the overhead. On the other hand, DAMON is a higher-level framework for the +monitoring of various address spaces. It is focused on memory management +optimization and provides sophisticated accuracy/overhead handling mechanisms. +Therefore, 'idle pages tracking' and 'perf mem' could provide a subset of +DAMON's output, but cannot substitute DAMON. + + +Does DAMON support virtual memory only? +======================================= + +No. The core of the DAMON is address space independent. The address space +specific low level primitive parts including monitoring target regions +constructions and actual access checks can be implemented and configured on the +DAMON core by the users. In this way, DAMON users can monitor any address +space with any access check technique. + +Nonetheless, DAMON provides vma tracking and PTE Accessed bit check based +implementations of the address space dependent functions for the virtual memory +by default, for a reference and convenient use. In near future, we will +provide those for physical memory address space. + + +Can I simply monitor page granularity? +====================================== + +Yes. You can do so by setting the ``min_nr_regions`` attribute higher than the +working set size divided by the page size. Because the monitoring target +regions size is forced to be ``>=page size``, the region split will make no +effect. diff --git a/Documentation/vm/damon/index.rst b/Documentation/vm/damon/index.rst new file mode 100644 index 0000000000000..a2858baf3bf1d --- /dev/null +++ b/Documentation/vm/damon/index.rst @@ -0,0 +1,30 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================== +DAMON: Data Access MONitor +========================== + +DAMON is a data access monitoring framework subsystem for the Linux kernel. +The core mechanisms of DAMON (refer to :doc:`design` for the detail) make it + + - *accurate* (the monitoring output is useful enough for DRAM level memory + management; It might not appropriate for CPU Cache levels, though), + - *light-weight* (the monitoring overhead is low enough to be applied online), + and + - *scalable* (the upper-bound of the overhead is in constant range regardless + of the size of target workloads). + +Using this framework, therefore, the kernel's memory management mechanisms can +make advanced decisions. Experimental memory management optimization works +that incurring high data accesses monitoring overhead could implemented again. +In user space, meanwhile, users who have some special workloads can write +personalized applications for better understanding and optimizations of their +workloads and systems. + +.. toctree:: + :maxdepth: 2 + + faq + design + api + plans diff --git a/Documentation/vm/index.rst b/Documentation/vm/index.rst index eff5fbd492d08..b51f0d8992f8f 100644 --- a/Documentation/vm/index.rst +++ b/Documentation/vm/index.rst @@ -32,6 +32,7 @@ descriptions of data structures and algorithms. arch_pgtable_helpers balance cleancache + damon/index free_page_reporting frontswap highmem From 025b245416239eaa7430d698b86896b36b6851e6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 24 Aug 2021 09:59:49 +1000 Subject: [PATCH 286/365] mm/damon: add kunit tests This commit adds kunit based unit tests for the core and the virtual address spaces monitoring primitives of DAMON. Link: https://lkml.kernel.org/r/20210716081449.22187-12-sj38.park@gmail.com Signed-off-by: SeongJae Park Reviewed-by: Brendan Higgins Cc: Alexander Shishkin Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Fan Du Cc: Fernand Sieber Cc: Greg Kroah-Hartman Cc: Greg Thelen Cc: Ingo Molnar Cc: Joe Perches Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Maximilian Heyne Cc: Mel Gorman Cc: Minchan Kim Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shakeel Butt Cc: Shuah Khan Cc: Steven Rostedt (VMware) Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/damon/Kconfig | 36 +++++ mm/damon/core-test.h | 253 ++++++++++++++++++++++++++++++++ mm/damon/core.c | 7 + mm/damon/dbgfs-test.h | 126 ++++++++++++++++ mm/damon/dbgfs.c | 2 + mm/damon/vaddr-test.h | 329 ++++++++++++++++++++++++++++++++++++++++++ mm/damon/vaddr.c | 7 + 7 files changed, 760 insertions(+) create mode 100644 mm/damon/core-test.h create mode 100644 mm/damon/dbgfs-test.h create mode 100644 mm/damon/vaddr-test.h diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index c8e3dba6fb4cf..37024798a97ca 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -12,6 +12,18 @@ config DAMON See https://damonitor.github.io/doc/html/latest-damon/index.html for more information. +config DAMON_KUNIT_TEST + bool "Test for damon" if !KUNIT_ALL_TESTS + depends on DAMON && KUNIT=y + default KUNIT_ALL_TESTS + help + This builds the DAMON Kunit test suite. + + For more information on KUnit and unit tests in general, please refer + to the KUnit documentation. + + If unsure, say N. + config DAMON_VADDR bool "Data access monitoring primitives for virtual address spaces" depends on DAMON && MMU @@ -20,6 +32,18 @@ config DAMON_VADDR This builds the default data access monitoring primitives for DAMON that works for virtual address spaces. +config DAMON_VADDR_KUNIT_TEST + bool "Test for DAMON primitives" if !KUNIT_ALL_TESTS + depends on DAMON_VADDR && KUNIT=y + default KUNIT_ALL_TESTS + help + This builds the DAMON virtual addresses primitives Kunit test suite. + + For more information on KUnit and unit tests in general, please refer + to the KUnit documentation. + + If unsure, say N. + config DAMON_DBGFS bool "DAMON debugfs interface" depends on DAMON_VADDR && DEBUG_FS @@ -29,4 +53,16 @@ config DAMON_DBGFS If unsure, say N. +config DAMON_DBGFS_KUNIT_TEST + bool "Test for damon debugfs interface" if !KUNIT_ALL_TESTS + depends on DAMON_DBGFS && KUNIT=y + default KUNIT_ALL_TESTS + help + This builds the DAMON debugfs interface Kunit test suite. + + For more information on KUnit and unit tests in general, please refer + to the KUnit documentation. + + If unsure, say N. + endmenu diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h new file mode 100644 index 0000000000000..c938a9c34e6c5 --- /dev/null +++ b/mm/damon/core-test.h @@ -0,0 +1,253 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Data Access Monitor Unit Tests + * + * Copyright 2019 Amazon.com, Inc. or its affiliates. All rights reserved. + * + * Author: SeongJae Park + */ + +#ifdef CONFIG_DAMON_KUNIT_TEST + +#ifndef _DAMON_CORE_TEST_H +#define _DAMON_CORE_TEST_H + +#include + +static void damon_test_regions(struct kunit *test) +{ + struct damon_region *r; + struct damon_target *t; + + r = damon_new_region(1, 2); + KUNIT_EXPECT_EQ(test, 1ul, r->ar.start); + KUNIT_EXPECT_EQ(test, 2ul, r->ar.end); + KUNIT_EXPECT_EQ(test, 0u, r->nr_accesses); + + t = damon_new_target(42); + KUNIT_EXPECT_EQ(test, 0u, damon_nr_regions(t)); + + damon_add_region(r, t); + KUNIT_EXPECT_EQ(test, 1u, damon_nr_regions(t)); + + damon_del_region(r, t); + KUNIT_EXPECT_EQ(test, 0u, damon_nr_regions(t)); + + damon_free_target(t); +} + +static unsigned int nr_damon_targets(struct damon_ctx *ctx) +{ + struct damon_target *t; + unsigned int nr_targets = 0; + + damon_for_each_target(t, ctx) + nr_targets++; + + return nr_targets; +} + +static void damon_test_target(struct kunit *test) +{ + struct damon_ctx *c = damon_new_ctx(); + struct damon_target *t; + + t = damon_new_target(42); + KUNIT_EXPECT_EQ(test, 42ul, t->id); + KUNIT_EXPECT_EQ(test, 0u, nr_damon_targets(c)); + + damon_add_target(c, t); + KUNIT_EXPECT_EQ(test, 1u, nr_damon_targets(c)); + + damon_destroy_target(t); + KUNIT_EXPECT_EQ(test, 0u, nr_damon_targets(c)); + + damon_destroy_ctx(c); +} + +/* + * Test kdamond_reset_aggregated() + * + * DAMON checks access to each region and aggregates this information as the + * access frequency of each region. In detail, it increases '->nr_accesses' of + * regions that an access has confirmed. 'kdamond_reset_aggregated()' flushes + * the aggregated information ('->nr_accesses' of each regions) to the result + * buffer. As a result of the flushing, the '->nr_accesses' of regions are + * initialized to zero. + */ +static void damon_test_aggregate(struct kunit *test) +{ + struct damon_ctx *ctx = damon_new_ctx(); + unsigned long target_ids[] = {1, 2, 3}; + unsigned long saddr[][3] = {{10, 20, 30}, {5, 42, 49}, {13, 33, 55} }; + unsigned long eaddr[][3] = {{15, 27, 40}, {31, 45, 55}, {23, 44, 66} }; + unsigned long accesses[][3] = {{42, 95, 84}, {10, 20, 30}, {0, 1, 2} }; + struct damon_target *t; + struct damon_region *r; + int it, ir; + + damon_set_targets(ctx, target_ids, 3); + + it = 0; + damon_for_each_target(t, ctx) { + for (ir = 0; ir < 3; ir++) { + r = damon_new_region(saddr[it][ir], eaddr[it][ir]); + r->nr_accesses = accesses[it][ir]; + damon_add_region(r, t); + } + it++; + } + kdamond_reset_aggregated(ctx); + it = 0; + damon_for_each_target(t, ctx) { + ir = 0; + /* '->nr_accesses' should be zeroed */ + damon_for_each_region(r, t) { + KUNIT_EXPECT_EQ(test, 0u, r->nr_accesses); + ir++; + } + /* regions should be preserved */ + KUNIT_EXPECT_EQ(test, 3, ir); + it++; + } + /* targets also should be preserved */ + KUNIT_EXPECT_EQ(test, 3, it); + + damon_destroy_ctx(ctx); +} + +static void damon_test_split_at(struct kunit *test) +{ + struct damon_ctx *c = damon_new_ctx(); + struct damon_target *t; + struct damon_region *r; + + t = damon_new_target(42); + r = damon_new_region(0, 100); + damon_add_region(r, t); + damon_split_region_at(c, t, r, 25); + KUNIT_EXPECT_EQ(test, r->ar.start, 0ul); + KUNIT_EXPECT_EQ(test, r->ar.end, 25ul); + + r = damon_next_region(r); + KUNIT_EXPECT_EQ(test, r->ar.start, 25ul); + KUNIT_EXPECT_EQ(test, r->ar.end, 100ul); + + damon_free_target(t); + damon_destroy_ctx(c); +} + +static void damon_test_merge_two(struct kunit *test) +{ + struct damon_target *t; + struct damon_region *r, *r2, *r3; + int i; + + t = damon_new_target(42); + r = damon_new_region(0, 100); + r->nr_accesses = 10; + damon_add_region(r, t); + r2 = damon_new_region(100, 300); + r2->nr_accesses = 20; + damon_add_region(r2, t); + + damon_merge_two_regions(t, r, r2); + KUNIT_EXPECT_EQ(test, r->ar.start, 0ul); + KUNIT_EXPECT_EQ(test, r->ar.end, 300ul); + KUNIT_EXPECT_EQ(test, r->nr_accesses, 16u); + + i = 0; + damon_for_each_region(r3, t) { + KUNIT_EXPECT_PTR_EQ(test, r, r3); + i++; + } + KUNIT_EXPECT_EQ(test, i, 1); + + damon_free_target(t); +} + +static struct damon_region *__nth_region_of(struct damon_target *t, int idx) +{ + struct damon_region *r; + unsigned int i = 0; + + damon_for_each_region(r, t) { + if (i++ == idx) + return r; + } + + return NULL; +} + +static void damon_test_merge_regions_of(struct kunit *test) +{ + struct damon_target *t; + struct damon_region *r; + unsigned long sa[] = {0, 100, 114, 122, 130, 156, 170, 184}; + unsigned long ea[] = {100, 112, 122, 130, 156, 170, 184, 230}; + unsigned int nrs[] = {0, 0, 10, 10, 20, 30, 1, 2}; + + unsigned long saddrs[] = {0, 114, 130, 156, 170}; + unsigned long eaddrs[] = {112, 130, 156, 170, 230}; + int i; + + t = damon_new_target(42); + for (i = 0; i < ARRAY_SIZE(sa); i++) { + r = damon_new_region(sa[i], ea[i]); + r->nr_accesses = nrs[i]; + damon_add_region(r, t); + } + + damon_merge_regions_of(t, 9, 9999); + /* 0-112, 114-130, 130-156, 156-170 */ + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 5u); + for (i = 0; i < 5; i++) { + r = __nth_region_of(t, i); + KUNIT_EXPECT_EQ(test, r->ar.start, saddrs[i]); + KUNIT_EXPECT_EQ(test, r->ar.end, eaddrs[i]); + } + damon_free_target(t); +} + +static void damon_test_split_regions_of(struct kunit *test) +{ + struct damon_ctx *c = damon_new_ctx(); + struct damon_target *t; + struct damon_region *r; + + t = damon_new_target(42); + r = damon_new_region(0, 22); + damon_add_region(r, t); + damon_split_regions_of(c, t, 2); + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 2u); + damon_free_target(t); + + t = damon_new_target(42); + r = damon_new_region(0, 220); + damon_add_region(r, t); + damon_split_regions_of(c, t, 4); + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 4u); + damon_free_target(t); + damon_destroy_ctx(c); +} + +static struct kunit_case damon_test_cases[] = { + KUNIT_CASE(damon_test_target), + KUNIT_CASE(damon_test_regions), + KUNIT_CASE(damon_test_aggregate), + KUNIT_CASE(damon_test_split_at), + KUNIT_CASE(damon_test_merge_two), + KUNIT_CASE(damon_test_merge_regions_of), + KUNIT_CASE(damon_test_split_regions_of), + {}, +}; + +static struct kunit_suite damon_test_suite = { + .name = "damon", + .test_cases = damon_test_cases, +}; +kunit_test_suite(damon_test_suite); + +#endif /* _DAMON_CORE_TEST_H */ + +#endif /* CONFIG_DAMON_KUNIT_TEST */ diff --git a/mm/damon/core.c b/mm/damon/core.c index 59033488402e8..30e9211f494a7 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -16,6 +16,11 @@ #define CREATE_TRACE_POINTS #include +#ifdef CONFIG_DAMON_KUNIT_TEST +#undef DAMON_MIN_REGION +#define DAMON_MIN_REGION 1 +#endif + /* Get a random number in [l, r) */ #define damon_rand(l, r) (l + prandom_u32_max(r - l)) @@ -711,3 +716,5 @@ static int kdamond_fn(void *data) do_exit(0); } + +#include "core-test.h" diff --git a/mm/damon/dbgfs-test.h b/mm/damon/dbgfs-test.h new file mode 100644 index 0000000000000..930e83bceef03 --- /dev/null +++ b/mm/damon/dbgfs-test.h @@ -0,0 +1,126 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * DAMON Debugfs Interface Unit Tests + * + * Author: SeongJae Park + */ + +#ifdef CONFIG_DAMON_DBGFS_KUNIT_TEST + +#ifndef _DAMON_DBGFS_TEST_H +#define _DAMON_DBGFS_TEST_H + +#include + +static void damon_dbgfs_test_str_to_target_ids(struct kunit *test) +{ + char *question; + unsigned long *answers; + unsigned long expected[] = {12, 35, 46}; + ssize_t nr_integers = 0, i; + + question = "123"; + answers = str_to_target_ids(question, strnlen(question, 128), + &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers); + KUNIT_EXPECT_EQ(test, 123ul, answers[0]); + kfree(answers); + + question = "123abc"; + answers = str_to_target_ids(question, strnlen(question, 128), + &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers); + KUNIT_EXPECT_EQ(test, 123ul, answers[0]); + kfree(answers); + + question = "a123"; + answers = str_to_target_ids(question, strnlen(question, 128), + &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); + kfree(answers); + + question = "12 35"; + answers = str_to_target_ids(question, strnlen(question, 128), + &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)2, nr_integers); + for (i = 0; i < nr_integers; i++) + KUNIT_EXPECT_EQ(test, expected[i], answers[i]); + kfree(answers); + + question = "12 35 46"; + answers = str_to_target_ids(question, strnlen(question, 128), + &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)3, nr_integers); + for (i = 0; i < nr_integers; i++) + KUNIT_EXPECT_EQ(test, expected[i], answers[i]); + kfree(answers); + + question = "12 35 abc 46"; + answers = str_to_target_ids(question, strnlen(question, 128), + &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)2, nr_integers); + for (i = 0; i < 2; i++) + KUNIT_EXPECT_EQ(test, expected[i], answers[i]); + kfree(answers); + + question = ""; + answers = str_to_target_ids(question, strnlen(question, 128), + &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); + kfree(answers); + + question = "\n"; + answers = str_to_target_ids(question, strnlen(question, 128), + &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); + kfree(answers); +} + +static void damon_dbgfs_test_set_targets(struct kunit *test) +{ + struct damon_ctx *ctx = dbgfs_new_ctx(); + unsigned long ids[] = {1, 2, 3}; + char buf[64]; + + /* Make DAMON consider target id as plain number */ + ctx->primitive.target_valid = NULL; + ctx->primitive.cleanup = NULL; + + damon_set_targets(ctx, ids, 3); + sprint_target_ids(ctx, buf, 64); + KUNIT_EXPECT_STREQ(test, (char *)buf, "1 2 3\n"); + + damon_set_targets(ctx, NULL, 0); + sprint_target_ids(ctx, buf, 64); + KUNIT_EXPECT_STREQ(test, (char *)buf, "\n"); + + damon_set_targets(ctx, (unsigned long []){1, 2}, 2); + sprint_target_ids(ctx, buf, 64); + KUNIT_EXPECT_STREQ(test, (char *)buf, "1 2\n"); + + damon_set_targets(ctx, (unsigned long []){2}, 1); + sprint_target_ids(ctx, buf, 64); + KUNIT_EXPECT_STREQ(test, (char *)buf, "2\n"); + + damon_set_targets(ctx, NULL, 0); + sprint_target_ids(ctx, buf, 64); + KUNIT_EXPECT_STREQ(test, (char *)buf, "\n"); + + dbgfs_destroy_ctx(ctx); +} + +static struct kunit_case damon_test_cases[] = { + KUNIT_CASE(damon_dbgfs_test_str_to_target_ids), + KUNIT_CASE(damon_dbgfs_test_set_targets), + {}, +}; + +static struct kunit_suite damon_test_suite = { + .name = "damon-dbgfs", + .test_cases = damon_test_cases, +}; +kunit_test_suite(damon_test_suite); + +#endif /* _DAMON_TEST_H */ + +#endif /* CONFIG_DAMON_KUNIT_TEST */ diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 31ad550ecba2d..faee070977d80 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -619,3 +619,5 @@ static int __init damon_dbgfs_init(void) } module_init(damon_dbgfs_init); + +#include "dbgfs-test.h" diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h new file mode 100644 index 0000000000000..1f5c13257dbaf --- /dev/null +++ b/mm/damon/vaddr-test.h @@ -0,0 +1,329 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Data Access Monitor Unit Tests + * + * Copyright 2019 Amazon.com, Inc. or its affiliates. All rights reserved. + * + * Author: SeongJae Park + */ + +#ifdef CONFIG_DAMON_VADDR_KUNIT_TEST + +#ifndef _DAMON_VADDR_TEST_H +#define _DAMON_VADDR_TEST_H + +#include + +static void __link_vmas(struct vm_area_struct *vmas, ssize_t nr_vmas) +{ + int i, j; + unsigned long largest_gap, gap; + + if (!nr_vmas) + return; + + for (i = 0; i < nr_vmas - 1; i++) { + vmas[i].vm_next = &vmas[i + 1]; + + vmas[i].vm_rb.rb_left = NULL; + vmas[i].vm_rb.rb_right = &vmas[i + 1].vm_rb; + + largest_gap = 0; + for (j = i; j < nr_vmas; j++) { + if (j == 0) + continue; + gap = vmas[j].vm_start - vmas[j - 1].vm_end; + if (gap > largest_gap) + largest_gap = gap; + } + vmas[i].rb_subtree_gap = largest_gap; + } + vmas[i].vm_next = NULL; + vmas[i].vm_rb.rb_right = NULL; + vmas[i].rb_subtree_gap = 0; +} + +/* + * Test __damon_va_three_regions() function + * + * In case of virtual memory address spaces monitoring, DAMON converts the + * complex and dynamic memory mappings of each target task to three + * discontiguous regions which cover every mapped areas. However, the three + * regions should not include the two biggest unmapped areas in the original + * mapping, because the two biggest areas are normally the areas between 1) + * heap and the mmap()-ed regions, and 2) the mmap()-ed regions and stack. + * Because these two unmapped areas are very huge but obviously never accessed, + * covering the region is just a waste. + * + * '__damon_va_three_regions() receives an address space of a process. It + * first identifies the start of mappings, end of mappings, and the two biggest + * unmapped areas. After that, based on the information, it constructs the + * three regions and returns. For more detail, refer to the comment of + * 'damon_init_regions_of()' function definition in 'mm/damon.c' file. + * + * For example, suppose virtual address ranges of 10-20, 20-25, 200-210, + * 210-220, 300-305, and 307-330 (Other comments represent this mappings in + * more short form: 10-20-25, 200-210-220, 300-305, 307-330) of a process are + * mapped. To cover every mappings, the three regions should start with 10, + * and end with 305. The process also has three unmapped areas, 25-200, + * 220-300, and 305-307. Among those, 25-200 and 220-300 are the biggest two + * unmapped areas, and thus it should be converted to three regions of 10-25, + * 200-220, and 300-330. + */ +static void damon_test_three_regions_in_vmas(struct kunit *test) +{ + struct damon_addr_range regions[3] = {0,}; + /* 10-20-25, 200-210-220, 300-305, 307-330 */ + struct vm_area_struct vmas[] = { + (struct vm_area_struct) {.vm_start = 10, .vm_end = 20}, + (struct vm_area_struct) {.vm_start = 20, .vm_end = 25}, + (struct vm_area_struct) {.vm_start = 200, .vm_end = 210}, + (struct vm_area_struct) {.vm_start = 210, .vm_end = 220}, + (struct vm_area_struct) {.vm_start = 300, .vm_end = 305}, + (struct vm_area_struct) {.vm_start = 307, .vm_end = 330}, + }; + + __link_vmas(vmas, 6); + + __damon_va_three_regions(&vmas[0], regions); + + KUNIT_EXPECT_EQ(test, 10ul, regions[0].start); + KUNIT_EXPECT_EQ(test, 25ul, regions[0].end); + KUNIT_EXPECT_EQ(test, 200ul, regions[1].start); + KUNIT_EXPECT_EQ(test, 220ul, regions[1].end); + KUNIT_EXPECT_EQ(test, 300ul, regions[2].start); + KUNIT_EXPECT_EQ(test, 330ul, regions[2].end); +} + +static struct damon_region *__nth_region_of(struct damon_target *t, int idx) +{ + struct damon_region *r; + unsigned int i = 0; + + damon_for_each_region(r, t) { + if (i++ == idx) + return r; + } + + return NULL; +} + +/* + * Test 'damon_va_apply_three_regions()' + * + * test kunit object + * regions an array containing start/end addresses of current + * monitoring target regions + * nr_regions the number of the addresses in 'regions' + * three_regions The three regions that need to be applied now + * expected start/end addresses of monitoring target regions that + * 'three_regions' are applied + * nr_expected the number of addresses in 'expected' + * + * The memory mapping of the target processes changes dynamically. To follow + * the change, DAMON periodically reads the mappings, simplifies it to the + * three regions, and updates the monitoring target regions to fit in the three + * regions. The update of current target regions is the role of + * 'damon_va_apply_three_regions()'. + * + * This test passes the given target regions and the new three regions that + * need to be applied to the function and check whether it updates the regions + * as expected. + */ +static void damon_do_test_apply_three_regions(struct kunit *test, + unsigned long *regions, int nr_regions, + struct damon_addr_range *three_regions, + unsigned long *expected, int nr_expected) +{ + struct damon_ctx *ctx = damon_new_ctx(); + struct damon_target *t; + struct damon_region *r; + int i; + + t = damon_new_target(42); + for (i = 0; i < nr_regions / 2; i++) { + r = damon_new_region(regions[i * 2], regions[i * 2 + 1]); + damon_add_region(r, t); + } + damon_add_target(ctx, t); + + damon_va_apply_three_regions(t, three_regions); + + for (i = 0; i < nr_expected / 2; i++) { + r = __nth_region_of(t, i); + KUNIT_EXPECT_EQ(test, r->ar.start, expected[i * 2]); + KUNIT_EXPECT_EQ(test, r->ar.end, expected[i * 2 + 1]); + } + + damon_destroy_ctx(ctx); +} + +/* + * This function test most common case where the three big regions are only + * slightly changed. Target regions should adjust their boundary (10-20-30, + * 50-55, 70-80, 90-100) to fit with the new big regions or remove target + * regions (57-79) that now out of the three regions. + */ +static void damon_test_apply_three_regions1(struct kunit *test) +{ + /* 10-20-30, 50-55-57-59, 70-80-90-100 */ + unsigned long regions[] = {10, 20, 20, 30, 50, 55, 55, 57, 57, 59, + 70, 80, 80, 90, 90, 100}; + /* 5-27, 45-55, 73-104 */ + struct damon_addr_range new_three_regions[3] = { + (struct damon_addr_range){.start = 5, .end = 27}, + (struct damon_addr_range){.start = 45, .end = 55}, + (struct damon_addr_range){.start = 73, .end = 104} }; + /* 5-20-27, 45-55, 73-80-90-104 */ + unsigned long expected[] = {5, 20, 20, 27, 45, 55, + 73, 80, 80, 90, 90, 104}; + + damon_do_test_apply_three_regions(test, regions, ARRAY_SIZE(regions), + new_three_regions, expected, ARRAY_SIZE(expected)); +} + +/* + * Test slightly bigger change. Similar to above, but the second big region + * now require two target regions (50-55, 57-59) to be removed. + */ +static void damon_test_apply_three_regions2(struct kunit *test) +{ + /* 10-20-30, 50-55-57-59, 70-80-90-100 */ + unsigned long regions[] = {10, 20, 20, 30, 50, 55, 55, 57, 57, 59, + 70, 80, 80, 90, 90, 100}; + /* 5-27, 56-57, 65-104 */ + struct damon_addr_range new_three_regions[3] = { + (struct damon_addr_range){.start = 5, .end = 27}, + (struct damon_addr_range){.start = 56, .end = 57}, + (struct damon_addr_range){.start = 65, .end = 104} }; + /* 5-20-27, 56-57, 65-80-90-104 */ + unsigned long expected[] = {5, 20, 20, 27, 56, 57, + 65, 80, 80, 90, 90, 104}; + + damon_do_test_apply_three_regions(test, regions, ARRAY_SIZE(regions), + new_three_regions, expected, ARRAY_SIZE(expected)); +} + +/* + * Test a big change. The second big region has totally freed and mapped to + * different area (50-59 -> 61-63). The target regions which were in the old + * second big region (50-55-57-59) should be removed and new target region + * covering the second big region (61-63) should be created. + */ +static void damon_test_apply_three_regions3(struct kunit *test) +{ + /* 10-20-30, 50-55-57-59, 70-80-90-100 */ + unsigned long regions[] = {10, 20, 20, 30, 50, 55, 55, 57, 57, 59, + 70, 80, 80, 90, 90, 100}; + /* 5-27, 61-63, 65-104 */ + struct damon_addr_range new_three_regions[3] = { + (struct damon_addr_range){.start = 5, .end = 27}, + (struct damon_addr_range){.start = 61, .end = 63}, + (struct damon_addr_range){.start = 65, .end = 104} }; + /* 5-20-27, 61-63, 65-80-90-104 */ + unsigned long expected[] = {5, 20, 20, 27, 61, 63, + 65, 80, 80, 90, 90, 104}; + + damon_do_test_apply_three_regions(test, regions, ARRAY_SIZE(regions), + new_three_regions, expected, ARRAY_SIZE(expected)); +} + +/* + * Test another big change. Both of the second and third big regions (50-59 + * and 70-100) has totally freed and mapped to different area (30-32 and + * 65-68). The target regions which were in the old second and third big + * regions should now be removed and new target regions covering the new second + * and third big regions should be crated. + */ +static void damon_test_apply_three_regions4(struct kunit *test) +{ + /* 10-20-30, 50-55-57-59, 70-80-90-100 */ + unsigned long regions[] = {10, 20, 20, 30, 50, 55, 55, 57, 57, 59, + 70, 80, 80, 90, 90, 100}; + /* 5-7, 30-32, 65-68 */ + struct damon_addr_range new_three_regions[3] = { + (struct damon_addr_range){.start = 5, .end = 7}, + (struct damon_addr_range){.start = 30, .end = 32}, + (struct damon_addr_range){.start = 65, .end = 68} }; + /* expect 5-7, 30-32, 65-68 */ + unsigned long expected[] = {5, 7, 30, 32, 65, 68}; + + damon_do_test_apply_three_regions(test, regions, ARRAY_SIZE(regions), + new_three_regions, expected, ARRAY_SIZE(expected)); +} + +static void damon_test_split_evenly(struct kunit *test) +{ + struct damon_ctx *c = damon_new_ctx(); + struct damon_target *t; + struct damon_region *r; + unsigned long i; + + KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(NULL, NULL, 5), + -EINVAL); + + t = damon_new_target(42); + r = damon_new_region(0, 100); + KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(t, r, 0), -EINVAL); + + damon_add_region(r, t); + KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(t, r, 10), 0); + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 10u); + + i = 0; + damon_for_each_region(r, t) { + KUNIT_EXPECT_EQ(test, r->ar.start, i++ * 10); + KUNIT_EXPECT_EQ(test, r->ar.end, i * 10); + } + damon_free_target(t); + + t = damon_new_target(42); + r = damon_new_region(5, 59); + damon_add_region(r, t); + KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(t, r, 5), 0); + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 5u); + + i = 0; + damon_for_each_region(r, t) { + if (i == 4) + break; + KUNIT_EXPECT_EQ(test, r->ar.start, 5 + 10 * i++); + KUNIT_EXPECT_EQ(test, r->ar.end, 5 + 10 * i); + } + KUNIT_EXPECT_EQ(test, r->ar.start, 5 + 10 * i); + KUNIT_EXPECT_EQ(test, r->ar.end, 59ul); + damon_free_target(t); + + t = damon_new_target(42); + r = damon_new_region(5, 6); + damon_add_region(r, t); + KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(t, r, 2), -EINVAL); + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1u); + + damon_for_each_region(r, t) { + KUNIT_EXPECT_EQ(test, r->ar.start, 5ul); + KUNIT_EXPECT_EQ(test, r->ar.end, 6ul); + } + damon_free_target(t); + damon_destroy_ctx(c); +} + +static struct kunit_case damon_test_cases[] = { + KUNIT_CASE(damon_test_three_regions_in_vmas), + KUNIT_CASE(damon_test_apply_three_regions1), + KUNIT_CASE(damon_test_apply_three_regions2), + KUNIT_CASE(damon_test_apply_three_regions3), + KUNIT_CASE(damon_test_apply_three_regions4), + KUNIT_CASE(damon_test_split_evenly), + {}, +}; + +static struct kunit_suite damon_test_suite = { + .name = "damon-primitives", + .test_cases = damon_test_cases, +}; +kunit_test_suite(damon_test_suite); + +#endif /* _DAMON_VADDR_TEST_H */ + +#endif /* CONFIG_DAMON_VADDR_KUNIT_TEST */ diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index e61211152615a..230db74132786 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -16,6 +16,11 @@ #include #include +#ifdef CONFIG_DAMON_VADDR_KUNIT_TEST +#undef DAMON_MIN_REGION +#define DAMON_MIN_REGION 1 +#endif + /* Get a random number in [l, r) */ #define damon_rand(l, r) (l + prandom_u32_max(r - l)) @@ -605,3 +610,5 @@ void damon_va_set_primitives(struct damon_ctx *ctx) ctx->primitive.target_valid = damon_va_target_valid; ctx->primitive.cleanup = NULL; } + +#include "vaddr-test.h" From c81c9fb383c4435d6ad663a22e8b112b66d447ba Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 24 Aug 2021 09:59:49 +1000 Subject: [PATCH 287/365] mm/damon: add user space selftests This commit adds a simple user space tests for DAMON. The tests are using kselftest framework. Link: https://lkml.kernel.org/r/20210716081449.22187-13-sj38.park@gmail.com Signed-off-by: SeongJae Park Reviewed-by: Markus Boehme Cc: Alexander Shishkin Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Fan Du Cc: Fernand Sieber Cc: Greg Kroah-Hartman Cc: Greg Thelen Cc: Ingo Molnar Cc: Joe Perches Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Maximilian Heyne Cc: Mel Gorman Cc: Minchan Kim Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shakeel Butt Cc: Shuah Khan Cc: Steven Rostedt (VMware) Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- tools/testing/selftests/damon/Makefile | 7 ++ .../selftests/damon/_chk_dependency.sh | 28 +++++++ .../testing/selftests/damon/debugfs_attrs.sh | 75 +++++++++++++++++++ 3 files changed, 110 insertions(+) create mode 100644 tools/testing/selftests/damon/Makefile create mode 100644 tools/testing/selftests/damon/_chk_dependency.sh create mode 100644 tools/testing/selftests/damon/debugfs_attrs.sh diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile new file mode 100644 index 0000000000000..8a3f2cd9fec0c --- /dev/null +++ b/tools/testing/selftests/damon/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0 +# Makefile for damon selftests + +TEST_FILES = _chk_dependency.sh +TEST_PROGS = debugfs_attrs.sh + +include ../lib.mk diff --git a/tools/testing/selftests/damon/_chk_dependency.sh b/tools/testing/selftests/damon/_chk_dependency.sh new file mode 100644 index 0000000000000..0189db81550be --- /dev/null +++ b/tools/testing/selftests/damon/_chk_dependency.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +DBGFS=/sys/kernel/debug/damon + +if [ $EUID -ne 0 ]; +then + echo "Run as root" + exit $ksft_skip +fi + +if [ ! -d "$DBGFS" ] +then + echo "$DBGFS not found" + exit $ksft_skip +fi + +for f in attrs target_ids monitor_on +do + if [ ! -f "$DBGFS/$f" ] + then + echo "$f not found" + exit 1 + fi +done diff --git a/tools/testing/selftests/damon/debugfs_attrs.sh b/tools/testing/selftests/damon/debugfs_attrs.sh new file mode 100644 index 0000000000000..bfabb19dc0d3d --- /dev/null +++ b/tools/testing/selftests/damon/debugfs_attrs.sh @@ -0,0 +1,75 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +test_write_result() { + file=$1 + content=$2 + orig_content=$3 + expect_reason=$4 + expected=$5 + + echo "$content" > "$file" + if [ $? -ne "$expected" ] + then + echo "writing $content to $file doesn't return $expected" + echo "expected because: $expect_reason" + echo "$orig_content" > "$file" + exit 1 + fi +} + +test_write_succ() { + test_write_result "$1" "$2" "$3" "$4" 0 +} + +test_write_fail() { + test_write_result "$1" "$2" "$3" "$4" 1 +} + +test_content() { + file=$1 + orig_content=$2 + expected=$3 + expect_reason=$4 + + content=$(cat "$file") + if [ "$content" != "$expected" ] + then + echo "reading $file expected $expected but $content" + echo "expected because: $expect_reason" + echo "$orig_content" > "$file" + exit 1 + fi +} + +source ./_chk_dependency.sh + +# Test attrs file +# =============== + +file="$DBGFS/attrs" +orig_content=$(cat "$file") + +test_write_succ "$file" "1 2 3 4 5" "$orig_content" "valid input" +test_write_fail "$file" "1 2 3 4" "$orig_content" "no enough fields" +test_write_fail "$file" "1 2 3 5 4" "$orig_content" \ + "min_nr_regions > max_nr_regions" +test_content "$file" "$orig_content" "1 2 3 4 5" "successfully written" +echo "$orig_content" > "$file" + +# Test target_ids file +# ==================== + +file="$DBGFS/target_ids" +orig_content=$(cat "$file") + +test_write_succ "$file" "1 2 3 4" "$orig_content" "valid input" +test_write_succ "$file" "1 2 abc 4" "$orig_content" "still valid input" +test_content "$file" "$orig_content" "1 2" "non-integer was there" +test_write_succ "$file" "abc 2 3" "$orig_content" "the file allows wrong input" +test_content "$file" "$orig_content" "" "wrong input written" +test_write_succ "$file" "" "$orig_content" "empty input" +test_content "$file" "$orig_content" "" "empty input written" +echo "$orig_content" > "$file" + +echo "PASS" From 3629dd09173d54b0378b6ee977504d22c200e7a8 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 24 Aug 2021 09:59:49 +1000 Subject: [PATCH 288/365] MAINTAINERS: update for DAMON This commit updates MAINTAINERS file for DAMON related files. Link: https://lkml.kernel.org/r/20210716081449.22187-14-sj38.park@gmail.com Signed-off-by: SeongJae Park Reviewed-by: Markus Boehme Cc: Alexander Shishkin Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Fan Du Cc: Fernand Sieber Cc: Greg Kroah-Hartman Cc: Greg Thelen Cc: Ingo Molnar Cc: Joe Perches Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Maximilian Heyne Cc: Mel Gorman Cc: Minchan Kim Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shakeel Butt Cc: Shuah Khan Cc: Steven Rostedt (VMware) Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- MAINTAINERS | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index d7b4f32875a94..ee6d3088e955b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5125,6 +5125,17 @@ F: net/ax25/ax25_out.c F: net/ax25/ax25_timer.c F: net/ax25/sysctl_net_ax25.c +DATA ACCESS MONITOR +M: SeongJae Park +L: linux-mm@kvack.org +S: Maintained +F: Documentation/admin-guide/mm/damon/ +F: Documentation/vm/damon/ +F: include/linux/damon.h +F: include/trace/events/damon.h +F: mm/damon/ +F: tools/testing/selftests/damon/ + DAVICOM FAST ETHERNET (DMFE) NETWORK DRIVER L: netdev@vger.kernel.org S: Orphan From 630e6cc9258c93eacfcc1b3393b1687156f5de7b Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Tue, 24 Aug 2021 09:59:49 +1000 Subject: [PATCH 289/365] fs/buffer.c: add debug print for __getblk_gfp() stall problem Among syzbot's unresolved hung task reports, 18 out of 65 reports contain __getblk_gfp() line in the backtrace. Since there is a comment block that says that __getblk_gfp() will lock up the machine if try_to_free_buffers() attempt from grow_dev_page() is failing, let's start from checking whether syzbot is hitting that case. This change will be removed after the bug is fixed. Link: http://lkml.kernel.org/r/9b9fcdda-c347-53ee-fdbb-8a7d11cf430e@I-love.SAKURA.ne.jp Signed-off-by: Tetsuo Handa Cc: Dmitry Vyukov Cc: Al Viro Cc: Mel Gorman Cc: Michal Hocko Cc: Andi Kleen Cc: Jan Kara Cc: Jeff Layton Cc: Cc: Tim Chen Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/buffer.c | 50 +++++++++++++++++++++++++++++++++++++++++-- include/linux/sched.h | 7 ++++++ lib/Kconfig.debug | 6 ++++++ 3 files changed, 61 insertions(+), 2 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 6290c3afdba48..a09e2c864cc53 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -956,10 +956,20 @@ grow_dev_page(struct block_device *bdev, sector_t block, end_block = init_page_buffers(page, bdev, (sector_t)index << sizebits, size); +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x01; +#endif goto done; } - if (!try_to_free_buffers(page)) + if (!try_to_free_buffers(page)) { +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x02; +#endif goto failed; + } +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x04; +#endif } /* @@ -979,6 +989,9 @@ grow_dev_page(struct block_device *bdev, sector_t block, spin_unlock(&inode->i_mapping->private_lock); done: ret = (block < end_block) ? 1 : -ENXIO; +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x08; +#endif failed: unlock_page(page); put_page(page); @@ -1030,6 +1043,12 @@ __getblk_slow(struct block_device *bdev, sector_t block, return NULL; } +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_stamp = jiffies; + current->getblk_executed = 0; + current->getblk_bh_count = 0; + current->getblk_bh_state = 0; +#endif for (;;) { struct buffer_head *bh; int ret; @@ -1041,6 +1060,18 @@ __getblk_slow(struct block_device *bdev, sector_t block, ret = grow_buffers(bdev, block, size, gfp); if (ret < 0) return NULL; + +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + if (!time_after(jiffies, current->getblk_stamp + 3 * HZ)) + continue; + printk(KERN_ERR "%s(%u): getblk(): executed=%x bh_count=%d bh_state=%lx\n", + current->comm, current->pid, current->getblk_executed, + current->getblk_bh_count, current->getblk_bh_state); + current->getblk_executed = 0; + current->getblk_bh_count = 0; + current->getblk_bh_state = 0; + current->getblk_stamp = jiffies; +#endif } } @@ -3187,6 +3218,11 @@ EXPORT_SYMBOL(sync_dirty_buffer); */ static inline int buffer_busy(struct buffer_head *bh) { +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x80; + current->getblk_bh_count = atomic_read(&bh->b_count); + current->getblk_bh_state = bh->b_state; +#endif return atomic_read(&bh->b_count) | (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock))); } @@ -3225,11 +3261,18 @@ int try_to_free_buffers(struct page *page) int ret = 0; BUG_ON(!PageLocked(page)); - if (PageWriteback(page)) + if (PageWriteback(page)) { +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x10; +#endif return 0; + } if (mapping == NULL) { /* can this still happen? */ ret = drop_buffers(page, &buffers_to_free); +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x20; +#endif goto out; } @@ -3253,6 +3296,9 @@ int try_to_free_buffers(struct page *page) if (ret) cancel_dirty_page(page); spin_unlock(&mapping->private_lock); +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x40; +#endif out: if (buffers_to_free) { struct buffer_head *bh = buffers_to_free; diff --git a/include/linux/sched.h b/include/linux/sched.h index ec8d07d88641c..b36fe121da1d2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1400,6 +1400,13 @@ struct task_struct { struct llist_head kretprobe_instances; #endif +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + unsigned long getblk_stamp; + unsigned int getblk_executed; + unsigned int getblk_bh_count; + unsigned long getblk_bh_state; +#endif + /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 5ddd575159fb8..b91d8239c5ad2 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1807,6 +1807,12 @@ config IO_STRICT_DEVMEM menu "$(SRCARCH) Debugging" +config DEBUG_AID_FOR_SYZBOT + bool "Additional debug code for syzbot" + default n + help + This option is intended for testing by syzbot. + source "arch/$(SRCARCH)/Kconfig.debug" endmenu From fa1618d78bca088e10f9d98fb9f82ae72c80774d Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Tue, 24 Aug 2021 09:59:49 +1000 Subject: [PATCH 290/365] fs/buffer.c: dump more info for __getblk_gfp() stall problem We need to dump more variables on top of "fs/buffer.c: add debug print for __getblk_gfp() stall problem". Link: http://lkml.kernel.org/r/12239545-7d8a-820f-48ba-952e2e98a05c@i-love.sakura.ne.jp Signed-off-by: Tetsuo Handa Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/buffer.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index a09e2c864cc53..f5384cff7e0c5 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1064,9 +1064,15 @@ __getblk_slow(struct block_device *bdev, sector_t block, #ifdef CONFIG_DEBUG_AID_FOR_SYZBOT if (!time_after(jiffies, current->getblk_stamp + 3 * HZ)) continue; - printk(KERN_ERR "%s(%u): getblk(): executed=%x bh_count=%d bh_state=%lx\n", + printk(KERN_ERR "%s(%u): getblk(): executed=%x bh_count=%d bh_state=%lx bdev_super_blocksize=%ld size=%u bdev_super_blocksize_bits=%d bdev_inode_blkbits=%d\n", current->comm, current->pid, current->getblk_executed, - current->getblk_bh_count, current->getblk_bh_state); + current->getblk_bh_count, current->getblk_bh_state, + IS_ERR_OR_NULL(bdev->bd_super) ? -1L : + bdev->bd_super->s_blocksize, size, + IS_ERR_OR_NULL(bdev->bd_super) ? -1 : + bdev->bd_super->s_blocksize_bits, + IS_ERR_OR_NULL(bdev->bd_inode) ? -1 : + bdev->bd_inode->i_blkbits); current->getblk_executed = 0; current->getblk_bh_count = 0; current->getblk_bh_state = 0; From c9776668438f939af982057cd544accd171b157b Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Tue, 24 Aug 2021 09:59:50 +1000 Subject: [PATCH 291/365] kernel/hung_task.c: Monitor killed tasks. syzbot's current top report is "no output from test machine" where the userspace process failed to spawn a new test process for 300 seconds for some reason. One of reasons which can result in this report is that an already spawned test process was unable to terminate (e.g. trapped at an unkillable retry loop due to some bug) after SIGKILL was sent to that process. Therefore, reporting when a thread is failing to terminate despite a fatal signal is pending would give us more useful information. In the context of syzbot's testing where there are only 2 CPUs in the target VM (which means that only small number of threads and not so much memory) and threads get SIGKILL after 5 seconds from fork(), being unable to reach do_exit() within 10 seconds is likely a sign of something went wrong. Therefore, I would like to try this patch in linux-next.git for feasibility testing whether this patch helps finding more bugs and reproducers for such bugs, by bringing "unable to terminate threads" reports out of "no output from test machine" reports. Potential bad effect of this patch will be that kernel code becomes killable without addressing the root cause of being unable to terminate, for use of killable wait will bypass both TASK_UNINTERRUPTIBLE stall test and SIGKILL after 5 seconds behavior, which will result in failing to detect in real systems where SIGKILL won't be sent after 5 seconds when something went wrong. This version shares existing sysctl settings (e.g. check interval, timeout, whether to panic) used for detecting TASK_UNINTERRUPTIBLE threads. We will likely want to use different sysctl settings for monitoring killed threads. But let's start as linux-next.git patch without introducing new sysctl settings. We can add sysctl settings before sending to linux.git. Link: http://lkml.kernel.org/r/60d1d7f6-b201-3dcb-a51b-76a31bcfa919@i-love.sakura.ne.jp Signed-off-by: Tetsuo Handa Cc: Dmitry Vyukov Cc: Petr Mladek Cc: Ingo Molnar Cc: Peter Zijlstra Cc: "Paul E. McKenney" Cc: Vitaly Kuznetsov Cc: Liu Chuansheng Cc: Valdis Kletnieks Cc: linux-kernel@vger.kernel.org Cc: Dmitry Vyukov Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/sched.h | 1 + kernel/hung_task.c | 44 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index b36fe121da1d2..183a1e0e6fb30 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -990,6 +990,7 @@ struct task_struct { #ifdef CONFIG_DETECT_HUNG_TASK unsigned long last_switch_count; unsigned long last_switch_time; + unsigned long killed_time; #endif /* Filesystem information: */ struct fs_struct *fs; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 9888e2bc8c767..8cc07e7f29aa1 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -145,6 +145,47 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) touch_nmi_watchdog(); } +static void check_killed_task(struct task_struct *t, unsigned long timeout) +{ + unsigned long stamp = t->killed_time; + + /* + * Ensure the task is not frozen. + * Also, skip vfork and any other user process that freezer should skip. + */ + if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP))) + return; + /* + * Skip threads which are already inside do_exit(), for exit_mm() etc. + * might take many seconds. + */ + if (t->flags & PF_EXITING) + return; + if (!stamp) { + stamp = jiffies; + if (!stamp) + stamp++; + t->killed_time = stamp; + return; + } + if (time_is_after_jiffies(stamp + timeout * HZ)) + return; + trace_sched_process_hang(t); + if (sysctl_hung_task_panic) { + console_verbose(); + hung_task_call_panic = true; + } + /* + * This thread failed to terminate for more than + * sysctl_hung_task_timeout_secs seconds, complain: + */ + pr_err("INFO: task %s:%d can't die for more than %ld seconds.\n", + t->comm, t->pid, (jiffies - stamp) / HZ); + sched_show_task(t); + hung_task_show_lock = true; + touch_nmi_watchdog(); +} + /* * To avoid extending the RCU grace period for an unbounded amount of time, * periodically exit the critical section and enter a new one. @@ -196,6 +237,9 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) goto unlock; last_break = jiffies; } + /* Check threads which are about to terminate. */ + if (unlikely(fatal_signal_pending(t))) + check_killed_task(t, timeout); /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ if (READ_ONCE(t->__state) == TASK_UNINTERRUPTIBLE) check_hung_task(t, timeout); From ed9b22b7e6ba14ac6a9cdc8c1ad82a1a0b4dddae Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 24 Aug 2021 09:59:50 +1000 Subject: [PATCH 292/365] alpha: agp: make empty macros use do-while-0 style Copy these macros from ia64/include/asm/agp.h to avoid the "empty-body" in 'if' statment warning. drivers/char/agp/generic.c: In function 'agp_generic_destroy_page': ../drivers/char/agp/generic.c:1265:42: warning: suggest braces around empty body in an 'if' statement [-Wempty-body] 1265 | unmap_page_from_agp(page); Link: https://lkml.kernel.org/r/20210809030822.20658-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Cc: David Airlie Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/alpha/include/asm/agp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/alpha/include/asm/agp.h b/arch/alpha/include/asm/agp.h index 7173eada15678..7874f063d0001 100644 --- a/arch/alpha/include/asm/agp.h +++ b/arch/alpha/include/asm/agp.h @@ -6,8 +6,8 @@ /* dummy for now */ -#define map_page_into_agp(page) -#define unmap_page_from_agp(page) +#define map_page_into_agp(page) do { } while (0) +#define unmap_page_from_agp(page) do { } while (0) #define flush_agp_cache() mb() /* GATT allocation. Returns/accepts GATT kernel virtual address. */ From c0d07417ad8a84a6173c78fac23a222aec35e2e0 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 24 Aug 2021 09:59:50 +1000 Subject: [PATCH 293/365] alpha: pci-sysfs: fix all kernel-doc warnings Fix all kernel-doc warnings in arch/alpha/kernel/pci-sysfs.c: ../arch/alpha/kernel/pci-sysfs.c:67: warning: No description found for return value of 'pci_mmap_resource' ../arch/alpha/kernel/pci-sysfs.c:115: warning: Function parameter or member 'pdev' not described in 'pci_remove_resource_files' ../arch/alpha/kernel/pci-sysfs.c:115: warning: Excess function parameter 'dev' description in 'pci_remove_resource_files' ../arch/alpha/kernel/pci-sysfs.c:230: warning: Function parameter or member 'pdev' not described in 'pci_create_resource_files' ../arch/alpha/kernel/pci-sysfs.c:230: warning: Excess function parameter 'dev' description in 'pci_create_resource_files' ../arch/alpha/kernel/pci-sysfs.c:232: warning: No description found for return value of 'pci_create_resource_files' ../arch/alpha/kernel/pci-sysfs.c:305: warning: Function parameter or member 'bus' not described in 'pci_adjust_legacy_attr' ../arch/alpha/kernel/pci-sysfs.c:305: warning: Excess function parameter 'b' description in 'pci_adjust_legacy_attr' Link: https://lkml.kernel.org/r/20210808185249.31442-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/alpha/kernel/pci-sysfs.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/alpha/kernel/pci-sysfs.c b/arch/alpha/kernel/pci-sysfs.c index 0021580d79adb..5808a66e2a81f 100644 --- a/arch/alpha/kernel/pci-sysfs.c +++ b/arch/alpha/kernel/pci-sysfs.c @@ -60,6 +60,8 @@ static int __pci_mmap_fits(struct pci_dev *pdev, int num, * @sparse: address space type * * Use the bus mapping routines to map a PCI resource into userspace. + * + * Return: %0 on success, negative error code otherwise */ static int pci_mmap_resource(struct kobject *kobj, struct bin_attribute *attr, @@ -106,7 +108,7 @@ static int pci_mmap_resource_dense(struct file *filp, struct kobject *kobj, /** * pci_remove_resource_files - cleanup resource files - * @dev: dev to cleanup + * @pdev: pci_dev to cleanup * * If we created resource files for @dev, remove them from sysfs and * free their resources. @@ -221,10 +223,12 @@ static int pci_create_attr(struct pci_dev *pdev, int num) } /** - * pci_create_resource_files - create resource files in sysfs for @dev - * @dev: dev in question + * pci_create_resource_files - create resource files in sysfs for @pdev + * @pdev: pci_dev in question * * Walk the resources in @dev creating files for each resource available. + * + * Return: %0 on success, or negative error code */ int pci_create_resource_files(struct pci_dev *pdev) { @@ -296,7 +300,7 @@ int pci_mmap_legacy_page_range(struct pci_bus *bus, struct vm_area_struct *vma, /** * pci_adjust_legacy_attr - adjustment of legacy file attributes - * @b: bus to create files under + * @bus: bus to create files under * @mmap_type: I/O port or memory * * Adjust file name and size for sparse mappings. From b77d0128f6404978d4472223bdecf65a7878652b Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 24 Aug 2021 09:59:50 +1000 Subject: [PATCH 294/365] percpu: remove export of pcpu_base_addr This is not needed by any modules, so remove the export. Link: https://lkml.kernel.org/r/20210722185814.504541-1-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman Reviewed-by: Christoph Hellwig Cc: Dennis Zhou Cc: Tejun Heo Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/percpu.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/percpu.c b/mm/percpu.c index 7f2e0151c4e25..a43039629aa46 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -146,7 +146,6 @@ static unsigned int pcpu_high_unit_cpu __ro_after_init; /* the address of the first chunk which starts with the kernel static area */ void *pcpu_base_addr __ro_after_init; -EXPORT_SYMBOL_GPL(pcpu_base_addr); static const int *pcpu_unit_map __ro_after_init; /* cpu -> unit */ const unsigned long *pcpu_unit_offsets __ro_after_init; /* cpu -> unit offset */ From 4b50d7a0ab0535e2c62f8f3e65a81c04da20bda2 Mon Sep 17 00:00:00 2001 From: Feng Zhou Date: Tue, 24 Aug 2021 09:59:50 +1000 Subject: [PATCH 295/365] fs/proc/kcore.c: add mmap interface When we do the kernel monitor, use the DRGN (https://github.com/osandov/drgn) access to kernel data structures, found that the system calls a lot. DRGN is implemented by reading /proc/kcore. After looking at the kcore code, it is found that kcore does not implement mmap, resulting in frequent context switching triggered by read. Therefore, we want to add mmap interface to optimize performance. Since vmalloc and module areas will change with allocation and release, consistency cannot be guaranteed, so mmap interface only maps KCORE_TEXT and KCORE_RAM. The test results: 1. the default version of kcore real 11.00 user 8.53 sys 3.59 % time seconds usecs/call calls errors syscall ------ ----------- ----------- --------- --------- ---------------- 99.64 128.578319 12 11168701 pread64 ... ------ ----------- ----------- --------- --------- ---------------- 100.00 129.042853 11193748 966 total 2. added kcore for the mmap interface real 6.44 user 7.32 sys 0.24 % time seconds usecs/call calls errors syscall ------ ----------- ----------- --------- --------- ---------------- 32.94 0.130120 24 5317 315 futex 11.66 0.046077 21 2231 1 lstat 9.23 0.036449 177 206 mmap ... ------ ----------- ----------- --------- --------- ---------------- 100.00 0.395077 25435 971 total The test results show that the number of system calls and time consumption are significantly reduced. Link: https://lkml.kernel.org/r/20210704062208.7898-1-zhoufeng.zf@bytedance.com Co-developed-by: Ying Chen Signed-off-by: Ying Chen Signed-off-by: Feng Zhou Cc: Alexey Dobriyan Cc: Mike Rapoport Cc: Muchun Song Cc: Chengming Zhou Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/proc/kcore.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 982e694aae77d..3f148759a5fd1 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -614,11 +614,84 @@ static int release_kcore(struct inode *inode, struct file *file) return 0; } +static vm_fault_t mmap_kcore_fault(struct vm_fault *vmf) +{ + return VM_FAULT_SIGBUS; +} + +static const struct vm_operations_struct kcore_mmap_ops = { + .fault = mmap_kcore_fault, +}; + +static int mmap_kcore(struct file *file, struct vm_area_struct *vma) +{ + size_t size = vma->vm_end - vma->vm_start; + u64 start, end, pfn; + int nphdr; + size_t data_offset; + size_t phdrs_len, notes_len; + struct kcore_list *m = NULL; + int ret = 0; + + down_read(&kclist_lock); + + get_kcore_size(&nphdr, &phdrs_len, ¬es_len, &data_offset); + + data_offset &= PAGE_MASK; + start = (u64)vma->vm_pgoff << PAGE_SHIFT; + if (start < data_offset) { + ret = -EINVAL; + goto out; + } + start = kc_offset_to_vaddr(start - data_offset); + end = start + size; + + list_for_each_entry(m, &kclist_head, list) { + if (start >= m->addr && end <= m->addr + m->size) + break; + } + + if (&m->list == &kclist_head) { + ret = -EINVAL; + goto out; + } + + if (vma->vm_flags & (VM_WRITE | VM_EXEC)) { + ret = -EPERM; + goto out; + } + + vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC); + vma->vm_flags |= VM_MIXEDMAP; + vma->vm_ops = &kcore_mmap_ops; + + if (kern_addr_valid(start)) { + if (m->type == KCORE_RAM) + pfn = __pa(start) >> PAGE_SHIFT; + else if (m->type == KCORE_TEXT) + pfn = __pa_symbol(start) >> PAGE_SHIFT; + else { + ret = -EFAULT; + goto out; + } + + ret = remap_pfn_range(vma, vma->vm_start, pfn, size, + vma->vm_page_prot); + } else { + ret = -EFAULT; + } + +out: + up_read(&kclist_lock); + return ret; +} + static const struct proc_ops kcore_proc_ops = { .proc_read = read_kcore, .proc_open = open_kcore, .proc_release = release_kcore, .proc_lseek = default_llseek, + .proc_mmap = mmap_kcore, }; /* just remember that we have to update kcore */ From ea82408797a634f5b1a5482ca38108487eb36393 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Aug 2021 09:59:51 +1000 Subject: [PATCH 296/365] proc: stop using seq_get_buf in proc_task_name Use seq_escape_str and seq_printf instead of poking holes into the seq_file abstraction. Link: https://lkml.kernel.org/r/20210810151945.1795567-1-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Christian Brauner Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/proc/array.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index ee0ce8cecc4a7..49be8c8ef555e 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -98,27 +98,17 @@ void proc_task_name(struct seq_file *m, struct task_struct *p, bool escape) { - char *buf; - size_t size; char tcomm[64]; - int ret; if (p->flags & PF_WQ_WORKER) wq_worker_comm(tcomm, sizeof(tcomm), p); else __get_task_comm(tcomm, sizeof(tcomm), p); - size = seq_get_buf(m, &buf); - if (escape) { - ret = string_escape_str(tcomm, buf, size, - ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\"); - if (ret >= size) - ret = -1; - } else { - ret = strscpy(buf, tcomm, size); - } - - seq_commit(m, ret); + if (escape) + seq_escape_str(m, tcomm, ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\"); + else + seq_printf(m, "%.64s", tcomm); } /* From 3e2ab1ea4bc6ba7ef5b60adfa6b6a7dcd52495d4 Mon Sep 17 00:00:00 2001 From: Ohhoon Kwon Date: Tue, 24 Aug 2021 09:59:51 +1000 Subject: [PATCH 297/365] connector: send event on write to /proc/[pid]/comm While comm change event via prctl has been reported to proc connector by 'commit f786ecba4158 ("connector: add comm change event report to proc connector")', connector listeners were missing comm changes by explicit writes on /proc/[pid]/comm. Let explicit writes on /proc/[pid]/comm report to proc connector. Link: https://lkml.kernel.org/r/20210701133458epcms1p68e9eb9bd0eee8903ba26679a37d9d960@epcms1p6 Signed-off-by: Ohhoon Kwon Cc: Ingo Molnar Cc: David S. Miller Cc: Christian Brauner Cc: Eric W. Biederman Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/proc/base.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index e5b5f7709d48f..533d5836eb9a4 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -95,6 +95,7 @@ #include #include #include +#include #include #include "internal.h" #include "fd.h" @@ -1674,8 +1675,10 @@ static ssize_t comm_write(struct file *file, const char __user *buf, if (!p) return -ESRCH; - if (same_thread_group(current, p)) + if (same_thread_group(current, p)) { set_task_comm(p, buffer); + proc_comm_connector(p); + } else count = -EINVAL; From 78fba7d75a77137bd3dae0296f76a2a47a9dd9d1 Mon Sep 17 00:00:00 2001 From: Julius Hemanth Pitti Date: Tue, 24 Aug 2021 09:59:51 +1000 Subject: [PATCH 298/365] proc/sysctl: make protected_* world readable protected_* files have 600 permissions which prevents non-superuser from reading them. Container like "AWS greengrass" refuse to launch unless protected_hardlinks and protected_symlinks are set. When containers like these run with "userns-remap" or "--user" mapping container's root to non-superuser on host, they fail to run due to denied read access to these files. As these protections are hardly a secret, and do not possess any security risk, making them world readable. Though above greengrass usecase needs read access to only protected_hardlinks and protected_symlinks files, setting all other protected_* files to 644 to keep consistency. Link: http://lkml.kernel.org/r/20200709235115.56954-1-jpitti@cisco.com Fixes: 800179c9b8a1 ("fs: add link restrictions") Signed-off-by: Julius Hemanth Pitti Acked-by: Kees Cook Cc: Iurii Zaikin Cc: Luis Chamberlain Cc: Ingo Molnar Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- kernel/sysctl.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 297f0b3966bd3..65bc6f713c665 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -3275,7 +3275,7 @@ static struct ctl_table fs_table[] = { .procname = "protected_symlinks", .data = &sysctl_protected_symlinks, .maxlen = sizeof(int), - .mode = 0600, + .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, @@ -3284,7 +3284,7 @@ static struct ctl_table fs_table[] = { .procname = "protected_hardlinks", .data = &sysctl_protected_hardlinks, .maxlen = sizeof(int), - .mode = 0600, + .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, @@ -3293,7 +3293,7 @@ static struct ctl_table fs_table[] = { .procname = "protected_fifos", .data = &sysctl_protected_fifos, .maxlen = sizeof(int), - .mode = 0600, + .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &two, @@ -3302,7 +3302,7 @@ static struct ctl_table fs_table[] = { .procname = "protected_regular", .data = &sysctl_protected_regular, .maxlen = sizeof(int), - .mode = 0600, + .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &two, From a091be2b2ff4f537c21f9404c78ba271c19f1f63 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 24 Aug 2021 09:59:51 +1000 Subject: [PATCH 299/365] arch: Kconfig: fix spelling mistake "seperate" -> "separate" Threre is a spelling mistake in the Kconfig text. Fix it. Link: https://lkml.kernel.org/r/20210704095207.37342-1-colin.king@canonical.com Signed-off-by: Colin Ian King Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/Kconfig b/arch/Kconfig index 1baac7bfdd441..4f7596092ad8f 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -914,7 +914,7 @@ config HAVE_SOFTIRQ_ON_OWN_STACK bool help Architecture provides a function to run __do_softirq() on a - seperate stack. + separate stack. config PGTABLE_LEVELS int From f3c06663a7136bee7d22c8fa4b83427a62989b8a Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 24 Aug 2021 09:59:51 +1000 Subject: [PATCH 300/365] include/linux/once.h: fix trivia typo Not -> Note Fix trivia typo Not -> Note in the comment to DO_ONCE(). Link: https://lkml.kernel.org/r/20210722184349.76290-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/once.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/once.h b/include/linux/once.h index ae6f4eb41cbe7..d361fb14ac3a2 100644 --- a/include/linux/once.h +++ b/include/linux/once.h @@ -16,7 +16,7 @@ void __do_once_done(bool *done, struct static_key_true *once_key, * out the condition into a nop. DO_ONCE() guarantees type safety of * arguments! * - * Not that the following is not equivalent ... + * Note that the following is not equivalent ... * * DO_ONCE(func, arg); * DO_ONCE(func, arg); From ba7f0e65b60477f9f7bcc7660d977e3ccf2b2170 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Tue, 24 Aug 2021 09:59:52 +1000 Subject: [PATCH 301/365] units: change from 'L' to 'UL' Patch series "Add Hz macros", v3. There are multiple definitions of the HZ_PER_MHZ or HZ_PER_KHZ in the different drivers. Instead of duplicating this definition again and again, add one in the units.h header to be reused in all the place the redefiniton occurs. At the same time, change the type of the Watts, as they can not be negative. This patch (of 10): The users of the macros are safe to be assigned with an unsigned instead of signed as the variables using them are themselves unsigned. Link: https://lkml.kernel.org/r/20210816114732.1834145-1-daniel.lezcano@linaro.org Link: https://lkml.kernel.org/r/20210816114732.1834145-2-daniel.lezcano@linaro.org Signed-off-by: Daniel Lezcano Cc: Andy Shevchenko Cc: Jonathan Cameron Cc: Christian Eggers Cc: Lukasz Luba Cc: MyungJoo Ham Cc: Kyungmin Park Cc: Lars-Peter Clausen Cc: Peter Meerwald Cc: Zhang Rui Cc: Guenter Roeck Cc: Miquel Raynal Cc: Maxime Coquelin Cc: "Rafael J. Wysocki" Cc: Daniel Lezcano Cc: Chanwoo Choi Cc: Jonathan Cameron Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/units.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/units.h b/include/linux/units.h index dcc30a53fa939..ff51d3cfc6a0e 100644 --- a/include/linux/units.h +++ b/include/linux/units.h @@ -4,9 +4,9 @@ #include -#define MILLIWATT_PER_WATT 1000L -#define MICROWATT_PER_MILLIWATT 1000L -#define MICROWATT_PER_WATT 1000000L +#define MILLIWATT_PER_WATT 1000UL +#define MICROWATT_PER_MILLIWATT 1000UL +#define MICROWATT_PER_WATT 1000000UL #define ABSOLUTE_ZERO_MILLICELSIUS -273150 From a297177e577a9b07db4bd80d75ce9b562be42903 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Tue, 24 Aug 2021 09:59:52 +1000 Subject: [PATCH 302/365] units: add the HZ macros The macros for the unit conversion for frequency are duplicated in different places. Provide these macros in the 'units' header, so they can be reused. Link: https://lkml.kernel.org/r/20210816114732.1834145-3-daniel.lezcano@linaro.org Signed-off-by: Daniel Lezcano Reviewed-by: Christian Eggers Reviewed-by: Andy Shevchenko Cc: Chanwoo Choi Cc: Guenter Roeck Cc: Jonathan Cameron Cc: Jonathan Cameron Cc: Kyungmin Park Cc: Lars-Peter Clausen Cc: Lukasz Luba Cc: Maxime Coquelin Cc: Miquel Raynal Cc: MyungJoo Ham Cc: Peter Meerwald Cc: "Rafael J. Wysocki" Cc: Zhang Rui Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/units.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/units.h b/include/linux/units.h index ff51d3cfc6a0e..8b8dc8a84d93c 100644 --- a/include/linux/units.h +++ b/include/linux/units.h @@ -4,6 +4,10 @@ #include +#define HZ_PER_KHZ 1000UL +#define KHZ_PER_MHZ 1000UL +#define HZ_PER_MHZ 1000000UL + #define MILLIWATT_PER_WATT 1000UL #define MICROWATT_PER_MILLIWATT 1000UL #define MICROWATT_PER_WATT 1000000UL From bb5e0747600abd35df85412c13d28773bf3250ae Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Tue, 24 Aug 2021 09:59:52 +1000 Subject: [PATCH 303/365] thermal/drivers/devfreq_cooling: use HZ macros HZ unit conversion macros are available in units.h, use them and remove the duplicate definition. The new macro uses a unsigned long type which is already the type in the current code via the 'freq' variable. Link: https://lkml.kernel.org/r/20210816114732.1834145-4-daniel.lezcano@linaro.org Signed-off-by: Daniel Lezcano Reviewed-by: Andy Shevchenko Reviewed-by: Christian Eggers Cc: Chanwoo Choi Cc: Guenter Roeck Cc: Jonathan Cameron Cc: Jonathan Cameron Cc: Kyungmin Park Cc: Lars-Peter Clausen Cc: Lukasz Luba Cc: Maxime Coquelin Cc: Miquel Raynal Cc: MyungJoo Ham Cc: Peter Meerwald Cc: "Rafael J. Wysocki" Cc: Zhang Rui Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- drivers/thermal/devfreq_cooling.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/thermal/devfreq_cooling.c b/drivers/thermal/devfreq_cooling.c index 5a86cffd78f61..4310cb342a9fb 100644 --- a/drivers/thermal/devfreq_cooling.c +++ b/drivers/thermal/devfreq_cooling.c @@ -18,10 +18,10 @@ #include #include #include +#include #include -#define HZ_PER_KHZ 1000 #define SCALE_ERROR_MITIGATION 100 /** From 43e3ae774d9511270af74a8b7b9652fe91c86f8c Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Tue, 24 Aug 2021 09:59:52 +1000 Subject: [PATCH 304/365] devfreq: use HZ macros HZ unit conversion macros are available in units.h, use them and remove the duplicate definition. The new macro has an unsigned long type. All the code is dealing with unsigned long and the code using the macro is doing a coercitive cast to unsigned long. Link: https://lkml.kernel.org/r/20210816114732.1834145-5-daniel.lezcano@linaro.org Signed-off-by: Daniel Lezcano Reviewed-by: Christian Eggers Reviewed-by: Andy Shevchenko Acked-by: Chanwoo Choi Cc: Guenter Roeck Cc: Jonathan Cameron Cc: Jonathan Cameron Cc: Kyungmin Park Cc: Lars-Peter Clausen Cc: Lukasz Luba Cc: Maxime Coquelin Cc: Miquel Raynal Cc: MyungJoo Ham Cc: Peter Meerwald Cc: "Rafael J. Wysocki" Cc: Zhang Rui Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- drivers/devfreq/devfreq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c index 28f3e0ba6cdd9..85faa7a5c7d12 100644 --- a/drivers/devfreq/devfreq.c +++ b/drivers/devfreq/devfreq.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "governor.h" #define CREATE_TRACE_POINTS @@ -34,7 +35,6 @@ #define IS_SUPPORTED_FLAG(f, name) ((f & DEVFREQ_GOV_FLAG_##name) ? true : false) #define IS_SUPPORTED_ATTR(f, name) ((f & DEVFREQ_GOV_ATTR_##name) ? true : false) -#define HZ_PER_KHZ 1000 static struct class *devfreq_class; static struct dentry *devfreq_debugfs; From 50c528e559fe79f698615ea3afe3affc4a04dc3d Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Tue, 24 Aug 2021 09:59:52 +1000 Subject: [PATCH 305/365] iio/drivers/as73211: use HZ macros HZ unit conversion macros are available in units.h, use them and remove the duplicate definition. Link: https://lkml.kernel.org/r/20210816114732.1834145-6-daniel.lezcano@linaro.org Signed-off-by: Daniel Lezcano Reviewed-by: Christian Eggers Reviewed-by: Andy Shevchenko Acked-by: Jonathan Cameron Cc: Chanwoo Choi Cc: Guenter Roeck Cc: Jonathan Cameron Cc: Kyungmin Park Cc: Lars-Peter Clausen Cc: Lukasz Luba Cc: Maxime Coquelin Cc: Miquel Raynal Cc: MyungJoo Ham Cc: Peter Meerwald Cc: "Rafael J. Wysocki" Cc: Zhang Rui Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- drivers/iio/light/as73211.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/iio/light/as73211.c b/drivers/iio/light/as73211.c index 7b32dfaee9b32..3ba2378df3ddb 100644 --- a/drivers/iio/light/as73211.c +++ b/drivers/iio/light/as73211.c @@ -24,8 +24,7 @@ #include #include #include - -#define HZ_PER_KHZ 1000 +#include #define AS73211_DRV_NAME "as73211" From 792b27a95ebda89fed11fe2864431c58130019b0 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Tue, 24 Aug 2021 09:59:52 +1000 Subject: [PATCH 306/365] hwmon/drivers/mr75203: use HZ macros HZ unit conversion macros are available in units.h, use them and remove the duplicate definition. The new macro is an unsigned long. The code dealing with it is considering as an unsigned long also. Link: https://lkml.kernel.org/r/20210816114732.1834145-7-daniel.lezcano@linaro.org Signed-off-by: Daniel Lezcano Reviewed-by: Christian Eggers Reviewed-by: Andy Shevchenko Acked-by: Guenter Roeck Cc: Chanwoo Choi Cc: Jonathan Cameron Cc: Jonathan Cameron Cc: Kyungmin Park Cc: Lars-Peter Clausen Cc: Lukasz Luba Cc: Maxime Coquelin Cc: Miquel Raynal Cc: MyungJoo Ham Cc: Peter Meerwald Cc: "Rafael J. Wysocki" Cc: Zhang Rui Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- drivers/hwmon/mr75203.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/mr75203.c b/drivers/hwmon/mr75203.c index 18da5a25e89ab..868243dba1ee0 100644 --- a/drivers/hwmon/mr75203.c +++ b/drivers/hwmon/mr75203.c @@ -17,6 +17,7 @@ #include #include #include +#include /* PVT Common register */ #define PVT_IP_CONFIG 0x04 @@ -37,7 +38,6 @@ #define CLK_SYNTH_EN BIT(24) #define CLK_SYS_CYCLES_MAX 514 #define CLK_SYS_CYCLES_MIN 2 -#define HZ_PER_MHZ 1000000L #define SDIF_DISABLE 0x04 From 000babd7a2bd9da652a822c0a57b6107d58d3f63 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Tue, 24 Aug 2021 09:59:53 +1000 Subject: [PATCH 307/365] iio/drivers/hid-sensor: use HZ macros HZ unit conversion macros are available in units.h, use them and remove the duplicate definition. Link: https://lkml.kernel.org/r/20210816114732.1834145-8-daniel.lezcano@linaro.org Signed-off-by: Daniel Lezcano Reviewed-by: Andy Shevchenko Acked-by: Jonathan Cameron Cc: Chanwoo Choi Cc: Christian Eggers Cc: Guenter Roeck Cc: Jonathan Cameron Cc: Kyungmin Park Cc: Lars-Peter Clausen Cc: Lukasz Luba Cc: Maxime Coquelin Cc: Miquel Raynal Cc: MyungJoo Ham Cc: Peter Meerwald Cc: "Rafael J. Wysocki" Cc: Zhang Rui Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- drivers/iio/common/hid-sensors/hid-sensor-attributes.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/iio/common/hid-sensors/hid-sensor-attributes.c b/drivers/iio/common/hid-sensors/hid-sensor-attributes.c index 043f199e7bc62..9b279937a24e0 100644 --- a/drivers/iio/common/hid-sensors/hid-sensor-attributes.c +++ b/drivers/iio/common/hid-sensors/hid-sensor-attributes.c @@ -6,12 +6,11 @@ #include #include #include +#include #include #include -#define HZ_PER_MHZ 1000000L - static struct { u32 usage_id; int unit; /* 0 for default others from HID sensor spec */ From 12fa3d0692c479ae5d64b6944437872dc5068d03 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Tue, 24 Aug 2021 09:59:53 +1000 Subject: [PATCH 308/365] i2c/drivers/ov02q10: use HZ macros HZ unit conversion macros are available in units.h, use them and remove the duplicate definition. Link: https://lkml.kernel.org/r/20210816114732.1834145-9-daniel.lezcano@linaro.org Signed-off-by: Daniel Lezcano Reviewed-by: Andy Shevchenko Cc: Chanwoo Choi Cc: Christian Eggers Cc: Guenter Roeck Cc: Jonathan Cameron Cc: Jonathan Cameron Cc: Kyungmin Park Cc: Lars-Peter Clausen Cc: Lukasz Luba Cc: Maxime Coquelin Cc: Miquel Raynal Cc: MyungJoo Ham Cc: Peter Meerwald Cc: "Rafael J. Wysocki" Cc: Zhang Rui Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- drivers/media/i2c/ov02a10.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/i2c/ov02a10.c b/drivers/media/i2c/ov02a10.c index a3ce5500d3551..0f08c05333ea1 100644 --- a/drivers/media/i2c/ov02a10.c +++ b/drivers/media/i2c/ov02a10.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -64,7 +65,6 @@ /* Test pattern control */ #define OV02A10_REG_TEST_PATTERN 0xb6 -#define HZ_PER_MHZ 1000000L #define OV02A10_LINK_FREQ_390MHZ (390 * HZ_PER_MHZ) #define OV02A10_ECLK_FREQ (24 * HZ_PER_MHZ) From af1969db0f3832262a866323634c59d01209b0e4 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Tue, 24 Aug 2021 09:59:53 +1000 Subject: [PATCH 309/365] mtd/drivers/nand: use HZ macros HZ unit conversion macros are available in units.h, use them and remove the duplicate definition. Link: https://lkml.kernel.org/r/20210816114732.1834145-10-daniel.lezcano@linaro.org Signed-off-by: Daniel Lezcano Acked-by: Miquel Raynal Reviewed-by: Andy Shevchenko Cc: Chanwoo Choi Cc: Christian Eggers Cc: Guenter Roeck Cc: Jonathan Cameron Cc: Jonathan Cameron Cc: Kyungmin Park Cc: Lars-Peter Clausen Cc: Lukasz Luba Cc: Maxime Coquelin Cc: MyungJoo Ham Cc: Peter Meerwald Cc: "Rafael J. Wysocki" Cc: Zhang Rui Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- drivers/mtd/nand/raw/intel-nand-controller.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mtd/nand/raw/intel-nand-controller.c b/drivers/mtd/nand/raw/intel-nand-controller.c index 8b49fd56cf964..709f0402fbae8 100644 --- a/drivers/mtd/nand/raw/intel-nand-controller.c +++ b/drivers/mtd/nand/raw/intel-nand-controller.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #define EBU_CLC 0x000 @@ -102,7 +103,6 @@ #define MAX_CS 2 -#define HZ_PER_MHZ 1000000L #define USEC_PER_SEC 1000000L struct ebu_nand_cs { From 4c7ed330e62fc107fa2a01020f19c2012baacf1b Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Tue, 24 Aug 2021 09:59:53 +1000 Subject: [PATCH 310/365] phy/drivers/stm32: use HZ macros HZ unit conversion macros are available in units.h, use them and remove the duplicate definition. Link: https://lkml.kernel.org/r/20210816114732.1834145-11-daniel.lezcano@linaro.org Signed-off-by: Daniel Lezcano Reviewed-by: Andy Shevchenko Cc: Chanwoo Choi Cc: Christian Eggers Cc: Guenter Roeck Cc: Jonathan Cameron Cc: Jonathan Cameron Cc: Kyungmin Park Cc: Lars-Peter Clausen Cc: Lukasz Luba Cc: Maxime Coquelin Cc: Miquel Raynal Cc: MyungJoo Ham Cc: Peter Meerwald Cc: "Rafael J. Wysocki" Cc: Zhang Rui Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- drivers/phy/st/phy-stm32-usbphyc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/phy/st/phy-stm32-usbphyc.c b/drivers/phy/st/phy-stm32-usbphyc.c index 3e491dfb25250..937a14fa7448a 100644 --- a/drivers/phy/st/phy-stm32-usbphyc.c +++ b/drivers/phy/st/phy-stm32-usbphyc.c @@ -15,6 +15,7 @@ #include #include #include +#include #define STM32_USBPHYC_PLL 0x0 #define STM32_USBPHYC_MISC 0x8 @@ -47,7 +48,6 @@ #define PLL_FVCO_MHZ 2880 #define PLL_INFF_MIN_RATE_HZ 19200000 #define PLL_INFF_MAX_RATE_HZ 38400000 -#define HZ_PER_MHZ 1000000L struct pll_params { u8 ndiv; From 3693e9621222298ace0461c5078b178ef385cde5 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Tue, 24 Aug 2021 09:59:53 +1000 Subject: [PATCH 311/365] kernel/acct.c: use dedicated helper to access rlimit values Use rlimit() helper instead of manually writing whole chain from task to rlimit value. See patch "posix-cpu-timers: Use dedicated helper to access rlimit values". Link: https://lkml.kernel.org/r/20210728030822.524789-1-yang.yang29@zte.com.cn Signed-off-by: Yang Yang Reported-by: Zeal Robot Cc: Randy Dunlap Cc: sh_def@163.com Cc: Yang Yang Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- kernel/acct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/acct.c b/kernel/acct.c index a64102be2bb06..23a7ab8e6cbc8 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -478,7 +478,7 @@ static void do_acct_process(struct bsd_acct_struct *acct) /* * Accounting records are not subject to resource limits. */ - flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + flim = rlimit(RLIMIT_FSIZE); current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; /* Perform file operations on behalf of whoever enabled accounting */ orig_cred = override_creds(file->f_cred); From 3944869d304bb1fedc7f7eb56e9376826524badf Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 24 Aug 2021 09:59:54 +1000 Subject: [PATCH 312/365] math: make RATIONAL tristate Patch series "math: RATIONAL and RATIONAL_KUNIT_TEST improvements". This series makes the RATIONAL symbol tristate, so it is not forced builtin if all users are modular, and makes the RATIONAL_KUNIT_TEST depend on RATIONAL, to avoid enabling RATIONAL if there are no real users. This patch (of 2): All but one symbols that select RATIONAL are tristate, but RATIONAL itself is bool. Change it to tristate, so the rational fractions support code can be modular if no builtin code relies on it. Link: https://lkml.kernel.org/r/20210706100945.3803694-1-geert@linux-m68k.org Link: https://lkml.kernel.org/r/20210706100945.3803694-2-geert@linux-m68k.org Signed-off-by: Geert Uytterhoeven Reviewed-by: Andy Shevchenko Cc: Trent Piepho Cc: Colin Ian King Cc: Brendan Higgins Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- lib/math/Kconfig | 2 +- lib/math/rational.c | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/math/Kconfig b/lib/math/Kconfig index f19bc9734fa7c..0634b428d0cb7 100644 --- a/lib/math/Kconfig +++ b/lib/math/Kconfig @@ -14,4 +14,4 @@ config PRIME_NUMBERS If unsure, say N. config RATIONAL - bool + tristate diff --git a/lib/math/rational.c b/lib/math/rational.c index c0ab51d8fbb98..ec59d426ea638 100644 --- a/lib/math/rational.c +++ b/lib/math/rational.c @@ -13,6 +13,7 @@ #include #include #include +#include /* * calculate best rational approximation for a given fraction @@ -106,3 +107,5 @@ void rational_best_approximation( } EXPORT_SYMBOL(rational_best_approximation); + +MODULE_LICENSE("GPL v2"); From abf80b1710e0f1c4d8c065e03f513f4736a83623 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 24 Aug 2021 09:59:54 +1000 Subject: [PATCH 313/365] math: RATIONAL_KUNIT_TEST should depend on RATIONAL instead of selecting it RATIONAL_KUNIT_TEST selects RATIONAL, thus enabling an optional feature the user may not want to have enabled. Fix this by making the test depend on RATIONAL instead. Link: https://lkml.kernel.org/r/20210706100945.3803694-3-geert@linux-m68k.org Fixes: b6c75c4afceb8bc0 ("lib/math/rational: add Kunit test cases") Signed-off-by: Geert Uytterhoeven Cc: Andy Shevchenko Cc: Brendan Higgins Cc: Colin Ian King Cc: Trent Piepho Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- lib/Kconfig.debug | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index b91d8239c5ad2..2193f8f9fd8f6 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2466,8 +2466,7 @@ config SLUB_KUNIT_TEST config RATIONAL_KUNIT_TEST tristate "KUnit test for rational.c" if !KUNIT_ALL_TESTS - depends on KUNIT - select RATIONAL + depends on KUNIT && RATIONAL default KUNIT_ALL_TESTS help This builds the rational math unit test. From c8707e54528c9e52b4b300ad41521401fea607ac Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 24 Aug 2021 09:59:54 +1000 Subject: [PATCH 314/365] lib/string: optimized memcpy Patch series "lib/string: optimized mem* functions", v2. Rewrite the generic mem{cpy,move,set} so that memory is accessed with the widest size possible, but without doing unaligned accesses. This was originally posted as C string functions for RISC-V[1], but as there was no specific RISC-V code, it was proposed for the generic lib/string.c implementation. Tested on RISC-V and on x86_64 by undefining __HAVE_ARCH_MEM{CPY,SET,MOVE} and HAVE_EFFICIENT_UNALIGNED_ACCESS. These are the performances of memcpy() and memset() of a RISC-V machine on a 32 mbyte buffer: memcpy: original aligned: 75 Mb/s original unaligned: 75 Mb/s new aligned: 114 Mb/s new unaligned: 107 Mb/s memset: original aligned: 140 Mb/s original unaligned: 140 Mb/s new aligned: 241 Mb/s new unaligned: 241 Mb/s The size increase is negligible: $ scripts/bloat-o-meter vmlinux.orig vmlinux add/remove: 0/0 grow/shrink: 4/1 up/down: 427/-6 (421) Function old new delta memcpy 29 351 +322 memset 29 117 +88 strlcat 68 78 +10 strlcpy 50 57 +7 memmove 56 50 -6 Total: Before=8556964, After=8557385, chg +0.00% These functions will be used for RISC-V initially. [1] https://lore.kernel.org/linux-riscv/20210617152754.17960-1-mcroce@linux.microsoft.com/ The only architecture which will use all the three function will be riscv, while memmove() will be used by arc, h8300, hexagon, ia64, openrisc and parisc. Keep in mind that memmove() isn't anything special, it just calls memcpy() when possible (e.g. buffers not overlapping), and fallbacks to the byte by byte copy otherwise. In future we can write two functions, one which copies forward and another one which copies backward, and call the right one depending on the buffers position. Then, we could alias memcpy() and memmove(), as proposed by Linus: https://bugzilla.redhat.com/show_bug.cgi?id=638477#c132 This patch (of 3): Rewrite the generic memcpy() to copy a word at time, without generating unaligned accesses. The procedure is made of three steps: First copy data one byte at time until the destination buffer is aligned to a long boundary. Then copy the data one long at time shifting the current and the next long to compose a long at every cycle. Finally, copy the remainder one byte at time. This is the improvement on RISC-V: original aligned: 75 Mb/s original unaligned: 75 Mb/s new aligned: 114 Mb/s new unaligned: 107 Mb/s and this the binary size increase according to bloat-o-meter: Function old new delta memcpy 36 324 +288 Link: https://lkml.kernel.org/r/20210702123153.14093-1-mcroce@linux.microsoft.com Link: https://lkml.kernel.org/r/20210702123153.14093-2-mcroce@linux.microsoft.com Signed-off-by: Matteo Croce Cc: Nick Kossifidis Cc: Guo Ren Cc: Christoph Hellwig Cc: David Laight Cc: Palmer Dabbelt Cc: Emil Renner Berthing Cc: Drew Fustini Cc: Nick Desaulniers Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- lib/string.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 77 insertions(+), 3 deletions(-) diff --git a/lib/string.c b/lib/string.c index 77bd0b1d32967..1053044883197 100644 --- a/lib/string.c +++ b/lib/string.c @@ -33,6 +33,23 @@ #include #include +#define BYTES_LONG sizeof(long) +#define WORD_MASK (BYTES_LONG - 1) +#define MIN_THRESHOLD (BYTES_LONG * 2) + +/* convenience union to avoid cast between different pointer types */ +union types { + u8 *as_u8; + unsigned long *as_ulong; + uintptr_t as_uptr; +}; + +union const_types { + const u8 *as_u8; + const unsigned long *as_ulong; + uintptr_t as_uptr; +}; + #ifndef __HAVE_ARCH_STRNCASECMP /** * strncasecmp - Case insensitive, length-limited string comparison @@ -869,6 +886,13 @@ EXPORT_SYMBOL(memset64); #endif #ifndef __HAVE_ARCH_MEMCPY + +#ifdef __BIG_ENDIAN +#define MERGE_UL(h, l, d) ((h) << ((d) * 8) | (l) >> ((BYTES_LONG - (d)) * 8)) +#else +#define MERGE_UL(h, l, d) ((h) >> ((d) * 8) | (l) << ((BYTES_LONG - (d)) * 8)) +#endif + /** * memcpy - Copy one area of memory to another * @dest: Where to copy to @@ -880,14 +904,64 @@ EXPORT_SYMBOL(memset64); */ void *memcpy(void *dest, const void *src, size_t count) { - char *tmp = dest; - const char *s = src; + union const_types s = { .as_u8 = src }; + union types d = { .as_u8 = dest }; + int distance = 0; + + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) { + if (count < MIN_THRESHOLD) + goto copy_remainder; + + /* Copy a byte at time until destination is aligned. */ + for (; d.as_uptr & WORD_MASK; count--) + *d.as_u8++ = *s.as_u8++; + + distance = s.as_uptr & WORD_MASK; + } + + if (distance) { + unsigned long last, next; + /* + * s is distance bytes ahead of d, and d just reached + * the alignment boundary. Move s backward to word align it + * and shift data to compensate for distance, in order to do + * word-by-word copy. + */ + s.as_u8 -= distance; + + next = s.as_ulong[0]; + for (; count >= BYTES_LONG; count -= BYTES_LONG) { + last = next; + next = s.as_ulong[1]; + + d.as_ulong[0] = MERGE_UL(last, next, distance); + + d.as_ulong++; + s.as_ulong++; + } + + /* Restore s with the original offset. */ + s.as_u8 += distance; + } else { + /* + * If the source and dest lower bits are the same, do a simple + * 32/64 bit wide copy. + */ + for (; count >= BYTES_LONG; count -= BYTES_LONG) + *d.as_ulong++ = *s.as_ulong++; + } + +copy_remainder: while (count--) - *tmp++ = *s++; + *d.as_u8++ = *s.as_u8++; + return dest; } EXPORT_SYMBOL(memcpy); + +#undef MERGE_UL + #endif #ifndef __HAVE_ARCH_MEMMOVE From bd6e8357c5a271808431dec66d0ed39a3bd891c8 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 24 Aug 2021 09:59:54 +1000 Subject: [PATCH 315/365] lib/string: optimized memmove When the destination buffer is before the source one, or when the buffers doesn't overlap, it's safe to use memcpy() instead, which is optimized to use a bigger data size possible. This "optimization" only covers a common case. In future, proper code which does the same thing as memcpy() does but backwards can be done. Link: https://lkml.kernel.org/r/20210702123153.14093-3-mcroce@linux.microsoft.com Signed-off-by: Matteo Croce Cc: Christoph Hellwig Cc: David Laight Cc: Drew Fustini Cc: Emil Renner Berthing Cc: Guo Ren Cc: Nick Desaulniers Cc: Nick Kossifidis Cc: Palmer Dabbelt Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- lib/string.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/lib/string.c b/lib/string.c index 1053044883197..4d3c589897e2b 100644 --- a/lib/string.c +++ b/lib/string.c @@ -975,19 +975,13 @@ EXPORT_SYMBOL(memcpy); */ void *memmove(void *dest, const void *src, size_t count) { - char *tmp; - const char *s; + if (dest < src || src + count <= dest) + return memcpy(dest, src, count); + + if (dest > src) { + const char *s = src + count; + char *tmp = dest + count; - if (dest <= src) { - tmp = dest; - s = src; - while (count--) - *tmp++ = *s++; - } else { - tmp = dest; - tmp += count; - s = src; - s += count; while (count--) *--tmp = *--s; } From 371f85776350be9bc4807ae5a367195f9d43a002 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 24 Aug 2021 09:59:54 +1000 Subject: [PATCH 316/365] lib/string: optimized memset The generic memset is defined as a byte at time write. This is always safe, but it's slower than a 4 byte or even 8 byte write. Write a generic memset which fills the data one byte at time until the destination is aligned, then fills using the largest size allowed, and finally fills the remaining data one byte at time. On a RISC-V machine the speed goes from 140 Mb/s to 241 Mb/s, and this the binary size increase according to bloat-o-meter: Function old new delta memset 32 148 +116 Link: https://lkml.kernel.org/r/20210702123153.14093-4-mcroce@linux.microsoft.com Signed-off-by: Matteo Croce Cc: Christoph Hellwig Cc: David Laight Cc: Drew Fustini Cc: Emil Renner Berthing Cc: Guo Ren Cc: Nick Desaulniers Cc: Nick Kossifidis Cc: Palmer Dabbelt Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- lib/string.c | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/lib/string.c b/lib/string.c index 4d3c589897e2b..4fec38fc6e58d 100644 --- a/lib/string.c +++ b/lib/string.c @@ -810,10 +810,38 @@ EXPORT_SYMBOL(__sysfs_match_string); */ void *memset(void *s, int c, size_t count) { - char *xs = s; + union types dest = { .as_u8 = s }; + if (count >= MIN_THRESHOLD) { + unsigned long cu = (unsigned long)c; + + /* Compose an ulong with 'c' repeated 4/8 times */ +#ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER + cu *= 0x0101010101010101UL; +#else + cu |= cu << 8; + cu |= cu << 16; + /* Suppress warning on 32 bit machines */ + cu |= (cu << 16) << 16; +#endif + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) { + /* + * Fill the buffer one byte at time until + * the destination is word aligned. + */ + for (; count && dest.as_uptr & WORD_MASK; count--) + *dest.as_u8++ = c; + } + + /* Copy using the largest size allowed */ + for (; count >= BYTES_LONG; count -= BYTES_LONG) + *dest.as_ulong++ = cu; + } + + /* copy the remainder */ while (count--) - *xs++ = c; + *dest.as_u8++ = c; + return s; } EXPORT_SYMBOL(memset); From 3908d29091aea31419d86e1907bd3450c7c88577 Mon Sep 17 00:00:00 2001 From: Daniel Latypov Date: Tue, 24 Aug 2021 09:59:55 +1000 Subject: [PATCH 317/365] lib/test: convert test_sort.c to use KUnit This follows up commit ebd09577be6c ("lib/test: convert lib/test_list_sort.c to use KUnit"). Converting this test to KUnit makes the test a bit shorter, standardizes how it reports pass/fail, and adds an easier way to run the test [1]. Like ebd09577be6c, this leaves the file and Kconfig option name the same, but slightly changes their dependencies (needs CONFIG_KUNIT). [1] Can be run via $ ./tools/testing/kunit/kunit.py run --kunitconfig /dev/stdin < Cc: Pravin Shedge Cc: Brendan Higgins Cc: David Gow Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- lib/Kconfig.debug | 5 +++-- lib/test_sort.c | 40 +++++++++++++++++++--------------------- 2 files changed, 22 insertions(+), 23 deletions(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 2193f8f9fd8f6..ea630a43d3de2 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2084,8 +2084,9 @@ config TEST_MIN_HEAP If unsure, say N. config TEST_SORT - tristate "Array-based sort test" - depends on DEBUG_KERNEL || m + tristate "Array-based sort test" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS help This option enables the self-test function of 'sort()' at boot, or at module load time. diff --git a/lib/test_sort.c b/lib/test_sort.c index 52edbe10f2e51..be02e3a098cf5 100644 --- a/lib/test_sort.c +++ b/lib/test_sort.c @@ -1,4 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only + +#include + #include #include #include @@ -7,18 +10,17 @@ #define TEST_LEN 1000 -static int __init cmpint(const void *a, const void *b) +static int cmpint(const void *a, const void *b) { return *(int *)a - *(int *)b; } -static int __init test_sort_init(void) +static void test_sort(struct kunit *test) { - int *a, i, r = 1, err = -ENOMEM; + int *a, i, r = 1; - a = kmalloc_array(TEST_LEN, sizeof(*a), GFP_KERNEL); - if (!a) - return err; + a = kunit_kmalloc_array(test, TEST_LEN, sizeof(*a), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, a); for (i = 0; i < TEST_LEN; i++) { r = (r * 725861) % 6599; @@ -27,24 +29,20 @@ static int __init test_sort_init(void) sort(a, TEST_LEN, sizeof(*a), cmpint, NULL); - err = -EINVAL; for (i = 0; i < TEST_LEN-1; i++) - if (a[i] > a[i+1]) { - pr_err("test has failed\n"); - goto exit; - } - err = 0; - pr_info("test passed\n"); -exit: - kfree(a); - return err; + KUNIT_ASSERT_LE(test, a[i], a[i + 1]); } -static void __exit test_sort_exit(void) -{ -} +static struct kunit_case sort_test_cases[] = { + KUNIT_CASE(test_sort), + {} +}; + +static struct kunit_suite sort_test_suite = { + .name = "lib_sort", + .test_cases = sort_test_cases, +}; -module_init(test_sort_init); -module_exit(test_sort_exit); +kunit_test_suites(&sort_test_suite); MODULE_LICENSE("GPL"); From 6682715d704d63bbaad1ff1a9b6f01c942bc8a2c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 24 Aug 2021 09:59:55 +1000 Subject: [PATCH 318/365] lib/dump_stack: correct kernel-doc notation Fix kernel-doc warnings in dump_stack.c: lib/dump_stack.c:97: warning: Function parameter or member 'log_lvl' not described in 'dump_stack_lvl' lib/dump_stack.c:97: warning: expecting prototype for dump_stack(). Prototype was for dump_stack_lvl() instead Link: https://lkml.kernel.org/r/20210809051643.17567-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Cc: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- lib/dump_stack.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/dump_stack.c b/lib/dump_stack.c index cd3387bb34e56..6b7f1bf6715d8 100644 --- a/lib/dump_stack.c +++ b/lib/dump_stack.c @@ -89,7 +89,8 @@ static void __dump_stack(const char *log_lvl) } /** - * dump_stack - dump the current task information and its stack trace + * dump_stack_lvl - dump the current task information and its stack trace + * @log_lvl: log level * * Architectures can override this implementation by implementing its own. */ From d9f6e28c8caf18f54d6a23ef288f716198eda993 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 24 Aug 2021 09:59:55 +1000 Subject: [PATCH 319/365] lib/iov_iter.c: fix kernel-doc warnings Fix all kernel-doc warnings in lib/iov_iter.c: lib/iov_iter.c:695: warning: Function parameter or member 'i' not described in '_copy_mc_to_iter' lib/iov_iter.c:695: warning: Excess function parameter 'iter' description in '_copy_mc_to_iter' lib/iov_iter.c:695: warning: No description found for return value of '_copy_mc_to_iter' lib/iov_iter.c:758: warning: Function parameter or member 'i' not described in '_copy_from_iter_flushcache' lib/iov_iter.c:758: warning: Excess function parameter 'iter' description in '_copy_from_iter_flushcache' lib/iov_iter.c:758: warning: No description found for return value of '_copy_from_iter_flushcache' Link: https://lkml.kernel.org/r/20210809051053.6531-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- lib/iov_iter.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/lib/iov_iter.c b/lib/iov_iter.c index e23123ae3a137..f2d50d69a6c36 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -672,7 +672,7 @@ static size_t copy_mc_pipe_to_iter(const void *addr, size_t bytes, * _copy_mc_to_iter - copy to iter with source memory error exception handling * @addr: source kernel address * @bytes: total transfer length - * @iter: destination iterator + * @i: destination iterator * * The pmem driver deploys this for the dax operation * (dax_copy_to_iter()) for dax reads (bypass page-cache and the @@ -690,6 +690,8 @@ static size_t copy_mc_pipe_to_iter(const void *addr, size_t bytes, * * ITER_KVEC, ITER_PIPE, and ITER_BVEC can return short copies. * Compare to copy_to_iter() where only ITER_IOVEC attempts might return * a short copy. + * + * Return: number of bytes copied (may be %0) */ size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { @@ -744,7 +746,7 @@ EXPORT_SYMBOL(_copy_from_iter_nocache); * _copy_from_iter_flushcache - write destination through cpu cache * @addr: destination kernel address * @bytes: total transfer length - * @iter: source iterator + * @i: source iterator * * The pmem driver arranges for filesystem-dax to use this facility via * dax_copy_from_iter() for ensuring that writes to persistent memory @@ -753,6 +755,8 @@ EXPORT_SYMBOL(_copy_from_iter_nocache); * all iterator types. The _copy_from_iter_nocache() only attempts to * bypass the cache for the ITER_IOVEC case, and on some archs may use * instructions that strand dirty-data in the cache. + * + * Return: number of bytes copied (may be %0) */ size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i) { From e296d831fe2e5a439d52e7ccea56039e719c7d22 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Tue, 24 Aug 2021 09:59:55 +1000 Subject: [PATCH 320/365] bitops: protect find_first_{,zero}_bit properly Patch series "Resend bitmap patches". This patch (of 17): find_first_bit() and find_first_zero_bit() are not protected with ifdefs as other functions in find.h. It causes build errors on some platforms if CONFIG_GENERIC_FIND_FIRST_BIT is enabled. Link: https://lkml.kernel.org/r/20210814211713.180533-1-yury.norov@gmail.com Link: https://lkml.kernel.org/r/20210814211713.180533-2-yury.norov@gmail.com Fixes: 2cc7b6a44ac2 ("lib: add fast path for find_first_*_bit() and find_last_bit()") Signed-off-by: Yury Norov Reported-by: kernel test robot Tested-by: Wolfram Sang Cc: Alexander Lobakin Cc: Alexey Klimov Cc: Andy Shevchenko Cc: Dennis Zhou Cc: Jiri Olsa Cc: Ulf Hansson Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/asm-generic/bitops/find.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/asm-generic/bitops/find.h b/include/asm-generic/bitops/find.h index 0d132ee2a2913..835f959a25f25 100644 --- a/include/asm-generic/bitops/find.h +++ b/include/asm-generic/bitops/find.h @@ -97,6 +97,7 @@ unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, #ifdef CONFIG_GENERIC_FIND_FIRST_BIT +#ifndef find_first_bit /** * find_first_bit - find the first set bit in a memory region * @addr: The address to start the search at @@ -116,7 +117,9 @@ unsigned long find_first_bit(const unsigned long *addr, unsigned long size) return _find_first_bit(addr, size); } +#endif +#ifndef find_first_zero_bit /** * find_first_zero_bit - find the first cleared bit in a memory region * @addr: The address to start the search at @@ -136,6 +139,8 @@ unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) return _find_first_zero_bit(addr, size); } +#endif + #else /* CONFIG_GENERIC_FIND_FIRST_BIT */ #ifndef find_first_bit From a9755b5caf734f01f8c552f942c6864112f0ae78 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Tue, 24 Aug 2021 09:59:55 +1000 Subject: [PATCH 321/365] bitops: move find_bit_*_le functions from le.h to find.h It's convenient to have all find_bit declarations in one place. Link: https://lkml.kernel.org/r/20210814211713.180533-3-yury.norov@gmail.com Signed-off-by: Yury Norov Tested-by: Wolfram Sang Cc: Alexander Lobakin Cc: Alexey Klimov Cc: Andy Shevchenko Cc: Dennis Zhou Cc: Jiri Olsa Cc: Ulf Hansson Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/asm-generic/bitops/find.h | 69 +++++++++++++++++++++++++++++++ include/asm-generic/bitops/le.h | 64 ---------------------------- 2 files changed, 69 insertions(+), 64 deletions(-) diff --git a/include/asm-generic/bitops/find.h b/include/asm-generic/bitops/find.h index 835f959a25f25..91b1b23f2b0c6 100644 --- a/include/asm-generic/bitops/find.h +++ b/include/asm-generic/bitops/find.h @@ -190,4 +190,73 @@ extern unsigned long find_next_clump8(unsigned long *clump, #define find_first_clump8(clump, bits, size) \ find_next_clump8((clump), (bits), (size), 0) +#if defined(__LITTLE_ENDIAN) + +static inline unsigned long find_next_zero_bit_le(const void *addr, + unsigned long size, unsigned long offset) +{ + return find_next_zero_bit(addr, size, offset); +} + +static inline unsigned long find_next_bit_le(const void *addr, + unsigned long size, unsigned long offset) +{ + return find_next_bit(addr, size, offset); +} + +static inline unsigned long find_first_zero_bit_le(const void *addr, + unsigned long size) +{ + return find_first_zero_bit(addr, size); +} + +#elif defined(__BIG_ENDIAN) + +#ifndef find_next_zero_bit_le +static inline +unsigned long find_next_zero_bit_le(const void *addr, unsigned + long size, unsigned long offset) +{ + if (small_const_nbits(size)) { + unsigned long val = *(const unsigned long *)addr; + + if (unlikely(offset >= size)) + return size; + + val = swab(val) | ~GENMASK(size - 1, offset); + return val == ~0UL ? size : ffz(val); + } + + return _find_next_bit(addr, NULL, size, offset, ~0UL, 1); +} +#endif + +#ifndef find_next_bit_le +static inline +unsigned long find_next_bit_le(const void *addr, unsigned + long size, unsigned long offset) +{ + if (small_const_nbits(size)) { + unsigned long val = *(const unsigned long *)addr; + + if (unlikely(offset >= size)) + return size; + + val = swab(val) & GENMASK(size - 1, offset); + return val ? __ffs(val) : size; + } + + return _find_next_bit(addr, NULL, size, offset, 0UL, 1); +} +#endif + +#ifndef find_first_zero_bit_le +#define find_first_zero_bit_le(addr, size) \ + find_next_zero_bit_le((addr), (size), 0) +#endif + +#else +#error "Please fix " +#endif + #endif /*_ASM_GENERIC_BITOPS_FIND_H_ */ diff --git a/include/asm-generic/bitops/le.h b/include/asm-generic/bitops/le.h index 5a28629cbf4d4..d51beff603755 100644 --- a/include/asm-generic/bitops/le.h +++ b/include/asm-generic/bitops/le.h @@ -2,83 +2,19 @@ #ifndef _ASM_GENERIC_BITOPS_LE_H_ #define _ASM_GENERIC_BITOPS_LE_H_ -#include #include #include -#include #if defined(__LITTLE_ENDIAN) #define BITOP_LE_SWIZZLE 0 -static inline unsigned long find_next_zero_bit_le(const void *addr, - unsigned long size, unsigned long offset) -{ - return find_next_zero_bit(addr, size, offset); -} - -static inline unsigned long find_next_bit_le(const void *addr, - unsigned long size, unsigned long offset) -{ - return find_next_bit(addr, size, offset); -} - -static inline unsigned long find_first_zero_bit_le(const void *addr, - unsigned long size) -{ - return find_first_zero_bit(addr, size); -} - #elif defined(__BIG_ENDIAN) #define BITOP_LE_SWIZZLE ((BITS_PER_LONG-1) & ~0x7) -#ifndef find_next_zero_bit_le -static inline -unsigned long find_next_zero_bit_le(const void *addr, unsigned - long size, unsigned long offset) -{ - if (small_const_nbits(size)) { - unsigned long val = *(const unsigned long *)addr; - - if (unlikely(offset >= size)) - return size; - - val = swab(val) | ~GENMASK(size - 1, offset); - return val == ~0UL ? size : ffz(val); - } - - return _find_next_bit(addr, NULL, size, offset, ~0UL, 1); -} -#endif - -#ifndef find_next_bit_le -static inline -unsigned long find_next_bit_le(const void *addr, unsigned - long size, unsigned long offset) -{ - if (small_const_nbits(size)) { - unsigned long val = *(const unsigned long *)addr; - - if (unlikely(offset >= size)) - return size; - - val = swab(val) & GENMASK(size - 1, offset); - return val ? __ffs(val) : size; - } - - return _find_next_bit(addr, NULL, size, offset, 0UL, 1); -} #endif -#ifndef find_first_zero_bit_le -#define find_first_zero_bit_le(addr, size) \ - find_next_zero_bit_le((addr), (size), 0) -#endif - -#else -#error "Please fix " -#endif static inline int test_bit_le(int nr, const void *addr) { From 821cadbf540b7a70cde9b881ed0415ce5117fa42 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Tue, 24 Aug 2021 09:59:56 +1000 Subject: [PATCH 322/365] include: move find.h from asm_generic to linux find_bit API and bitmap API are closely related, but inclusion paths are different - include/asm-generic and include/linux, correspondingly. In the past it made a lot of troubles due to circular dependencies and/or undefined symbols. Fix this by moving find.h under include/linux. Link: https://lkml.kernel.org/r/20210814211713.180533-4-yury.norov@gmail.com Signed-off-by: Yury Norov Tested-by: Wolfram Sang Cc: Alexander Lobakin Cc: Alexey Klimov Cc: Andy Shevchenko Cc: Dennis Zhou Cc: Jiri Olsa Cc: Ulf Hansson Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- MAINTAINERS | 2 +- arch/alpha/include/asm/bitops.h | 2 -- arch/arc/include/asm/bitops.h | 1 - arch/arm/include/asm/bitops.h | 1 - arch/arm64/include/asm/bitops.h | 1 - arch/csky/include/asm/bitops.h | 1 - arch/h8300/include/asm/bitops.h | 1 - arch/hexagon/include/asm/bitops.h | 1 - arch/ia64/include/asm/bitops.h | 2 -- arch/m68k/include/asm/bitops.h | 2 -- arch/mips/include/asm/bitops.h | 1 - arch/openrisc/include/asm/bitops.h | 1 - arch/parisc/include/asm/bitops.h | 2 -- arch/powerpc/include/asm/bitops.h | 2 -- arch/riscv/include/asm/bitops.h | 1 - arch/s390/include/asm/bitops.h | 1 - arch/sh/include/asm/bitops.h | 1 - arch/sparc/include/asm/bitops_32.h | 1 - arch/sparc/include/asm/bitops_64.h | 2 -- arch/x86/include/asm/bitops.h | 2 -- arch/xtensa/include/asm/bitops.h | 1 - include/asm-generic/bitops.h | 1 - include/linux/bitmap.h | 1 + include/{asm-generic/bitops => linux}/find.h | 12 +++++++++--- 24 files changed, 11 insertions(+), 32 deletions(-) rename include/{asm-generic/bitops => linux}/find.h (97%) diff --git a/MAINTAINERS b/MAINTAINERS index ee6d3088e955b..2c2a611969863 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3262,8 +3262,8 @@ M: Yury Norov R: Andy Shevchenko R: Rasmus Villemoes S: Maintained -F: include/asm-generic/bitops/find.h F: include/linux/bitmap.h +F: include/linux/find.h F: lib/bitmap.c F: lib/find_bit.c F: lib/find_bit_benchmark.c diff --git a/arch/alpha/include/asm/bitops.h b/arch/alpha/include/asm/bitops.h index 5adca78830b5e..e1d8483a45f2e 100644 --- a/arch/alpha/include/asm/bitops.h +++ b/arch/alpha/include/asm/bitops.h @@ -430,8 +430,6 @@ static inline unsigned int __arch_hweight8(unsigned int w) #endif /* __KERNEL__ */ -#include - #ifdef __KERNEL__ /* diff --git a/arch/arc/include/asm/bitops.h b/arch/arc/include/asm/bitops.h index fb98440c0bd4c..0cc24ae6a8d99 100644 --- a/arch/arc/include/asm/bitops.h +++ b/arch/arc/include/asm/bitops.h @@ -369,7 +369,6 @@ static inline __attribute__ ((const)) unsigned long __ffs(unsigned long x) #include #include -#include #include #include diff --git a/arch/arm/include/asm/bitops.h b/arch/arm/include/asm/bitops.h index c92e42a5c8f75..8e94fe7ab5ebc 100644 --- a/arch/arm/include/asm/bitops.h +++ b/arch/arm/include/asm/bitops.h @@ -264,7 +264,6 @@ static inline int find_next_bit_le(const void *p, int size, int offset) #endif -#include #include /* diff --git a/arch/arm64/include/asm/bitops.h b/arch/arm64/include/asm/bitops.h index 81a3e519b07db..9b3c787132d22 100644 --- a/arch/arm64/include/asm/bitops.h +++ b/arch/arm64/include/asm/bitops.h @@ -18,7 +18,6 @@ #include #include -#include #include #include diff --git a/arch/csky/include/asm/bitops.h b/arch/csky/include/asm/bitops.h index 91818787d8609..9604f47bd8503 100644 --- a/arch/csky/include/asm/bitops.h +++ b/arch/csky/include/asm/bitops.h @@ -59,7 +59,6 @@ static __always_inline unsigned long __fls(unsigned long x) #include #include -#include #ifndef _LINUX_BITOPS_H #error only can be included directly diff --git a/arch/h8300/include/asm/bitops.h b/arch/h8300/include/asm/bitops.h index c867a80cab5b5..4489e3d6edd3d 100644 --- a/arch/h8300/include/asm/bitops.h +++ b/arch/h8300/include/asm/bitops.h @@ -168,7 +168,6 @@ static inline unsigned long __ffs(unsigned long word) return result; } -#include #include #include #include diff --git a/arch/hexagon/include/asm/bitops.h b/arch/hexagon/include/asm/bitops.h index 71429f756af0f..75d6ba3643b8d 100644 --- a/arch/hexagon/include/asm/bitops.h +++ b/arch/hexagon/include/asm/bitops.h @@ -271,7 +271,6 @@ static inline unsigned long __fls(unsigned long word) } #include -#include #include #include diff --git a/arch/ia64/include/asm/bitops.h b/arch/ia64/include/asm/bitops.h index 2f24ee6459d20..577be93c0818c 100644 --- a/arch/ia64/include/asm/bitops.h +++ b/arch/ia64/include/asm/bitops.h @@ -441,8 +441,6 @@ static __inline__ unsigned long __arch_hweight64(unsigned long x) #endif /* __KERNEL__ */ -#include - #ifdef __KERNEL__ #include diff --git a/arch/m68k/include/asm/bitops.h b/arch/m68k/include/asm/bitops.h index 7b414099e5fc2..f551a21602949 100644 --- a/arch/m68k/include/asm/bitops.h +++ b/arch/m68k/include/asm/bitops.h @@ -529,6 +529,4 @@ static inline int __fls(int x) #include #endif /* __KERNEL__ */ -#include - #endif /* _M68K_BITOPS_H */ diff --git a/arch/mips/include/asm/bitops.h b/arch/mips/include/asm/bitops.h index dc2a6234dd3c7..c09d57f907f7e 100644 --- a/arch/mips/include/asm/bitops.h +++ b/arch/mips/include/asm/bitops.h @@ -446,7 +446,6 @@ static inline int ffs(int word) } #include -#include #ifdef __KERNEL__ diff --git a/arch/openrisc/include/asm/bitops.h b/arch/openrisc/include/asm/bitops.h index 7f1ca35213d8c..d773ed938acb4 100644 --- a/arch/openrisc/include/asm/bitops.h +++ b/arch/openrisc/include/asm/bitops.h @@ -30,7 +30,6 @@ #include #include #include -#include #ifndef _LINUX_BITOPS_H #error only can be included directly diff --git a/arch/parisc/include/asm/bitops.h b/arch/parisc/include/asm/bitops.h index aa4e883431c1a..c7a9997ac9cbb 100644 --- a/arch/parisc/include/asm/bitops.h +++ b/arch/parisc/include/asm/bitops.h @@ -208,8 +208,6 @@ static __inline__ int fls(unsigned int x) #endif /* __KERNEL__ */ -#include - #ifdef __KERNEL__ #include diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h index 299ab33505a6c..ce2c1fa1a45db 100644 --- a/arch/powerpc/include/asm/bitops.h +++ b/arch/powerpc/include/asm/bitops.h @@ -255,8 +255,6 @@ unsigned long __arch_hweight64(__u64 w); #include #endif -#include - /* wrappers that deal with KASAN instrumentation */ #include #include diff --git a/arch/riscv/include/asm/bitops.h b/arch/riscv/include/asm/bitops.h index 396a3303c5374..3540b690944be 100644 --- a/arch/riscv/include/asm/bitops.h +++ b/arch/riscv/include/asm/bitops.h @@ -20,7 +20,6 @@ #include #include #include -#include #include #include diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h index fd149480b6e2b..f7cefdde7c24f 100644 --- a/arch/s390/include/asm/bitops.h +++ b/arch/s390/include/asm/bitops.h @@ -387,7 +387,6 @@ static inline int fls(unsigned int word) #endif /* CONFIG_HAVE_MARCH_Z9_109_FEATURES */ #include -#include #include #include #include diff --git a/arch/sh/include/asm/bitops.h b/arch/sh/include/asm/bitops.h index 3b6c7b5b7ec9f..10ceb0d6b5a99 100644 --- a/arch/sh/include/asm/bitops.h +++ b/arch/sh/include/asm/bitops.h @@ -68,6 +68,5 @@ static inline unsigned long __ffs(unsigned long word) #include #include -#include #endif /* __ASM_SH_BITOPS_H */ diff --git a/arch/sparc/include/asm/bitops_32.h b/arch/sparc/include/asm/bitops_32.h index 0ceff3b915a89..889afa9f990ff 100644 --- a/arch/sparc/include/asm/bitops_32.h +++ b/arch/sparc/include/asm/bitops_32.h @@ -100,7 +100,6 @@ static inline void change_bit(unsigned long nr, volatile unsigned long *addr) #include #include #include -#include #include #include diff --git a/arch/sparc/include/asm/bitops_64.h b/arch/sparc/include/asm/bitops_64.h index ca7ea5913494f..005a8ae858f16 100644 --- a/arch/sparc/include/asm/bitops_64.h +++ b/arch/sparc/include/asm/bitops_64.h @@ -52,8 +52,6 @@ unsigned int __arch_hweight8(unsigned int w); #include #endif /* __KERNEL__ */ -#include - #ifdef __KERNEL__ #include diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 0367efdc5b7a8..a288ecd230ab0 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -380,8 +380,6 @@ static __always_inline int fls64(__u64 x) #include #endif -#include - #include #include diff --git a/arch/xtensa/include/asm/bitops.h b/arch/xtensa/include/asm/bitops.h index 3f71d364ba909..cd225896c40f4 100644 --- a/arch/xtensa/include/asm/bitops.h +++ b/arch/xtensa/include/asm/bitops.h @@ -205,7 +205,6 @@ BIT_OPS(change, "xor", ) #undef BIT_OP #undef TEST_AND_BIT_OP -#include #include #include diff --git a/include/asm-generic/bitops.h b/include/asm-generic/bitops.h index df9b5bc3d2827..a47b8a71d6fe1 100644 --- a/include/asm-generic/bitops.h +++ b/include/asm-generic/bitops.h @@ -20,7 +20,6 @@ #include #include #include -#include #ifndef _LINUX_BITOPS_H #error only can be included directly diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index a36cfcec4e779..3f7c6731b203b 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -6,6 +6,7 @@ #include #include +#include #include #include #include diff --git a/include/asm-generic/bitops/find.h b/include/linux/find.h similarity index 97% rename from include/asm-generic/bitops/find.h rename to include/linux/find.h index 91b1b23f2b0c6..c5410c243e046 100644 --- a/include/asm-generic/bitops/find.h +++ b/include/linux/find.h @@ -1,6 +1,12 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_GENERIC_BITOPS_FIND_H_ -#define _ASM_GENERIC_BITOPS_FIND_H_ +#ifndef __LINUX_FIND_H_ +#define __LINUX_FIND_H_ + +#ifndef __LINUX_BITMAP_H +#error only can be included directly +#endif + +#include extern unsigned long _find_next_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, @@ -259,4 +265,4 @@ unsigned long find_next_bit_le(const void *addr, unsigned #error "Please fix " #endif -#endif /*_ASM_GENERIC_BITOPS_FIND_H_ */ +#endif /*__LINUX_FIND_H_ */ From dbbccfe505ef8fae804677abf0564c1e07479102 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Tue, 24 Aug 2021 09:59:56 +1000 Subject: [PATCH 323/365] arch: remove GENERIC_FIND_FIRST_BIT entirely In 5.12 cycle we enabled GENERIC_FIND_FIRST_BIT config option for ARM64 and MIPS. It increased performance and shrunk .text size; and so far I didn't receive any negative feedback on the change. https://lore.kernel.org/linux-arch/20210225135700.1381396-1-yury.norov@gmail.com/ Now I think it's a good time to switch all architectures to use find_{first,last}_bit() unconditionally, and so remove corresponding config option. The patch does't introduce functioal changes for arc, arm, arm64, mips, m68k, s390 and x86, for other architectures I expect improvement both in performance and .text size. Link: https://lkml.kernel.org/r/20210814211713.180533-5-yury.norov@gmail.com Signed-off-by: Yury Norov Tested-by: Alexander Lobakin (mips) Reviewed-by: Alexander Lobakin (mips) Reviewed-by: Andy Shevchenko Acked-by: Will Deacon Tested-by: Wolfram Sang Cc: Alexey Klimov Cc: Dennis Zhou Cc: Jiri Olsa Cc: Ulf Hansson Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/arc/Kconfig | 1 - arch/arm64/Kconfig | 1 - arch/mips/Kconfig | 1 - arch/s390/Kconfig | 1 - arch/x86/Kconfig | 1 - arch/x86/um/Kconfig | 1 - include/linux/find.h | 13 ------------- lib/Kconfig | 3 --- 8 files changed, 22 deletions(-) diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index b5bf68e747320..cf7842e2feafa 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -20,7 +20,6 @@ config ARC select COMMON_CLK select DMA_DIRECT_REMAP select GENERIC_ATOMIC64 if !ISA_ARCV2 || !(ARC_HAS_LL64 && ARC_HAS_LLSC) - select GENERIC_FIND_FIRST_BIT # for now, we don't need GENERIC_IRQ_PROBE, CONFIG_GENERIC_IRQ_CHIP select GENERIC_IRQ_SHOW select GENERIC_PCI_IOMAP diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index fdcd54d39c1ef..7b2269ad3e129 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -119,7 +119,6 @@ config ARM64 select GENERIC_CPU_AUTOPROBE select GENERIC_CPU_VULNERABILITIES select GENERIC_EARLY_IOREMAP - select GENERIC_FIND_FIRST_BIT select GENERIC_IDLE_POLL_SETUP select GENERIC_IRQ_IPI select GENERIC_IRQ_PROBE diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 6dfb27d531dd7..e9c0cada4f7ca 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -30,7 +30,6 @@ config MIPS select GENERIC_ATOMIC64 if !64BIT select GENERIC_CMOS_UPDATE select GENERIC_CPU_AUTOPROBE - select GENERIC_FIND_FIRST_BIT select GENERIC_GETTIMEOFDAY select GENERIC_IOMAP select GENERIC_IRQ_PROBE diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index a0e2130f0100c..e4368e2466b6f 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -126,7 +126,6 @@ config S390 select GENERIC_CPU_AUTOPROBE select GENERIC_CPU_VULNERABILITIES select GENERIC_ENTRY - select GENERIC_FIND_FIRST_BIT select GENERIC_GETTIMEOFDAY select GENERIC_PTDUMP select GENERIC_SMP_IDLE_THREAD diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 88fb922c23a0a..854c42f9f2f22 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -133,7 +133,6 @@ config X86 select GENERIC_CPU_VULNERABILITIES select GENERIC_EARLY_IOREMAP select GENERIC_ENTRY - select GENERIC_FIND_FIRST_BIT select GENERIC_IOMAP select GENERIC_IRQ_EFFECTIVE_AFF_MASK if SMP select GENERIC_IRQ_MATRIX_ALLOCATOR if X86_LOCAL_APIC diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig index 95d26a69088b1..40d6a06e41c81 100644 --- a/arch/x86/um/Kconfig +++ b/arch/x86/um/Kconfig @@ -8,7 +8,6 @@ endmenu config UML_X86 def_bool y - select GENERIC_FIND_FIRST_BIT config 64BIT bool "64-bit kernel" if "$(SUBARCH)" = "x86" diff --git a/include/linux/find.h b/include/linux/find.h index c5410c243e046..ea57f7f38c497 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -101,8 +101,6 @@ unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, } #endif -#ifdef CONFIG_GENERIC_FIND_FIRST_BIT - #ifndef find_first_bit /** * find_first_bit - find the first set bit in a memory region @@ -147,17 +145,6 @@ unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) } #endif -#else /* CONFIG_GENERIC_FIND_FIRST_BIT */ - -#ifndef find_first_bit -#define find_first_bit(addr, size) find_next_bit((addr), (size), 0) -#endif -#ifndef find_first_zero_bit -#define find_first_zero_bit(addr, size) find_next_zero_bit((addr), (size), 0) -#endif - -#endif /* CONFIG_GENERIC_FIND_FIRST_BIT */ - #ifndef find_last_bit /** * find_last_bit - find the last set bit in a memory region diff --git a/lib/Kconfig b/lib/Kconfig index 5c9c0687f76d1..f8874240835b8 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -59,9 +59,6 @@ config GENERIC_STRNLEN_USER config GENERIC_NET_UTILS bool -config GENERIC_FIND_FIRST_BIT - bool - source "lib/math/Kconfig" config NO_GENERIC_PCI_IOPORT_MAP From 6b6c111934a424e80ad7f8ef47a262f1a9e937fb Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Tue, 24 Aug 2021 09:59:56 +1000 Subject: [PATCH 324/365] lib: add find_first_and_bit() Currently find_first_and_bit() is an alias to find_next_and_bit(). However, it is widely used in cpumask, so it worth to optimize it. This patch adds its own implementation for find_first_and_bit(). On x86_64 find_bit_benchmark says: Before (#define find_first_and_bit(...) find_next_and_bit(..., 0): Start testing find_bit() with random-filled bitmap [ 140.291468] find_first_and_bit: 46890919 ns, 32671 iterations Start testing find_bit() with sparse bitmap [ 140.295028] find_first_and_bit: 7103 ns, 1 iterations After: Start testing find_bit() with random-filled bitmap [ 162.574907] find_first_and_bit: 25045813 ns, 32846 iterations Start testing find_bit() with sparse bitmap [ 162.578458] find_first_and_bit: 4900 ns, 1 iterations (Thanks to Alexey Klimov for thorough testing.) Link: https://lkml.kernel.org/r/20210814211713.180533-6-yury.norov@gmail.com Signed-off-by: Yury Norov Tested-by: Wolfram Sang Tested-by: Alexey Klimov Cc: Alexander Lobakin Cc: Andy Shevchenko Cc: Dennis Zhou Cc: Jiri Olsa Cc: Ulf Hansson Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/find.h | 27 +++++++++++++++++++++++++++ lib/find_bit.c | 21 +++++++++++++++++++++ lib/find_bit_benchmark.c | 21 +++++++++++++++++++++ 3 files changed, 69 insertions(+) diff --git a/include/linux/find.h b/include/linux/find.h index ea57f7f38c497..6048f8c974185 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -12,6 +12,8 @@ extern unsigned long _find_next_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start, unsigned long invert, unsigned long le); extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size); +extern unsigned long _find_first_and_bit(const unsigned long *addr1, + const unsigned long *addr2, unsigned long size); extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size); extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size); @@ -123,6 +125,31 @@ unsigned long find_first_bit(const unsigned long *addr, unsigned long size) } #endif +#ifndef find_first_and_bit +/** + * find_first_and_bit - find the first set bit in both memory regions + * @addr1: The first address to base the search on + * @addr2: The second address to base the search on + * @size: The bitmap size in bits + * + * Returns the bit number for the next set bit + * If no bits are set, returns @size. + */ +static inline +unsigned long find_first_and_bit(const unsigned long *addr1, + const unsigned long *addr2, + unsigned long size) +{ + if (small_const_nbits(size)) { + unsigned long val = *addr1 & *addr2 & GENMASK(size - 1, 0); + + return val ? __ffs(val) : size; + } + + return _find_first_and_bit(addr1, addr2, size); +} +#endif + #ifndef find_first_zero_bit /** * find_first_zero_bit - find the first cleared bit in a memory region diff --git a/lib/find_bit.c b/lib/find_bit.c index 0f8e2e369b1d2..1b8e4b2a9cba8 100644 --- a/lib/find_bit.c +++ b/lib/find_bit.c @@ -89,6 +89,27 @@ unsigned long _find_first_bit(const unsigned long *addr, unsigned long size) EXPORT_SYMBOL(_find_first_bit); #endif +#ifndef find_first_and_bit +/* + * Find the first set bit in two memory regions. + */ +unsigned long _find_first_and_bit(const unsigned long *addr1, + const unsigned long *addr2, + unsigned long size) +{ + unsigned long idx, val; + + for (idx = 0; idx * BITS_PER_LONG < size; idx++) { + val = addr1[idx] & addr2[idx]; + if (val) + return min(idx * BITS_PER_LONG + __ffs(val), size); + } + + return size; +} +EXPORT_SYMBOL(_find_first_and_bit); +#endif + #ifndef find_first_zero_bit /* * Find the first cleared bit in a memory region. diff --git a/lib/find_bit_benchmark.c b/lib/find_bit_benchmark.c index 5637c5711db94..db904b57d4b87 100644 --- a/lib/find_bit_benchmark.c +++ b/lib/find_bit_benchmark.c @@ -49,6 +49,25 @@ static int __init test_find_first_bit(void *bitmap, unsigned long len) return 0; } +static int __init test_find_first_and_bit(void *bitmap, const void *bitmap2, unsigned long len) +{ + static DECLARE_BITMAP(cp, BITMAP_LEN) __initdata; + unsigned long i, cnt; + ktime_t time; + + bitmap_copy(cp, bitmap, BITMAP_LEN); + + time = ktime_get(); + for (cnt = i = 0; i < len; cnt++) { + i = find_first_and_bit(cp, bitmap2, len); + __clear_bit(i, cp); + } + time = ktime_get() - time; + pr_err("find_first_and_bit: %18llu ns, %6ld iterations\n", time, cnt); + + return 0; +} + static int __init test_find_next_bit(const void *bitmap, unsigned long len) { unsigned long i, cnt; @@ -129,6 +148,7 @@ static int __init find_bit_test(void) * traverse only part of bitmap to avoid soft lockup. */ test_find_first_bit(bitmap, BITMAP_LEN / 10); + test_find_first_and_bit(bitmap, bitmap2, BITMAP_LEN / 2); test_find_next_and_bit(bitmap, bitmap2, BITMAP_LEN); pr_err("\nStart testing find_bit() with sparse bitmap\n"); @@ -145,6 +165,7 @@ static int __init find_bit_test(void) test_find_next_zero_bit(bitmap, BITMAP_LEN); test_find_last_bit(bitmap, BITMAP_LEN); test_find_first_bit(bitmap, BITMAP_LEN); + test_find_first_and_bit(bitmap, bitmap2, BITMAP_LEN); test_find_next_and_bit(bitmap, bitmap2, BITMAP_LEN); /* From edc3a6eea1c7ace8d44b9c3f554e3c334192291c Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Tue, 24 Aug 2021 09:59:56 +1000 Subject: [PATCH 325/365] cpumask: use find_first_and_bit() Now we have an efficient implementation for find_first_and_bit(), so switch cpumask to use it where appropriate. Link: https://lkml.kernel.org/r/20210814211713.180533-7-yury.norov@gmail.com Signed-off-by: Yury Norov Tested-by: Wolfram Sang Cc: Alexander Lobakin Cc: Alexey Klimov Cc: Andy Shevchenko Cc: Dennis Zhou Cc: Jiri Olsa Cc: Ulf Hansson Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/cpumask.h | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index f3689a52bfd05..4a03f6636e6ce 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -123,6 +123,12 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp) return 0; } +static inline unsigned int cpumask_first_and(const struct cpumask *srcp1, + const struct cpumask *srcp2) +{ + return 0; +} + static inline unsigned int cpumask_last(const struct cpumask *srcp) { return 0; @@ -167,7 +173,7 @@ static inline unsigned int cpumask_local_spread(unsigned int i, int node) static inline int cpumask_any_and_distribute(const struct cpumask *src1p, const struct cpumask *src2p) { - return cpumask_next_and(-1, src1p, src2p); + return cpumask_first_and(src1p, src2p); } static inline int cpumask_any_distribute(const struct cpumask *srcp) @@ -195,6 +201,19 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp) return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits); } +/** + * cpumask_first_and - return the first cpu from *srcp1 & *srcp2 + * @src1p: the first input + * @src2p: the second input + * + * Returns >= nr_cpu_ids if no cpus set in both. See also cpumask_next_and(). + */ +static inline +unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask *srcp2) +{ + return find_first_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), nr_cpumask_bits); +} + /** * cpumask_last - get the last CPU in a cpumask * @srcp: - the cpumask pointer @@ -585,15 +604,6 @@ static inline void cpumask_copy(struct cpumask *dstp, */ #define cpumask_any(srcp) cpumask_first(srcp) -/** - * cpumask_first_and - return the first cpu from *srcp1 & *srcp2 - * @src1p: the first input - * @src2p: the second input - * - * Returns >= nr_cpu_ids if no cpus set in both. See also cpumask_next_and(). - */ -#define cpumask_first_and(src1p, src2p) cpumask_next_and(-1, (src1p), (src2p)) - /** * cpumask_any_and - pick a "random" cpu from *mask1 & *mask2 * @mask1: the first input cpumask From d49b694c3ef9d700e860b9854d8d5addae50651a Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Tue, 24 Aug 2021 09:59:56 +1000 Subject: [PATCH 326/365] all: replace find_next{,_zero}_bit with find_first{,_zero}_bit where appropriate find_first{,_zero}_bit is a more effective analogue of 'next' version if start == 0. This patch replaces 'next' with 'first' where things look trivial. Link: https://lkml.kernel.org/r/20210814211713.180533-8-yury.norov@gmail.com Signed-off-by: Yury Norov Tested-by: Wolfram Sang Cc: Alexander Lobakin Cc: Alexey Klimov Cc: Andy Shevchenko Cc: Dennis Zhou Cc: Jiri Olsa Cc: Ulf Hansson Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/powerpc/platforms/pasemi/dma_lib.c | 4 ++-- arch/s390/kvm/kvm-s390.c | 2 +- drivers/block/rnbd/rnbd-clt.c | 2 +- drivers/dma/ti/edma.c | 2 +- drivers/iio/adc/ad7124.c | 2 +- drivers/infiniband/hw/irdma/hw.c | 16 ++++++++-------- drivers/media/cec/core/cec-core.c | 2 +- drivers/media/mc/mc-devnode.c | 2 +- drivers/pci/controller/dwc/pci-dra7xx.c | 2 +- drivers/scsi/lpfc/lpfc_sli.c | 10 +++++----- drivers/soc/ti/k3-ringacc.c | 4 ++-- drivers/tty/n_tty.c | 2 +- drivers/virt/acrn/ioreq.c | 3 +-- fs/f2fs/segment.c | 8 ++++---- fs/ocfs2/cluster/heartbeat.c | 2 +- fs/ocfs2/dlm/dlmdomain.c | 4 ++-- fs/ocfs2/dlm/dlmmaster.c | 18 +++++++++--------- fs/ocfs2/dlm/dlmrecovery.c | 2 +- fs/ocfs2/dlm/dlmthread.c | 2 +- lib/genalloc.c | 2 +- net/ncsi/ncsi-manage.c | 4 ++-- 21 files changed, 47 insertions(+), 48 deletions(-) diff --git a/arch/powerpc/platforms/pasemi/dma_lib.c b/arch/powerpc/platforms/pasemi/dma_lib.c index 270fa3c0d3724..26427311fc725 100644 --- a/arch/powerpc/platforms/pasemi/dma_lib.c +++ b/arch/powerpc/platforms/pasemi/dma_lib.c @@ -375,7 +375,7 @@ int pasemi_dma_alloc_flag(void) int bit; retry: - bit = find_next_bit(flags_free, MAX_FLAGS, 0); + bit = find_first_bit(flags_free, MAX_FLAGS); if (bit >= MAX_FLAGS) return -ENOSPC; if (!test_and_clear_bit(bit, flags_free)) @@ -440,7 +440,7 @@ int pasemi_dma_alloc_fun(void) int bit; retry: - bit = find_next_bit(fun_free, MAX_FLAGS, 0); + bit = find_first_bit(fun_free, MAX_FLAGS); if (bit >= MAX_FLAGS) return -ENOSPC; if (!test_and_clear_bit(bit, fun_free)) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 4527ac7b5961d..869942ff1c821 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2023,7 +2023,7 @@ static unsigned long kvm_s390_next_dirty_cmma(struct kvm_memslots *slots, while ((slotidx > 0) && (ofs >= ms->npages)) { slotidx--; ms = slots->memslots + slotidx; - ofs = find_next_bit(kvm_second_dirty_bitmap(ms), ms->npages, 0); + ofs = find_first_bit(kvm_second_dirty_bitmap(ms), ms->npages); } return ms->base_gfn + ofs; } diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index e9cc413495f0c..90797f3956e37 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -196,7 +196,7 @@ rnbd_get_cpu_qlist(struct rnbd_clt_session *sess, int cpu) return per_cpu_ptr(sess->cpu_queues, bit); } else if (cpu != 0) { /* Search from 0 to cpu */ - bit = find_next_bit(sess->cpu_queues_bm, cpu, 0); + bit = find_first_bit(sess->cpu_queues_bm, cpu); if (bit < cpu) return per_cpu_ptr(sess->cpu_queues, bit); } diff --git a/drivers/dma/ti/edma.c b/drivers/dma/ti/edma.c index 35d81bd857f11..caa4050ecc021 100644 --- a/drivers/dma/ti/edma.c +++ b/drivers/dma/ti/edma.c @@ -1681,7 +1681,7 @@ static irqreturn_t dma_ccerr_handler(int irq, void *data) dev_dbg(ecc->dev, "EMR%d 0x%08x\n", j, val); emr = val; - for (i = find_next_bit(&emr, 32, 0); i < 32; + for (i = find_first_bit(&emr, 32); i < 32; i = find_next_bit(&emr, 32, i + 1)) { int k = (j << 5) + i; diff --git a/drivers/iio/adc/ad7124.c b/drivers/iio/adc/ad7124.c index e45c600fccc0b..bc2cfa5f95922 100644 --- a/drivers/iio/adc/ad7124.c +++ b/drivers/iio/adc/ad7124.c @@ -347,7 +347,7 @@ static int ad7124_find_free_config_slot(struct ad7124_state *st) { unsigned int free_cfg_slot; - free_cfg_slot = find_next_zero_bit(&st->cfg_slots_status, AD7124_MAX_CONFIGS, 0); + free_cfg_slot = find_first_zero_bit(&st->cfg_slots_status, AD7124_MAX_CONFIGS); if (free_cfg_slot == AD7124_MAX_CONFIGS) return -1; diff --git a/drivers/infiniband/hw/irdma/hw.c b/drivers/infiniband/hw/irdma/hw.c index 00de5ee9a2609..a003893a431ed 100644 --- a/drivers/infiniband/hw/irdma/hw.c +++ b/drivers/infiniband/hw/irdma/hw.c @@ -1696,14 +1696,14 @@ static enum irdma_status_code irdma_setup_init_state(struct irdma_pci_f *rf) */ static void irdma_get_used_rsrc(struct irdma_device *iwdev) { - iwdev->rf->used_pds = find_next_zero_bit(iwdev->rf->allocated_pds, - iwdev->rf->max_pd, 0); - iwdev->rf->used_qps = find_next_zero_bit(iwdev->rf->allocated_qps, - iwdev->rf->max_qp, 0); - iwdev->rf->used_cqs = find_next_zero_bit(iwdev->rf->allocated_cqs, - iwdev->rf->max_cq, 0); - iwdev->rf->used_mrs = find_next_zero_bit(iwdev->rf->allocated_mrs, - iwdev->rf->max_mr, 0); + iwdev->rf->used_pds = find_first_zero_bit(iwdev->rf->allocated_pds, + iwdev->rf->max_pd); + iwdev->rf->used_qps = find_first_zero_bit(iwdev->rf->allocated_qps, + iwdev->rf->max_qp); + iwdev->rf->used_cqs = find_first_zero_bit(iwdev->rf->allocated_cqs, + iwdev->rf->max_cq); + iwdev->rf->used_mrs = find_first_zero_bit(iwdev->rf->allocated_mrs, + iwdev->rf->max_mr); } void irdma_ctrl_deinit_hw(struct irdma_pci_f *rf) diff --git a/drivers/media/cec/core/cec-core.c b/drivers/media/cec/core/cec-core.c index 551689d371a71..7322e7cd9753e 100644 --- a/drivers/media/cec/core/cec-core.c +++ b/drivers/media/cec/core/cec-core.c @@ -106,7 +106,7 @@ static int __must_check cec_devnode_register(struct cec_devnode *devnode, /* Part 1: Find a free minor number */ mutex_lock(&cec_devnode_lock); - minor = find_next_zero_bit(cec_devnode_nums, CEC_NUM_DEVICES, 0); + minor = find_first_zero_bit(cec_devnode_nums, CEC_NUM_DEVICES); if (minor == CEC_NUM_DEVICES) { mutex_unlock(&cec_devnode_lock); pr_err("could not get a free minor\n"); diff --git a/drivers/media/mc/mc-devnode.c b/drivers/media/mc/mc-devnode.c index f11382afe23bf..680fbb3a93402 100644 --- a/drivers/media/mc/mc-devnode.c +++ b/drivers/media/mc/mc-devnode.c @@ -217,7 +217,7 @@ int __must_check media_devnode_register(struct media_device *mdev, /* Part 1: Find a free minor number */ mutex_lock(&media_devnode_lock); - minor = find_next_zero_bit(media_devnode_nums, MEDIA_NUM_DEVICES, 0); + minor = find_first_zero_bit(media_devnode_nums, MEDIA_NUM_DEVICES); if (minor == MEDIA_NUM_DEVICES) { mutex_unlock(&media_devnode_lock); pr_err("could not get a free minor\n"); diff --git a/drivers/pci/controller/dwc/pci-dra7xx.c b/drivers/pci/controller/dwc/pci-dra7xx.c index 047cfbdc13303..8d7ec6448d5fc 100644 --- a/drivers/pci/controller/dwc/pci-dra7xx.c +++ b/drivers/pci/controller/dwc/pci-dra7xx.c @@ -211,7 +211,7 @@ static int dra7xx_pcie_handle_msi(struct pcie_port *pp, int index) if (!val) return 0; - pos = find_next_bit(&val, MAX_MSI_IRQS_PER_CTRL, 0); + pos = find_first_bit(&val, MAX_MSI_IRQS_PER_CTRL); while (pos != MAX_MSI_IRQS_PER_CTRL) { irq = irq_find_mapping(pp->irq_domain, (index * MAX_MSI_IRQS_PER_CTRL) + pos); diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index f530d8fe7a8ce..f8f65b4cb44c9 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -17250,8 +17250,8 @@ lpfc_sli4_alloc_xri(struct lpfc_hba *phba) * the driver starts at 0 each time. */ spin_lock_irq(&phba->hbalock); - xri = find_next_zero_bit(phba->sli4_hba.xri_bmask, - phba->sli4_hba.max_cfg_param.max_xri, 0); + xri = find_first_zero_bit(phba->sli4_hba.xri_bmask, + phba->sli4_hba.max_cfg_param.max_xri); if (xri >= phba->sli4_hba.max_cfg_param.max_xri) { spin_unlock_irq(&phba->hbalock); return NO_XRI; @@ -18928,7 +18928,7 @@ lpfc_sli4_alloc_rpi(struct lpfc_hba *phba) max_rpi = phba->sli4_hba.max_cfg_param.max_rpi; rpi_limit = phba->sli4_hba.next_rpi; - rpi = find_next_zero_bit(phba->sli4_hba.rpi_bmask, rpi_limit, 0); + rpi = find_first_zero_bit(phba->sli4_hba.rpi_bmask, rpi_limit); if (rpi >= rpi_limit) rpi = LPFC_RPI_ALLOC_ERROR; else { @@ -19571,8 +19571,8 @@ lpfc_sli4_fcf_rr_next_index_get(struct lpfc_hba *phba) * have been tested so that we can detect when we should * change the priority level. */ - next_fcf_index = find_next_bit(phba->fcf.fcf_rr_bmask, - LPFC_SLI4_FCF_TBL_INDX_MAX, 0); + next_fcf_index = find_first_bit(phba->fcf.fcf_rr_bmask, + LPFC_SLI4_FCF_TBL_INDX_MAX); } diff --git a/drivers/soc/ti/k3-ringacc.c b/drivers/soc/ti/k3-ringacc.c index 312ba0f98ad79..573be88f81910 100644 --- a/drivers/soc/ti/k3-ringacc.c +++ b/drivers/soc/ti/k3-ringacc.c @@ -358,8 +358,8 @@ struct k3_ring *k3_ringacc_request_ring(struct k3_ringacc *ringacc, goto out; if (flags & K3_RINGACC_RING_USE_PROXY) { - proxy_id = find_next_zero_bit(ringacc->proxy_inuse, - ringacc->num_proxies, 0); + proxy_id = find_first_zero_bit(ringacc->proxy_inuse, + ringacc->num_proxies); if (proxy_id == ringacc->num_proxies) goto error; } diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c index 0ec93f1a61f5d..0965793dfe4f5 100644 --- a/drivers/tty/n_tty.c +++ b/drivers/tty/n_tty.c @@ -1975,7 +1975,7 @@ static bool canon_copy_from_read_buf(struct tty_struct *tty, more = n - (size - tail); if (eol == N_TTY_BUF_SIZE && more) { /* scan wrapped without finding set bit */ - eol = find_next_bit(ldata->read_flags, more, 0); + eol = find_first_bit(ldata->read_flags, more); found = eol != more; } else found = eol != size; diff --git a/drivers/virt/acrn/ioreq.c b/drivers/virt/acrn/ioreq.c index 80b2e3f0e2764..5ff1c53740c00 100644 --- a/drivers/virt/acrn/ioreq.c +++ b/drivers/virt/acrn/ioreq.c @@ -246,8 +246,7 @@ void acrn_ioreq_request_clear(struct acrn_vm *vm) spin_lock_bh(&vm->ioreq_clients_lock); client = vm->default_client; if (client) { - vcpu = find_next_bit(client->ioreqs_map, - ACRN_IO_REQUEST_MAX, 0); + vcpu = find_first_bit(client->ioreqs_map, ACRN_IO_REQUEST_MAX); while (vcpu < ACRN_IO_REQUEST_MAX) { acrn_ioreq_complete_request(client, vcpu, NULL); vcpu = find_next_bit(client->ioreqs_map, diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 15cc89eef28d6..f8dcd16f6b1f4 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2495,8 +2495,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi, secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint); if (secno >= MAIN_SECS(sbi)) { if (dir == ALLOC_RIGHT) { - secno = find_next_zero_bit(free_i->free_secmap, - MAIN_SECS(sbi), 0); + secno = find_first_zero_bit(free_i->free_secmap, + MAIN_SECS(sbi)); f2fs_bug_on(sbi, secno >= MAIN_SECS(sbi)); } else { go_left = 1; @@ -2511,8 +2511,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi, left_start--; continue; } - left_start = find_next_zero_bit(free_i->free_secmap, - MAIN_SECS(sbi), 0); + left_start = find_first_zero_bit(free_i->free_secmap, + MAIN_SECS(sbi)); f2fs_bug_on(sbi, left_start >= MAIN_SECS(sbi)); break; } diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index f89ffcbd585ff..a17be1618bf70 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -379,7 +379,7 @@ static void o2hb_nego_timeout(struct work_struct *work) o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap)); /* lowest node as master node to make negotiate decision. */ - master_node = find_next_bit(live_node_bitmap, O2NM_MAX_NODES, 0); + master_node = find_first_bit(live_node_bitmap, O2NM_MAX_NODES); if (master_node == o2nm_this_node()) { if (!test_bit(master_node, reg->hr_nego_node_bitmap)) { diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 9f90fc9551e1a..c4eccd499db8a 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1045,7 +1045,7 @@ static int dlm_send_regions(struct dlm_ctxt *dlm, unsigned long *node_map) int status, ret = 0, i; char *p; - if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES) + if (find_first_bit(node_map, O2NM_MAX_NODES) >= O2NM_MAX_NODES) goto bail; qr = kzalloc(sizeof(struct dlm_query_region), GFP_KERNEL); @@ -1217,7 +1217,7 @@ static int dlm_send_nodeinfo(struct dlm_ctxt *dlm, unsigned long *node_map) struct o2nm_node *node; int ret = 0, status, count, i; - if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES) + if (find_first_bit(node_map, O2NM_MAX_NODES) >= O2NM_MAX_NODES) goto bail; qn = kzalloc(sizeof(struct dlm_query_nodeinfo), GFP_KERNEL); diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 9b88219febb52..227da5b1b6ab1 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -861,7 +861,7 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm, * to see if there are any nodes that still need to be * considered. these will not appear in the mle nodemap * but they might own this lockres. wait on them. */ - bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); + bit = find_first_bit(dlm->recovery_map, O2NM_MAX_NODES); if (bit < O2NM_MAX_NODES) { mlog(0, "%s: res %.*s, At least one node (%d) " "to recover before lock mastery can begin\n", @@ -912,7 +912,7 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm, dlm_wait_for_recovery(dlm); spin_lock(&dlm->spinlock); - bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); + bit = find_first_bit(dlm->recovery_map, O2NM_MAX_NODES); if (bit < O2NM_MAX_NODES) { mlog(0, "%s: res %.*s, At least one node (%d) " "to recover before lock mastery can begin\n", @@ -1079,7 +1079,7 @@ static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm, sleep = 1; /* have all nodes responded? */ if (voting_done && !*blocked) { - bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); + bit = find_first_bit(mle->maybe_map, O2NM_MAX_NODES); if (dlm->node_num <= bit) { /* my node number is lowest. * now tell other nodes that I am @@ -1234,8 +1234,8 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, } else { mlog(ML_ERROR, "node down! %d\n", node); if (blocked) { - int lowest = find_next_bit(mle->maybe_map, - O2NM_MAX_NODES, 0); + int lowest = find_first_bit(mle->maybe_map, + O2NM_MAX_NODES); /* act like it was never there */ clear_bit(node, mle->maybe_map); @@ -1795,7 +1795,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data, "MLE for it! (%.*s)\n", assert->node_idx, namelen, name); } else { - int bit = find_next_bit (mle->maybe_map, O2NM_MAX_NODES, 0); + int bit = find_first_bit(mle->maybe_map, O2NM_MAX_NODES); if (bit >= O2NM_MAX_NODES) { /* not necessarily an error, though less likely. * could be master just re-asserting. */ @@ -2521,7 +2521,7 @@ static int dlm_is_lockres_migratable(struct dlm_ctxt *dlm, } if (!nonlocal) { - node_ref = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); + node_ref = find_first_bit(res->refmap, O2NM_MAX_NODES); if (node_ref >= O2NM_MAX_NODES) return 0; } @@ -3303,7 +3303,7 @@ static void dlm_clean_block_mle(struct dlm_ctxt *dlm, BUG_ON(mle->type != DLM_MLE_BLOCK); spin_lock(&mle->spinlock); - bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); + bit = find_first_bit(mle->maybe_map, O2NM_MAX_NODES); if (bit != dead_node) { mlog(0, "mle found, but dead node %u would not have been " "master\n", dead_node); @@ -3542,7 +3542,7 @@ void dlm_force_free_mles(struct dlm_ctxt *dlm) spin_lock(&dlm->master_lock); BUG_ON(dlm->dlm_state != DLM_CTXT_LEAVING); - BUG_ON((find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 0) < O2NM_MAX_NODES)); + BUG_ON((find_first_bit(dlm->domain_map, O2NM_MAX_NODES) < O2NM_MAX_NODES)); for (i = 0; i < DLM_HASH_BUCKETS; i++) { bucket = dlm_master_hash(dlm, i); diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 0e7aad1b11cc4..e24e327524f85 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -451,7 +451,7 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { int bit; - bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES, 0); + bit = find_first_bit(dlm->recovery_map, O2NM_MAX_NODES); if (bit >= O2NM_MAX_NODES || bit < 0) dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM); else diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index c350bd4df7701..eedf07ca23ca4 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -92,7 +92,7 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res) return 0; /* Another node has this resource with this node as the master */ - bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); + bit = find_first_bit(res->refmap, O2NM_MAX_NODES); if (bit < O2NM_MAX_NODES) return 0; diff --git a/lib/genalloc.c b/lib/genalloc.c index 9a57257988c77..00fc50d0a640e 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -251,7 +251,7 @@ void gen_pool_destroy(struct gen_pool *pool) list_del(&chunk->next_chunk); end_bit = chunk_size(chunk) >> order; - bit = find_next_bit(chunk->bits, end_bit, 0); + bit = find_first_bit(chunk->bits, end_bit); BUG_ON(bit < end_bit); vfree(chunk); diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index 89c7742cd72e2..a554837d86ddb 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -608,7 +608,7 @@ static int clear_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc, bitmap = &ncf->bitmap; spin_lock_irqsave(&nc->lock, flags); - index = find_next_bit(bitmap, ncf->n_vids, 0); + index = find_first_bit(bitmap, ncf->n_vids); if (index >= ncf->n_vids) { spin_unlock_irqrestore(&nc->lock, flags); return -1; @@ -667,7 +667,7 @@ static int set_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc, return -1; } - index = find_next_zero_bit(bitmap, ncf->n_vids, 0); + index = find_first_zero_bit(bitmap, ncf->n_vids); if (index < 0 || index >= ncf->n_vids) { netdev_err(ndp->ndev.dev, "Channel %u already has all VLAN filters set\n", From 0803a22a559adfbb78289b297a88d34db64a173d Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Tue, 24 Aug 2021 09:59:57 +1000 Subject: [PATCH 327/365] tools: sync tools/bitmap with mother linux Remove tools/include/asm-generic/bitops/find.h and copy include/linux/bitmap.h to tools. find_*_le() functions are not copied because not needed in tools. Link: https://lkml.kernel.org/r/20210814211713.180533-9-yury.norov@gmail.com Signed-off-by: Yury Norov Tested-by: Wolfram Sang Cc: Alexander Lobakin Cc: Alexey Klimov Cc: Andy Shevchenko Cc: Dennis Zhou Cc: Jiri Olsa Cc: Ulf Hansson Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- MAINTAINERS | 2 +- tools/include/asm-generic/bitops.h | 1 - tools/include/linux/bitmap.h | 7 +- .../{asm-generic/bitops => linux}/find.h | 81 +++++++++++++++++-- tools/lib/find_bit.c | 20 +++++ 5 files changed, 100 insertions(+), 11 deletions(-) rename tools/include/{asm-generic/bitops => linux}/find.h (63%) diff --git a/MAINTAINERS b/MAINTAINERS index 2c2a611969863..00ff6015b0b1c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3268,8 +3268,8 @@ F: lib/bitmap.c F: lib/find_bit.c F: lib/find_bit_benchmark.c F: lib/test_bitmap.c -F: tools/include/asm-generic/bitops/find.h F: tools/include/linux/bitmap.h +F: tools/include/linux/find.h F: tools/lib/bitmap.c F: tools/lib/find_bit.c diff --git a/tools/include/asm-generic/bitops.h b/tools/include/asm-generic/bitops.h index 5d2ab38965cca..9ab313e93555a 100644 --- a/tools/include/asm-generic/bitops.h +++ b/tools/include/asm-generic/bitops.h @@ -18,7 +18,6 @@ #include #include #include -#include #ifndef _TOOLS_LINUX_BITOPS_H_ #error only can be included directly diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h index 9d959bc248592..13d90b574970d 100644 --- a/tools/include/linux/bitmap.h +++ b/tools/include/linux/bitmap.h @@ -1,9 +1,10 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _PERF_BITOPS_H -#define _PERF_BITOPS_H +#ifndef _TOOLS_LINUX_BITMAP_H +#define _TOOLS_LINUX_BITMAP_H #include #include +#include #include #include @@ -181,4 +182,4 @@ static inline int bitmap_intersects(const unsigned long *src1, return __bitmap_intersects(src1, src2, nbits); } -#endif /* _PERF_BITOPS_H */ +#endif /* _TOOLS_LINUX_BITMAP_H */ diff --git a/tools/include/asm-generic/bitops/find.h b/tools/include/linux/find.h similarity index 63% rename from tools/include/asm-generic/bitops/find.h rename to tools/include/linux/find.h index 6481fd11012a9..47e2bd6c51745 100644 --- a/tools/include/asm-generic/bitops/find.h +++ b/tools/include/linux/find.h @@ -1,11 +1,19 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _TOOLS_LINUX_ASM_GENERIC_BITOPS_FIND_H_ -#define _TOOLS_LINUX_ASM_GENERIC_BITOPS_FIND_H_ +#ifndef _TOOLS_LINUX_FIND_H_ +#define _TOOLS_LINUX_FIND_H_ + +#ifndef _TOOLS_LINUX_BITMAP_H +#error tools: only can be included directly +#endif + +#include extern unsigned long _find_next_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start, unsigned long invert, unsigned long le); extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size); +extern unsigned long _find_first_and_bit(const unsigned long *addr1, + const unsigned long *addr2, unsigned long size); extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size); extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size); @@ -96,7 +104,6 @@ unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, #endif #ifndef find_first_bit - /** * find_first_bit - find the first set bit in a memory region * @addr: The address to start the search at @@ -116,11 +123,34 @@ unsigned long find_first_bit(const unsigned long *addr, unsigned long size) return _find_first_bit(addr, size); } +#endif + +#ifndef find_first_and_bit +/** + * find_first_and_bit - find the first set bit in both memory regions + * @addr1: The first address to base the search on + * @addr2: The second address to base the search on + * @size: The bitmap size in bits + * + * Returns the bit number for the next set bit + * If no bits are set, returns @size. + */ +static inline +unsigned long find_first_and_bit(const unsigned long *addr1, + const unsigned long *addr2, + unsigned long size) +{ + if (small_const_nbits(size)) { + unsigned long val = *addr1 & *addr2 & GENMASK(size - 1, 0); -#endif /* find_first_bit */ + return val ? __ffs(val) : size; + } -#ifndef find_first_zero_bit + return _find_first_and_bit(addr1, addr2, size); +} +#endif +#ifndef find_first_zero_bit /** * find_first_zero_bit - find the first cleared bit in a memory region * @addr: The address to start the search at @@ -142,4 +172,43 @@ unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) } #endif -#endif /*_TOOLS_LINUX_ASM_GENERIC_BITOPS_FIND_H_ */ +#ifndef find_last_bit +/** + * find_last_bit - find the last set bit in a memory region + * @addr: The address to start the search at + * @size: The number of bits to search + * + * Returns the bit number of the last set bit, or size. + */ +static inline +unsigned long find_last_bit(const unsigned long *addr, unsigned long size) +{ + if (small_const_nbits(size)) { + unsigned long val = *addr & GENMASK(size - 1, 0); + + return val ? __fls(val) : size; + } + + return _find_last_bit(addr, size); +} +#endif + +/** + * find_next_clump8 - find next 8-bit clump with set bits in a memory region + * @clump: location to store copy of found clump + * @addr: address to base the search on + * @size: bitmap size in number of bits + * @offset: bit offset at which to start searching + * + * Returns the bit offset for the next set clump; the found clump value is + * copied to the location pointed by @clump. If no bits are set, returns @size. + */ +extern unsigned long find_next_clump8(unsigned long *clump, + const unsigned long *addr, + unsigned long size, unsigned long offset); + +#define find_first_clump8(clump, bits, size) \ + find_next_clump8((clump), (bits), (size), 0) + + +#endif /*__LINUX_FIND_H_ */ diff --git a/tools/lib/find_bit.c b/tools/lib/find_bit.c index 109aa7ffcf973..ba4b8d94e0048 100644 --- a/tools/lib/find_bit.c +++ b/tools/lib/find_bit.c @@ -96,6 +96,26 @@ unsigned long _find_first_bit(const unsigned long *addr, unsigned long size) } #endif +#ifndef find_first_and_bit +/* + * Find the first set bit in two memory regions. + */ +unsigned long _find_first_and_bit(const unsigned long *addr1, + const unsigned long *addr2, + unsigned long size) +{ + unsigned long idx, val; + + for (idx = 0; idx * BITS_PER_LONG < size; idx++) { + val = addr1[idx] & addr2[idx]; + if (val) + return min(idx * BITS_PER_LONG + __ffs(val), size); + } + + return size; +} +#endif + #ifndef find_first_zero_bit /* * Find the first cleared bit in a memory region. From 67d486cbe46e30cb5f285f19c3225c2efbc67c74 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Tue, 24 Aug 2021 09:59:57 +1000 Subject: [PATCH 328/365] cpumask: replace cpumask_next_* with cpumask_first_* where appropriate cpumask_first() is a more effective analogue of 'next' version if n == -1 (which means start == 0). This patch replaces 'next' with 'first' where things look trivial. There's no cpumask_first_zero() function, so create it. Link: https://lkml.kernel.org/r/20210814211713.180533-10-yury.norov@gmail.com Signed-off-by: Yury Norov Tested-by: Wolfram Sang Cc: Alexander Lobakin Cc: Alexey Klimov Cc: Andy Shevchenko Cc: Dennis Zhou Cc: Jiri Olsa Cc: Ulf Hansson Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/powerpc/include/asm/cputhreads.h | 2 +- block/blk-mq.c | 2 +- drivers/net/virtio_net.c | 2 +- drivers/soc/fsl/qbman/bman_portal.c | 2 +- drivers/soc/fsl/qbman/qman_portal.c | 2 +- include/linux/cpumask.h | 16 ++++++++++++++++ kernel/time/clocksource.c | 4 ++-- 7 files changed, 23 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/include/asm/cputhreads.h b/arch/powerpc/include/asm/cputhreads.h index b167186aaee4a..44286df21d2ae 100644 --- a/arch/powerpc/include/asm/cputhreads.h +++ b/arch/powerpc/include/asm/cputhreads.h @@ -52,7 +52,7 @@ static inline cpumask_t cpu_thread_mask_to_cores(const struct cpumask *threads) for (i = 0; i < NR_CPUS; i += threads_per_core) { cpumask_shift_left(&tmp, &threads_core_mask, i); if (cpumask_intersects(threads, &tmp)) { - cpu = cpumask_next_and(-1, &tmp, cpu_online_mask); + cpu = cpumask_first_and(&tmp, cpu_online_mask); if (cpu < nr_cpu_ids) cpumask_set_cpu(cpu, &res); } diff --git a/block/blk-mq.c b/block/blk-mq.c index 9d4fdc2be88a5..2bf57e0cce507 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2524,7 +2524,7 @@ static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx) static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu, struct blk_mq_hw_ctx *hctx) { - if (cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu) + if (cpumask_first_and(hctx->cpumask, cpu_online_mask) != cpu) return false; if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids) return false; diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index eee493685aad5..4d04caca09bb0 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2091,7 +2091,7 @@ static void virtnet_set_affinity(struct virtnet_info *vi) stragglers = num_cpu >= vi->curr_queue_pairs ? num_cpu % vi->curr_queue_pairs : 0; - cpu = cpumask_next(-1, cpu_online_mask); + cpu = cpumask_first(cpu_online_mask); for (i = 0; i < vi->curr_queue_pairs; i++) { group_size = stride + (i < stragglers ? 1 : 0); diff --git a/drivers/soc/fsl/qbman/bman_portal.c b/drivers/soc/fsl/qbman/bman_portal.c index acda8a5637c52..4d7b9caee1c47 100644 --- a/drivers/soc/fsl/qbman/bman_portal.c +++ b/drivers/soc/fsl/qbman/bman_portal.c @@ -155,7 +155,7 @@ static int bman_portal_probe(struct platform_device *pdev) } spin_lock(&bman_lock); - cpu = cpumask_next_zero(-1, &portal_cpus); + cpu = cpumask_first_zero(&portal_cpus); if (cpu >= nr_cpu_ids) { __bman_portals_probed = 1; /* unassigned portal, skip init */ diff --git a/drivers/soc/fsl/qbman/qman_portal.c b/drivers/soc/fsl/qbman/qman_portal.c index 96f74a1dc6035..e23b60618c1a1 100644 --- a/drivers/soc/fsl/qbman/qman_portal.c +++ b/drivers/soc/fsl/qbman/qman_portal.c @@ -248,7 +248,7 @@ static int qman_portal_probe(struct platform_device *pdev) pcfg->pools = qm_get_pools_sdqcr(); spin_lock(&qman_lock); - cpu = cpumask_next_zero(-1, &portal_cpus); + cpu = cpumask_first_zero(&portal_cpus); if (cpu >= nr_cpu_ids) { __qman_portals_probed = 1; /* unassigned portal, skip init */ diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 4a03f6636e6ce..f5883a8f28caa 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -123,6 +123,11 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp) return 0; } +static inline unsigned int cpumask_first_zero(const struct cpumask *srcp) +{ + return 0; +} + static inline unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask *srcp2) { @@ -201,6 +206,17 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp) return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits); } +/** + * cpumask_first_zero - get the first unset cpu in a cpumask + * @srcp: the cpumask pointer + * + * Returns >= nr_cpu_ids if all cpus are set. + */ +static inline unsigned int cpumask_first_zero(const struct cpumask *srcp) +{ + return find_first_zero_bit(cpumask_bits(srcp), nr_cpumask_bits); +} + /** * cpumask_first_and - return the first cpu from *srcp1 & *srcp2 * @src1p: the first input diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index b89c76e1c02c4..fbbee6fca1ba8 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -257,7 +257,7 @@ static void clocksource_verify_choose_cpus(void) return; /* Make sure to select at least one CPU other than the current CPU. */ - cpu = cpumask_next(-1, cpu_online_mask); + cpu = cpumask_first(cpu_online_mask); if (cpu == smp_processor_id()) cpu = cpumask_next(cpu, cpu_online_mask); if (WARN_ON_ONCE(cpu >= nr_cpu_ids)) @@ -279,7 +279,7 @@ static void clocksource_verify_choose_cpus(void) cpu = prandom_u32() % nr_cpu_ids; cpu = cpumask_next(cpu - 1, cpu_online_mask); if (cpu >= nr_cpu_ids) - cpu = cpumask_next(-1, cpu_online_mask); + cpu = cpumask_first(cpu_online_mask); if (!WARN_ON_ONCE(cpu >= nr_cpu_ids)) cpumask_set_cpu(cpu, &cpus_chosen); } From 748900a6d44ce4f961b1be3942a04d2b8c952c2b Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Tue, 24 Aug 2021 09:59:57 +1000 Subject: [PATCH 329/365] include/linux: move for_each_bit() macros from bitops.h to find.h for_each_bit() macros depend on find_bit() machinery, and so the proper place for them is the find.h header. Link: https://lkml.kernel.org/r/20210814211713.180533-11-yury.norov@gmail.com Signed-off-by: Yury Norov Tested-by: Wolfram Sang Cc: Alexander Lobakin Cc: Alexey Klimov Cc: Andy Shevchenko Cc: Dennis Zhou Cc: Jiri Olsa Cc: Ulf Hansson Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/bitops.h | 34 ---------------------------------- include/linux/find.h | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 34 deletions(-) diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 26bf15e6cd35d..31ae1ae1a974d 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -31,40 +31,6 @@ extern unsigned long __sw_hweight64(__u64 w); */ #include -#define for_each_set_bit(bit, addr, size) \ - for ((bit) = find_first_bit((addr), (size)); \ - (bit) < (size); \ - (bit) = find_next_bit((addr), (size), (bit) + 1)) - -/* same as for_each_set_bit() but use bit as value to start with */ -#define for_each_set_bit_from(bit, addr, size) \ - for ((bit) = find_next_bit((addr), (size), (bit)); \ - (bit) < (size); \ - (bit) = find_next_bit((addr), (size), (bit) + 1)) - -#define for_each_clear_bit(bit, addr, size) \ - for ((bit) = find_first_zero_bit((addr), (size)); \ - (bit) < (size); \ - (bit) = find_next_zero_bit((addr), (size), (bit) + 1)) - -/* same as for_each_clear_bit() but use bit as value to start with */ -#define for_each_clear_bit_from(bit, addr, size) \ - for ((bit) = find_next_zero_bit((addr), (size), (bit)); \ - (bit) < (size); \ - (bit) = find_next_zero_bit((addr), (size), (bit) + 1)) - -/** - * for_each_set_clump8 - iterate over bitmap for each 8-bit clump with set bits - * @start: bit offset to start search and to store the current iteration offset - * @clump: location to store copy of current 8-bit clump - * @bits: bitmap address to base the search on - * @size: bitmap size in number of bits - */ -#define for_each_set_clump8(start, clump, bits, size) \ - for ((start) = find_first_clump8(&(clump), (bits), (size)); \ - (start) < (size); \ - (start) = find_next_clump8(&(clump), (bits), (size), (start) + 8)) - static inline int get_bitmask_order(unsigned int count) { int order; diff --git a/include/linux/find.h b/include/linux/find.h index 6048f8c974185..4500e8ab93e24 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -279,4 +279,38 @@ unsigned long find_next_bit_le(const void *addr, unsigned #error "Please fix " #endif +#define for_each_set_bit(bit, addr, size) \ + for ((bit) = find_first_bit((addr), (size)); \ + (bit) < (size); \ + (bit) = find_next_bit((addr), (size), (bit) + 1)) + +/* same as for_each_set_bit() but use bit as value to start with */ +#define for_each_set_bit_from(bit, addr, size) \ + for ((bit) = find_next_bit((addr), (size), (bit)); \ + (bit) < (size); \ + (bit) = find_next_bit((addr), (size), (bit) + 1)) + +#define for_each_clear_bit(bit, addr, size) \ + for ((bit) = find_first_zero_bit((addr), (size)); \ + (bit) < (size); \ + (bit) = find_next_zero_bit((addr), (size), (bit) + 1)) + +/* same as for_each_clear_bit() but use bit as value to start with */ +#define for_each_clear_bit_from(bit, addr, size) \ + for ((bit) = find_next_zero_bit((addr), (size), (bit)); \ + (bit) < (size); \ + (bit) = find_next_zero_bit((addr), (size), (bit) + 1)) + +/** + * for_each_set_clump8 - iterate over bitmap for each 8-bit clump with set bits + * @start: bit offset to start search and to store the current iteration offset + * @clump: location to store copy of current 8-bit clump + * @bits: bitmap address to base the search on + * @size: bitmap size in number of bits + */ +#define for_each_set_clump8(start, clump, bits, size) \ + for ((start) = find_first_clump8(&(clump), (bits), (size)); \ + (start) < (size); \ + (start) = find_next_clump8(&(clump), (bits), (size), (start) + 8)) + #endif /*__LINUX_FIND_H_ */ From e4beeb99bc4b61ff5a958572348543f382b61dfc Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Tue, 24 Aug 2021 09:59:57 +1000 Subject: [PATCH 330/365] find: micro-optimize for_each_{set,clear}_bit() The macros iterate thru all set/clear bits in a bitmap. They search a first bit using find_first_bit(), and the rest bits using find_next_bit(). Since find_next_bit() is called shortly after find_first_bit(), we can save few lines of I-cache by not using find_first_bit(). Link: https://lkml.kernel.org/r/20210814211713.180533-12-yury.norov@gmail.com Signed-off-by: Yury Norov Tested-by: Wolfram Sang Cc: Alexander Lobakin Cc: Alexey Klimov Cc: Andy Shevchenko Cc: Dennis Zhou Cc: Jiri Olsa Cc: Ulf Hansson Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/find.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/find.h b/include/linux/find.h index 4500e8ab93e24..ae9ed52b52b8b 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -280,7 +280,7 @@ unsigned long find_next_bit_le(const void *addr, unsigned #endif #define for_each_set_bit(bit, addr, size) \ - for ((bit) = find_first_bit((addr), (size)); \ + for ((bit) = find_next_bit((addr), (size), 0); \ (bit) < (size); \ (bit) = find_next_bit((addr), (size), (bit) + 1)) @@ -291,7 +291,7 @@ unsigned long find_next_bit_le(const void *addr, unsigned (bit) = find_next_bit((addr), (size), (bit) + 1)) #define for_each_clear_bit(bit, addr, size) \ - for ((bit) = find_first_zero_bit((addr), (size)); \ + for ((bit) = find_next_zero_bit((addr), (size), 0); \ (bit) < (size); \ (bit) = find_next_zero_bit((addr), (size), (bit) + 1)) From c83c6d80daded5580aa697dc3726e10fa06dab26 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Tue, 24 Aug 2021 09:59:57 +1000 Subject: [PATCH 331/365] bitops: replace for_each_*_bit_from() with for_each_*_bit() where appropriate A couple of kernel functions call for_each_*_bit_from() with start bit equal to 0. Replace them with for_each_*_bit(). No functional changes, but might improve on readability. Link: https://lkml.kernel.org/r/20210814211713.180533-13-yury.norov@gmail.com Signed-off-by: Yury Norov Tested-by: Wolfram Sang Cc: Alexander Lobakin Cc: Alexey Klimov Cc: Andy Shevchenko Cc: Dennis Zhou Cc: Jiri Olsa Cc: Ulf Hansson Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/x86/kernel/apic/vector.c | 4 ++-- drivers/gpu/drm/etnaviv/etnaviv_gpu.c | 4 ++-- drivers/hwmon/ltc2992.c | 3 +-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index fb67ed5e7e6a8..d099ef226f554 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -760,9 +760,9 @@ void __init lapic_update_legacy_vectors(void) void __init lapic_assign_system_vectors(void) { - unsigned int i, vector = 0; + unsigned int i, vector; - for_each_set_bit_from(vector, system_vectors, NR_VECTORS) + for_each_set_bit(vector, system_vectors, NR_VECTORS) irq_matrix_assign_system(vector_matrix, vector, false); if (nr_legacy_irqs() > 1) diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c index 4102bcea33413..42ce3287d3be8 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c @@ -1032,7 +1032,7 @@ int etnaviv_gpu_debugfs(struct etnaviv_gpu *gpu, struct seq_file *m) void etnaviv_gpu_recover_hang(struct etnaviv_gpu *gpu) { - unsigned int i = 0; + unsigned int i; dev_err(gpu->dev, "recover hung GPU!\n"); @@ -1045,7 +1045,7 @@ void etnaviv_gpu_recover_hang(struct etnaviv_gpu *gpu) /* complete all events, the GPU won't do it after the reset */ spin_lock(&gpu->event_spinlock); - for_each_set_bit_from(i, gpu->event_bitmap, ETNA_NR_EVENTS) + for_each_set_bit(i, gpu->event_bitmap, ETNA_NR_EVENTS) complete(&gpu->event_free); bitmap_zero(gpu->event_bitmap, ETNA_NR_EVENTS); spin_unlock(&gpu->event_spinlock); diff --git a/drivers/hwmon/ltc2992.c b/drivers/hwmon/ltc2992.c index 2a4bed0ab226b..7352d2b3c756d 100644 --- a/drivers/hwmon/ltc2992.c +++ b/drivers/hwmon/ltc2992.c @@ -248,8 +248,7 @@ static int ltc2992_gpio_get_multiple(struct gpio_chip *chip, unsigned long *mask gpio_status = reg; - gpio_nr = 0; - for_each_set_bit_from(gpio_nr, mask, LTC2992_GPIO_NR) { + for_each_set_bit(gpio_nr, mask, LTC2992_GPIO_NR) { if (test_bit(LTC2992_GPIO_BIT(gpio_nr), &gpio_status)) set_bit(gpio_nr, bits); } From a008ed586dbcad411e1742c0af1be071a97886a0 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 24 Aug 2021 09:59:58 +1000 Subject: [PATCH 332/365] tools: rename bitmap_alloc() to bitmap_zalloc() Rename bitmap_alloc() to bitmap_zalloc() in tools to follow the bitmap API in the kernel. No functional changes intended. Link: https://lkml.kernel.org/r/20210814211713.180533-14-yury.norov@gmail.com Signed-off-by: Andy Shevchenko Signed-off-by: Yury Norov Suggested-by: Yury Norov Acked-by: Yury Norov Tested-by: Wolfram Sang Acked-by: Jiri Olsa Cc: Alexander Lobakin Cc: Alexey Klimov Cc: Dennis Zhou Cc: Ulf Hansson Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- tools/include/linux/bitmap.h | 4 ++-- tools/perf/bench/find-bit-bench.c | 2 +- tools/perf/builtin-c2c.c | 6 +++--- tools/perf/builtin-record.c | 2 +- tools/perf/tests/bitmap.c | 2 +- tools/perf/tests/mem2node.c | 2 +- tools/perf/util/affinity.c | 4 ++-- tools/perf/util/header.c | 4 ++-- tools/perf/util/metricgroup.c | 2 +- tools/perf/util/mmap.c | 4 ++-- tools/testing/selftests/kvm/dirty_log_perf_test.c | 2 +- tools/testing/selftests/kvm/dirty_log_test.c | 4 ++-- tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c | 2 +- 13 files changed, 20 insertions(+), 20 deletions(-) diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h index 13d90b574970d..ea97804d04d4a 100644 --- a/tools/include/linux/bitmap.h +++ b/tools/include/linux/bitmap.h @@ -112,10 +112,10 @@ static inline int test_and_clear_bit(int nr, unsigned long *addr) } /** - * bitmap_alloc - Allocate bitmap + * bitmap_zalloc - Allocate bitmap * @nbits: Number of bits */ -static inline unsigned long *bitmap_alloc(int nbits) +static inline unsigned long *bitmap_zalloc(int nbits) { return calloc(1, BITS_TO_LONGS(nbits) * sizeof(unsigned long)); } diff --git a/tools/perf/bench/find-bit-bench.c b/tools/perf/bench/find-bit-bench.c index 73b5bcc5946ad..22b5cfe970237 100644 --- a/tools/perf/bench/find-bit-bench.c +++ b/tools/perf/bench/find-bit-bench.c @@ -54,7 +54,7 @@ static bool asm_test_bit(long nr, const unsigned long *addr) static int do_for_each_set_bit(unsigned int num_bits) { - unsigned long *to_test = bitmap_alloc(num_bits); + unsigned long *to_test = bitmap_zalloc(num_bits); struct timeval start, end, diff; u64 runtime_us; struct stats fb_time_stats, tb_time_stats; diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c index 6dea37f141b25..c34d77bee4ef3 100644 --- a/tools/perf/builtin-c2c.c +++ b/tools/perf/builtin-c2c.c @@ -139,11 +139,11 @@ static void *c2c_he_zalloc(size_t size) if (!c2c_he) return NULL; - c2c_he->cpuset = bitmap_alloc(c2c.cpus_cnt); + c2c_he->cpuset = bitmap_zalloc(c2c.cpus_cnt); if (!c2c_he->cpuset) return NULL; - c2c_he->nodeset = bitmap_alloc(c2c.nodes_cnt); + c2c_he->nodeset = bitmap_zalloc(c2c.nodes_cnt); if (!c2c_he->nodeset) return NULL; @@ -2047,7 +2047,7 @@ static int setup_nodes(struct perf_session *session) struct perf_cpu_map *map = n[node].map; unsigned long *set; - set = bitmap_alloc(c2c.cpus_cnt); + set = bitmap_zalloc(c2c.cpus_cnt); if (!set) return -ENOMEM; diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 671a21c9ee4d7..f1b30ac094cbd 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -2786,7 +2786,7 @@ int cmd_record(int argc, const char **argv) if (rec->opts.affinity != PERF_AFFINITY_SYS) { rec->affinity_mask.nbits = cpu__max_cpu(); - rec->affinity_mask.bits = bitmap_alloc(rec->affinity_mask.nbits); + rec->affinity_mask.bits = bitmap_zalloc(rec->affinity_mask.nbits); if (!rec->affinity_mask.bits) { pr_err("Failed to allocate thread mask for %zd cpus\n", rec->affinity_mask.nbits); err = -ENOMEM; diff --git a/tools/perf/tests/bitmap.c b/tools/perf/tests/bitmap.c index 96c137360918f..12b805efdca0d 100644 --- a/tools/perf/tests/bitmap.c +++ b/tools/perf/tests/bitmap.c @@ -14,7 +14,7 @@ static unsigned long *get_bitmap(const char *str, int nbits) unsigned long *bm = NULL; int i; - bm = bitmap_alloc(nbits); + bm = bitmap_zalloc(nbits); if (map && bm) { for (i = 0; i < map->nr; i++) diff --git a/tools/perf/tests/mem2node.c b/tools/perf/tests/mem2node.c index a258bd51f1a41..e4d0d58b97f81 100644 --- a/tools/perf/tests/mem2node.c +++ b/tools/perf/tests/mem2node.c @@ -27,7 +27,7 @@ static unsigned long *get_bitmap(const char *str, int nbits) unsigned long *bm = NULL; int i; - bm = bitmap_alloc(nbits); + bm = bitmap_zalloc(nbits); if (map && bm) { for (i = 0; i < map->nr; i++) { diff --git a/tools/perf/util/affinity.c b/tools/perf/util/affinity.c index a5e31f8268280..7b12bd7a30802 100644 --- a/tools/perf/util/affinity.c +++ b/tools/perf/util/affinity.c @@ -25,11 +25,11 @@ int affinity__setup(struct affinity *a) { int cpu_set_size = get_cpu_set_size(); - a->orig_cpus = bitmap_alloc(cpu_set_size * 8); + a->orig_cpus = bitmap_zalloc(cpu_set_size * 8); if (!a->orig_cpus) return -1; sched_getaffinity(0, cpu_set_size, (cpu_set_t *)a->orig_cpus); - a->sched_cpus = bitmap_alloc(cpu_set_size * 8); + a->sched_cpus = bitmap_zalloc(cpu_set_size * 8); if (!a->sched_cpus) { zfree(&a->orig_cpus); return -1; diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 44249027507a5..563dec72adeb2 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -278,7 +278,7 @@ static int do_read_bitmap(struct feat_fd *ff, unsigned long **pset, u64 *psize) if (ret) return ret; - set = bitmap_alloc(size); + set = bitmap_zalloc(size); if (!set) return -ENOMEM; @@ -1294,7 +1294,7 @@ static int memory_node__read(struct memory_node *n, unsigned long idx) size++; - n->set = bitmap_alloc(size); + n->set = bitmap_zalloc(size); if (!n->set) { closedir(dir); return -ENOMEM; diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c index 99d047c5ead0f..29b747ac31c12 100644 --- a/tools/perf/util/metricgroup.c +++ b/tools/perf/util/metricgroup.c @@ -313,7 +313,7 @@ static int metricgroup__setup_events(struct list_head *groups, struct evsel *evsel, *tmp; unsigned long *evlist_used; - evlist_used = bitmap_alloc(perf_evlist->core.nr_entries); + evlist_used = bitmap_zalloc(perf_evlist->core.nr_entries); if (!evlist_used) return -ENOMEM; diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c index ab7108d22428b..512dc8b9c1685 100644 --- a/tools/perf/util/mmap.c +++ b/tools/perf/util/mmap.c @@ -106,7 +106,7 @@ static int perf_mmap__aio_bind(struct mmap *map, int idx, int cpu, int affinity) data = map->aio.data[idx]; mmap_len = mmap__mmap_len(map); node_index = cpu__get_node(cpu); - node_mask = bitmap_alloc(node_index + 1); + node_mask = bitmap_zalloc(node_index + 1); if (!node_mask) { pr_err("Failed to allocate node mask for mbind: error %m\n"); return -1; @@ -258,7 +258,7 @@ static void build_node_mask(int node, struct mmap_cpu_mask *mask) static int perf_mmap__setup_affinity_mask(struct mmap *map, struct mmap_params *mp) { map->affinity_mask.nbits = cpu__max_cpu(); - map->affinity_mask.bits = bitmap_alloc(map->affinity_mask.nbits); + map->affinity_mask.bits = bitmap_zalloc(map->affinity_mask.nbits); if (!map->affinity_mask.bits) return -1; diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c index 80cbd3a748c02..f0dd381e0a16e 100644 --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -121,7 +121,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) guest_num_pages = (nr_vcpus * guest_percpu_mem_size) >> vm_get_page_shift(vm); guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages); host_num_pages = vm_num_host_pages(mode, guest_num_pages); - bmap = bitmap_alloc(host_num_pages); + bmap = bitmap_zalloc(host_num_pages); if (dirty_log_manual_caps) { cap.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2; diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index 5fe0140e407e6..792c60e1b17dd 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -749,8 +749,8 @@ static void run_test(enum vm_guest_mode mode, void *arg) pr_info("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem); - bmap = bitmap_alloc(host_num_pages); - host_bmap_track = bitmap_alloc(host_num_pages); + bmap = bitmap_zalloc(host_num_pages); + host_bmap_track = bitmap_zalloc(host_num_pages); /* Add an extra memory slot for testing dirty logging */ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, diff --git a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c index 06a64980a5d25..68f26a8b4f42c 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c @@ -111,7 +111,7 @@ int main(int argc, char *argv[]) nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, 4096); nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, 4096); - bmap = bitmap_alloc(TEST_MEM_PAGES); + bmap = bitmap_zalloc(TEST_MEM_PAGES); host_test_mem = addr_gpa2hva(vm, GUEST_TEST_MEM); while (!done) { From 504fb2dec1b8192d8d419b6133538467fc1347d0 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Tue, 24 Aug 2021 09:59:58 +1000 Subject: [PATCH 333/365] mm/percpu: micro-optimize pcpu_is_populated() bitmap_next_clear_region() calls find_next_zero_bit() and find_next_bit() sequentially to find a range of clear bits. In case of pcpu_is_populated() there's a chance to return earlier if bitmap has all bits set. Link: https://lkml.kernel.org/r/20210814211713.180533-15-yury.norov@gmail.com Signed-off-by: Yury Norov Tested-by: Wolfram Sang Acked-by: Dennis Zhou Cc: Alexander Lobakin Cc: Alexey Klimov Cc: Andy Shevchenko Cc: Jiri Olsa Cc: Ulf Hansson Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/percpu.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index a43039629aa46..8bf8b88446d73 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1070,17 +1070,18 @@ static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off, static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits, int *next_off) { - unsigned int page_start, page_end, rs, re; + unsigned int start, end; - page_start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE); - page_end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE); + start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE); + end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE); - rs = page_start; - bitmap_next_clear_region(chunk->populated, &rs, &re, page_end); - if (rs >= page_end) + start = find_next_zero_bit(chunk->populated, end, start); + if (start >= end) return true; - *next_off = re * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE; + end = find_next_bit(chunk->populated, end, start + 1); + + *next_off = end * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE; return false; } From c20500c9abe9333b13375191f65999741d950575 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Tue, 24 Aug 2021 09:59:58 +1000 Subject: [PATCH 334/365] bitmap: unify find_bit operations bitmap_for_each_{set,clear}_region() are similar to for_each_bit() macros in include/linux/find.h, but interface and implementation of them are different. This patch adds for_each_bitrange() macros and drops unused bitmap_*_region() API in sake of unification. Link: https://lkml.kernel.org/r/20210814211713.180533-16-yury.norov@gmail.com Signed-off-by: Yury Norov Tested-by: Wolfram Sang Acked-by: Dennis Zhou Acked-by: Ulf Hansson [MMC] Cc: Alexander Lobakin Cc: Alexey Klimov Cc: Andy Shevchenko Cc: Jiri Olsa Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- drivers/mmc/host/renesas_sdhi_core.c | 2 +- include/linux/bitmap.h | 33 ---------------- include/linux/find.h | 56 ++++++++++++++++++++++++++++ mm/percpu.c | 20 ++++------ 4 files changed, 65 insertions(+), 46 deletions(-) diff --git a/drivers/mmc/host/renesas_sdhi_core.c b/drivers/mmc/host/renesas_sdhi_core.c index e49ca0f7fe9a8..efd33b1fc4675 100644 --- a/drivers/mmc/host/renesas_sdhi_core.c +++ b/drivers/mmc/host/renesas_sdhi_core.c @@ -647,7 +647,7 @@ static int renesas_sdhi_select_tuning(struct tmio_mmc_host *host) * is at least SH_MOBILE_SDHI_MIN_TAP_ROW probes long then use the * center index as the tap, otherwise bail out. */ - bitmap_for_each_set_region(bitmap, rs, re, 0, taps_size) { + for_each_set_bitrange(rs, re, bitmap, taps_size) { if (re - rs > tap_cnt) { tap_end = re; tap_start = rs; diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 3f7c6731b203b..6938edd457c2d 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -55,12 +55,6 @@ struct device; * bitmap_clear(dst, pos, nbits) Clear specified bit area * bitmap_find_next_zero_area(buf, len, pos, n, mask) Find bit free area * bitmap_find_next_zero_area_off(buf, len, pos, n, mask, mask_off) as above - * bitmap_next_clear_region(map, &start, &end, nbits) Find next clear region - * bitmap_next_set_region(map, &start, &end, nbits) Find next set region - * bitmap_for_each_clear_region(map, rs, re, start, end) - * Iterate over all clear regions - * bitmap_for_each_set_region(map, rs, re, start, end) - * Iterate over all set regions * bitmap_shift_right(dst, src, n, nbits) *dst = *src >> n * bitmap_shift_left(dst, src, n, nbits) *dst = *src << n * bitmap_cut(dst, src, first, n, nbits) Cut n bits from first, copy rest @@ -459,14 +453,6 @@ static inline void bitmap_replace(unsigned long *dst, __bitmap_replace(dst, old, new, mask, nbits); } -static inline void bitmap_next_clear_region(unsigned long *bitmap, - unsigned int *rs, unsigned int *re, - unsigned int end) -{ - *rs = find_next_zero_bit(bitmap, end, *rs); - *re = find_next_bit(bitmap, end, *rs + 1); -} - static inline void bitmap_next_set_region(unsigned long *bitmap, unsigned int *rs, unsigned int *re, unsigned int end) @@ -475,25 +461,6 @@ static inline void bitmap_next_set_region(unsigned long *bitmap, *re = find_next_zero_bit(bitmap, end, *rs + 1); } -/* - * Bitmap region iterators. Iterates over the bitmap between [@start, @end). - * @rs and @re should be integer variables and will be set to start and end - * index of the current clear or set region. - */ -#define bitmap_for_each_clear_region(bitmap, rs, re, start, end) \ - for ((rs) = (start), \ - bitmap_next_clear_region((bitmap), &(rs), &(re), (end)); \ - (rs) < (re); \ - (rs) = (re) + 1, \ - bitmap_next_clear_region((bitmap), &(rs), &(re), (end))) - -#define bitmap_for_each_set_region(bitmap, rs, re, start, end) \ - for ((rs) = (start), \ - bitmap_next_set_region((bitmap), &(rs), &(re), (end)); \ - (rs) < (re); \ - (rs) = (re) + 1, \ - bitmap_next_set_region((bitmap), &(rs), &(re), (end))) - /** * BITMAP_FROM_U64() - Represent u64 value in the format suitable for bitmap. * @n: u64 value diff --git a/include/linux/find.h b/include/linux/find.h index ae9ed52b52b8b..5bb6db213bcb8 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -301,6 +301,62 @@ unsigned long find_next_bit_le(const void *addr, unsigned (bit) < (size); \ (bit) = find_next_zero_bit((addr), (size), (bit) + 1)) +/** + * for_each_set_bitrange - iterate over all set bit ranges [b; e) + * @b: bit offset of start of current bitrange (first set bit) + * @e: bit offset of end of current bitrange (first unset bit) + * @addr: bitmap address to base the search on + * @size: bitmap size in number of bits + */ +#define for_each_set_bitrange(b, e, addr, size) \ + for ((b) = find_next_bit((addr), (size), 0), \ + (e) = find_next_zero_bit((addr), (size), (b) + 1); \ + (b) < (size); \ + (b) = find_next_bit((addr), (size), (e) + 1), \ + (e) = find_next_zero_bit((addr), (size), (b) + 1)) + +/** + * for_each_set_bitrange_from - iterate over all set bit ranges [b; e) + * @b: bit offset of start of current bitrange (first set bit); must be initialized + * @e: bit offset of end of current bitrange (first unset bit) + * @addr: bitmap address to base the search on + * @size: bitmap size in number of bits + */ +#define for_each_set_bitrange_from(b, e, addr, size) \ + for ((b) = find_next_bit((addr), (size), (b)), \ + (e) = find_next_zero_bit((addr), (size), (b) + 1); \ + (b) < (size); \ + (b) = find_next_bit((addr), (size), (e) + 1), \ + (e) = find_next_zero_bit((addr), (size), (b) + 1)) + +/** + * for_each_clear_bitrange - iterate over all unset bit ranges [b; e) + * @b: bit offset of start of current bitrange (first unset bit) + * @e: bit offset of end of current bitrange (first set bit) + * @addr: bitmap address to base the search on + * @size: bitmap size in number of bits + */ +#define for_each_clear_bitrange(b, e, addr, size) \ + for ((b) = find_next_zero_bit((addr), (size), 0), \ + (e) = find_next_bit((addr), (size), (b) + 1); \ + (b) < (size); \ + (b) = find_next_zero_bit((addr), (size), (e) + 1), \ + (e) = find_next_bit((addr), (size), (b) + 1)) + +/** + * for_each_clear_bitrange_from - iterate over all unset bit ranges [b; e) + * @b: bit offset of start of current bitrange (first set bit); must be initialized + * @e: bit offset of end of current bitrange (first unset bit) + * @addr: bitmap address to base the search on + * @size: bitmap size in number of bits + */ +#define for_each_clear_bitrange_from(b, e, addr, size) \ + for ((b) = find_next_zero_bit((addr), (size), (b)), \ + (e) = find_next_bit((addr), (size), (b) + 1); \ + (b) < (size); \ + (b) = find_next_zero_bit((addr), (size), (e) + 1), \ + (e) = find_next_bit((addr), (size), (b) + 1)) + /** * for_each_set_clump8 - iterate over bitmap for each 8-bit clump with set bits * @start: bit offset to start search and to store the current iteration offset diff --git a/mm/percpu.c b/mm/percpu.c index 8bf8b88446d73..8c16c8e04eee5 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -779,7 +779,7 @@ static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index) { struct pcpu_block_md *block = chunk->md_blocks + index; unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index); - unsigned int rs, re, start; /* region start, region end */ + unsigned int start, end; /* region start, region end */ /* promote scan_hint to contig_hint */ if (block->scan_hint) { @@ -795,9 +795,8 @@ static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index) block->right_free = 0; /* iterate over free areas and update the contig hints */ - bitmap_for_each_clear_region(alloc_map, rs, re, start, - PCPU_BITMAP_BLOCK_BITS) - pcpu_block_update(block, rs, re); + for_each_clear_bitrange_from(start, end, alloc_map, PCPU_BITMAP_BLOCK_BITS) + pcpu_block_update(block, start, end); } /** @@ -1855,13 +1854,12 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, /* populate if not all pages are already there */ if (!is_atomic) { - unsigned int page_start, page_end, rs, re; + unsigned int page_end, rs, re; - page_start = PFN_DOWN(off); + rs = PFN_DOWN(off); page_end = PFN_UP(off + size); - bitmap_for_each_clear_region(chunk->populated, rs, re, - page_start, page_end) { + for_each_clear_bitrange_from(rs, re, chunk->populated, page_end) { WARN_ON(chunk->immutable); ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp); @@ -2017,8 +2015,7 @@ static void pcpu_balance_free(bool empty_only) list_for_each_entry_safe(chunk, next, &to_free, list) { unsigned int rs, re; - bitmap_for_each_set_region(chunk->populated, rs, re, 0, - chunk->nr_pages) { + for_each_set_bitrange(rs, re, chunk->populated, chunk->nr_pages) { pcpu_depopulate_chunk(chunk, rs, re); spin_lock_irq(&pcpu_lock); pcpu_chunk_depopulated(chunk, rs, re); @@ -2088,8 +2085,7 @@ static void pcpu_balance_populated(void) continue; /* @chunk can't go away while pcpu_alloc_mutex is held */ - bitmap_for_each_clear_region(chunk->populated, rs, re, 0, - chunk->nr_pages) { + for_each_clear_bitrange(rs, re, chunk->populated, chunk->nr_pages) { int nr = min_t(int, re - rs, nr_to_pop); spin_unlock_irq(&pcpu_lock); From f0026b82ef9c801bf6991505c14451e278341d10 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Tue, 24 Aug 2021 09:59:58 +1000 Subject: [PATCH 335/365] lib: bitmap: add performance test for bitmap_print_to_pagebuf Functional tests for bitmap_print_to_pagebuf() are provided in lib/test_printf.c. This patch adds performance test for a case of fully set bitmap. Link: https://lkml.kernel.org/r/20210814211713.180533-17-yury.norov@gmail.com Signed-off-by: Yury Norov Tested-by: Wolfram Sang Cc: Alexander Lobakin Cc: Alexey Klimov Cc: Andy Shevchenko Cc: Dennis Zhou Cc: Jiri Olsa Cc: Ulf Hansson Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- lib/test_bitmap.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c index 4ea73f5aed41f..452d525007da3 100644 --- a/lib/test_bitmap.c +++ b/lib/test_bitmap.c @@ -430,6 +430,42 @@ static void __init test_bitmap_parselist(void) } } +static void __init test_bitmap_printlist(void) +{ + unsigned long *bmap = kmalloc(PAGE_SIZE, GFP_KERNEL); + char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + char expected[256]; + int ret, slen; + ktime_t time; + + if (!buf || !bmap) + goto out; + + memset(bmap, -1, PAGE_SIZE); + slen = snprintf(expected, 256, "0-%ld", PAGE_SIZE * 8 - 1); + if (slen < 0) + goto out; + + time = ktime_get(); + ret = bitmap_print_to_pagebuf(true, buf, bmap, PAGE_SIZE * 8); + time = ktime_get() - time; + + if (ret != slen + 1) { + pr_err("bitmap_print_to_pagebuf: result is %d, expected %d\n", ret, slen); + goto out; + } + + if (strncmp(buf, expected, slen)) { + pr_err("bitmap_print_to_pagebuf: result is %s, expected %s\n", buf, expected); + goto out; + } + + pr_err("bitmap_print_to_pagebuf: input is '%s', Time: %llu\n", buf, time); +out: + kfree(buf); + kfree(bmap); +} + static const unsigned long parse_test[] __initconst = { BITMAP_FROM_U64(0), BITMAP_FROM_U64(1), @@ -669,6 +705,7 @@ static void __init selftest(void) test_bitmap_arr32(); test_bitmap_parse(); test_bitmap_parselist(); + test_bitmap_printlist(); test_mem_optimisations(); test_for_each_set_clump8(); test_bitmap_cut(); From ec91085ca2fca97c3caa548e01066a8f6e6c8ada Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Tue, 24 Aug 2021 09:59:58 +1000 Subject: [PATCH 336/365] vsprintf: rework bitmap_list_string bitmap_list_string() is very ineffective when printing bitmaps with long ranges of set bits because it calls find_next_bit for each bit in the bitmap. We can do better by detecting ranges of set bits. In my environment, before/after is 943008/31008 ns. Link: https://lkml.kernel.org/r/20210814211713.180533-18-yury.norov@gmail.com Signed-off-by: Yury Norov Tested-by: Wolfram Sang Cc: Alexander Lobakin Cc: Alexey Klimov Cc: Andy Shevchenko Cc: Dennis Zhou Cc: Jiri Olsa Cc: Ulf Hansson Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- lib/vsprintf.c | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/lib/vsprintf.c b/lib/vsprintf.c index cc7bdd3ac2ee8..cd9f359c5bf9e 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1241,20 +1241,13 @@ char *bitmap_list_string(char *buf, char *end, unsigned long *bitmap, struct printf_spec spec, const char *fmt) { int nr_bits = max_t(int, spec.field_width, 0); - /* current bit is 'cur', most recently seen range is [rbot, rtop] */ - int cur, rbot, rtop; bool first = true; + int rbot, rtop; if (check_pointer(&buf, end, bitmap, spec)) return buf; - rbot = cur = find_first_bit(bitmap, nr_bits); - while (cur < nr_bits) { - rtop = cur; - cur = find_next_bit(bitmap, nr_bits, cur + 1); - if (cur < nr_bits && cur <= rtop + 1) - continue; - + for_each_set_bitrange(rbot, rtop, bitmap, nr_bits) { if (!first) { if (buf < end) *buf = ','; @@ -1263,15 +1256,12 @@ char *bitmap_list_string(char *buf, char *end, unsigned long *bitmap, first = false; buf = number(buf, end, rbot, default_dec_spec); - if (rbot < rtop) { - if (buf < end) - *buf = '-'; - buf++; - - buf = number(buf, end, rtop, default_dec_spec); - } + if (rtop == rbot + 1) + continue; - rbot = cur; + if (buf < end) + *buf = '-'; + buf = number(++buf, end, rtop - 1, default_dec_spec); } return buf; } From 7245e75f11074fbb221987cf32f0c999bbbafa5a Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Tue, 24 Aug 2021 09:59:59 +1000 Subject: [PATCH 337/365] lib/vsprintf: don't increment buf in bitmap_list_string Increment is confusing as the buf is overritten at the same line. Link: https://lkml.kernel.org/r/20210817193735.269942-1-yury.norov@gmail.com Fixes: b1c4af4d3d6b (vsprintf: rework bitmap_list_string) (next-20210817) Signed-off-by: Yury Norov Suggested-by: Andy Shevchenko Cc: Wolfram Sang Cc: Alexander Lobakin Cc: Alexey Klimov Cc: Dennis Zhou Cc: Jiri Olsa Cc: Ulf Hansson Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- lib/vsprintf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/vsprintf.c b/lib/vsprintf.c index cd9f359c5bf9e..9046c7720d961 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1261,7 +1261,7 @@ char *bitmap_list_string(char *buf, char *end, unsigned long *bitmap, if (buf < end) *buf = '-'; - buf = number(++buf, end, rtop - 1, default_dec_spec); + buf = number(buf + 1, end, rtop - 1, default_dec_spec); } return buf; } From ca49072752894f99531fbd6ca753c613f8c7cf8a Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 24 Aug 2021 09:59:59 +1000 Subject: [PATCH 338/365] checkpatch: support wide strings Allow prefixing typical strings with L for wide strings and u for unicode strings. Link: https://lkml.kernel.org/r/20210801170733.1.I3f9784fd3c1007d08ec2e70b151d137687575495@changeid Signed-off-by: Joe Perches Signed-off-by: Simon Glass Cc: Dwaipayan Ray Cc: Lukas Bulwahn Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- scripts/checkpatch.pl | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 461d4221e4a4a..a65753c05a693 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -501,7 +501,7 @@ sub hash_show_words { our $Hex = qr{(?i)0x[0-9a-f]+$Int_type?}; our $Int = qr{[0-9]+$Int_type?}; our $Octal = qr{0[0-7]+$Int_type?}; -our $String = qr{"[X\t]*"}; +our $String = qr{(?:\b[Lu])?"[X\t]*"}; our $Float_hex = qr{(?i)0x[0-9a-f]+p-?[0-9]+[fl]?}; our $Float_dec = qr{(?i)(?:[0-9]+\.[0-9]*|[0-9]*\.[0-9]+)(?:e-?[0-9]+)?[fl]?}; our $Float_int = qr{(?i)[0-9]+e-?[0-9]+[fl]?}; @@ -6132,7 +6132,8 @@ sub process { } # concatenated string without spaces between elements - if ($line =~ /$String[A-Za-z0-9_]/ || $line =~ /[A-Za-z0-9_]$String/) { + if ($line =~ /$String[A-Z_]/ || + ($line =~ /([A-Za-z0-9_]+)$String/ && $1 !~ /^[Lu]$/)) { if (CHK("CONCATENATED_STRING", "Concatenated strings should use spaces between elements\n" . $herecurr) && $fix) { @@ -6145,7 +6146,7 @@ sub process { } # uncoalesced string fragments - if ($line =~ /$String\s*"/) { + if ($line =~ /$String\s*[Lu]?"/) { if (WARN("STRING_FRAGMENTS", "Consecutive strings are generally better as a single string\n" . $herecurr) && $fix) { From 6d01cb2324d0cf1272062cf0d08cc8cd34e968d0 Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Tue, 24 Aug 2021 09:59:59 +1000 Subject: [PATCH 339/365] checkpatch: make email address check case insensitive Instead of checkpatch requiring the patch author to exactly match the signed-off-by tag, commit 48ca2d8ac8a1 ("checkpatch: add new warnings to author signoff checks.") safely relaxed this requirement. Although the local-part of an email address (local-part@domain), may be case sensitive, exploiting the case sensitivity of mailbox local-parts impedes interoperability and is discouraged. Mailbox domains follow normal DNS rules and are hence not case sensitive. (Refer to https://datatracker.ietf.org/doc/html/rfc5321#section-2.4.) Further relax the patch author and signed-off-by tag comparison by making the email address check case insensitive. Link: https://lkml.kernel.org/r/20210816112725.173206-1-zohar@linux.ibm.com Signed-off-by: Mimi Zohar Acked-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- scripts/checkpatch.pl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index a65753c05a693..161ce7fe5d1e5 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2909,10 +2909,10 @@ sub process { my ($email_name, $email_comment, $email_address, $comment1) = parse_email($ctx); my ($author_name, $author_comment, $author_address, $comment2) = parse_email($author); - if ($email_address eq $author_address && $email_name eq $author_name) { + if (lc $email_address eq lc $author_address && $email_name eq $author_name) { $author_sob = $ctx; $authorsignoff = 2; - } elsif ($email_address eq $author_address) { + } elsif (lc $email_address eq lc $author_address) { $author_sob = $ctx; $authorsignoff = 3; } elsif ($email_name eq $author_name) { From 9b6d0f30bd1284425aab0de00138e21cb939eaf0 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 24 Aug 2021 09:59:59 +1000 Subject: [PATCH 340/365] checkpatch: improve GIT_COMMIT_ID test The preferred git commit id reference has the form commit ("Title line") where SHA-1 is the commit hex hash with a minimum lenth of 12 and ("Title line") is the complete title line of the commit with a (" prefix and ") suffix. The current tests fail when the "Title line" has one or more embedded double quotes. Improve the test that finds the commit SHA-1 hex hash then ("Title line") by using $balanced_parens for a maximum of 3 consecutive lines. Link: https://lkml.kernel.org/r/976c6cdd680db4b55ae31b5fc2d1779da5c0dc66.camel@perches.com Signed-off-by: Joe Perches Cc: Dwaipayan Ray Cc: Lukas Bulwahn Cc: Denis Efremov Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- scripts/checkpatch.pl | 82 +++++++++++++++++++++++++++---------------- 1 file changed, 51 insertions(+), 31 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 161ce7fe5d1e5..7c33ffafa3f0c 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1181,7 +1181,8 @@ sub git_commit_info { # git log --format='%H %s' -1 $line | # echo "commit $(cut -c 1-12,41-)" # done - } elsif ($lines[0] =~ /^fatal: ambiguous argument '$commit': unknown revision or path not in the working tree\./) { + } elsif ($lines[0] =~ /^fatal: ambiguous argument '$commit': unknown revision or path not in the working tree\./ || + $lines[0] =~ /^fatal: bad object $commit/) { $id = undef; } else { $id = substr($lines[0], 0, 12); @@ -2587,6 +2588,8 @@ sub process { my $reported_maintainer_file = 0; my $non_utf8_charset = 0; + my $last_git_commit_id_linenr = -1; + my $last_blank_line = 0; my $last_coalesced_string_linenr = -1; @@ -3170,10 +3173,20 @@ sub process { } # Check for git id commit length and improperly formed commit descriptions - if ($in_commit_log && !$commit_log_possible_stack_dump && +# A correctly formed commit description is: +# commit ("Complete commit subject") +# with the commit subject '("' prefix and '")' suffix +# This is a fairly compilicated block as it tests for what appears to be +# bare SHA-1 hash with minimum length of 5. It also avoids several types of +# possible SHA-1 matches. +# A commit match can span multiple lines so this block attempts to find a +# complete typical commit on a maximum of 3 lines + if ($perl_version_ok + $in_commit_log && !$commit_log_possible_stack_dump && $line !~ /^\s*(?:Link|Patchwork|http|https|BugLink|base-commit):/i && $line !~ /^This reverts commit [0-9a-f]{7,40}/ && - ($line =~ /\bcommit\s+[0-9a-f]{5,}\b/i || + (($line =~ /\bcommit\s+[0-9a-f]{5,}\b/i || + ($line =~ /\bcommit\s*$/i && defined($rawlines[$linenr]) && $rawlines[$linenr] =~ /^\s*[0-9a-f]{5,}\b/i)) || ($line =~ /(?:\s|^)[0-9a-f]{12,40}(?:[\s"'\(\[]|$)/i && $line !~ /[\<\[][0-9a-f]{12,40}[\>\]]/i && $line !~ /\bfixes:\s*[0-9a-f]{12,40}/i))) { @@ -3183,49 +3196,56 @@ sub process { my $long = 0; my $case = 1; my $space = 1; - my $hasdesc = 0; - my $hasparens = 0; my $id = '0123456789ab'; my $orig_desc = "commit description"; my $description = ""; + my $herectx = $herecurr; + my $has_parens = 0; + my $has_quotes = 0; + + my $input = $line; + if ($line =~ /(?:\bcommit\s+[0-9a-f]{5,}|\bcommit\s*$)/i) { + for (my $n = 0; $n < 2; $n++) { + if ($input =~ /\bcommit\s+[0-9a-f]{5,}\s*($balanced_parens)/i) { + $orig_desc = $1; + $has_parens = 1; + # Always strip leading/trailing parens then double quotes if existing + $orig_desc = substr($orig_desc, 1, -1); + if ($orig_desc =~ /^".*"$/) { + $orig_desc = substr($orig_desc, 1, -1); + $has_quotes = 1; + } + last; + } + last if ($#lines < $linenr + $n); + $input .= " " . trim($rawlines[$linenr + $n]); + $herectx .= "$rawlines[$linenr + $n]\n"; + } + $herectx = $herecurr if (!$has_parens); + } - if ($line =~ /\b(c)ommit\s+([0-9a-f]{5,})\b/i) { + if ($input =~ /\b(c)ommit\s+([0-9a-f]{5,})\b/i) { $init_char = $1; $orig_commit = lc($2); - } elsif ($line =~ /\b([0-9a-f]{12,40})\b/i) { + $short = 0 if ($input =~ /\bcommit\s+[0-9a-f]{12,40}/i); + $long = 1 if ($input =~ /\bcommit\s+[0-9a-f]{41,}/i); + $space = 0 if ($input =~ /\bcommit [0-9a-f]/i); + $case = 0 if ($input =~ /\b[Cc]ommit\s+[0-9a-f]{5,40}[^A-F]/); + } elsif ($input =~ /\b([0-9a-f]{12,40})\b/i) { $orig_commit = lc($1); } - $short = 0 if ($line =~ /\bcommit\s+[0-9a-f]{12,40}/i); - $long = 1 if ($line =~ /\bcommit\s+[0-9a-f]{41,}/i); - $space = 0 if ($line =~ /\bcommit [0-9a-f]/i); - $case = 0 if ($line =~ /\b[Cc]ommit\s+[0-9a-f]{5,40}[^A-F]/); - if ($line =~ /\bcommit\s+[0-9a-f]{5,}\s+\("([^"]+)"\)/i) { - $orig_desc = $1; - $hasparens = 1; - } elsif ($line =~ /\bcommit\s+[0-9a-f]{5,}\s*$/i && - defined $rawlines[$linenr] && - $rawlines[$linenr] =~ /^\s*\("([^"]+)"\)/) { - $orig_desc = $1; - $hasparens = 1; - } elsif ($line =~ /\bcommit\s+[0-9a-f]{5,}\s+\("[^"]+$/i && - defined $rawlines[$linenr] && - $rawlines[$linenr] =~ /^\s*[^"]+"\)/) { - $line =~ /\bcommit\s+[0-9a-f]{5,}\s+\("([^"]+)$/i; - $orig_desc = $1; - $rawlines[$linenr] =~ /^\s*([^"]+)"\)/; - $orig_desc .= " " . $1; - $hasparens = 1; - } - ($id, $description) = git_commit_info($orig_commit, $id, $orig_desc); if (defined($id) && - ($short || $long || $space || $case || ($orig_desc ne $description) || !$hasparens)) { + ($short || $long || $space || $case || ($orig_desc ne $description) || !$has_quotes) && + $last_git_commit_id_linenr != $linenr - 1) { ERROR("GIT_COMMIT_ID", - "Please use git commit description style 'commit <12+ chars of sha1> (\"\")' - ie: '${init_char}ommit $id (\"$description\")'\n" . $herecurr); + "Please use git commit description style 'commit <12+ chars of sha1> (\"<title line>\")' - ie: '${init_char}ommit $id (\"$description\")'\n" . $herectx); } + #don't report the next line if this line ends in commit and the sha1 hash is the next line + $last_git_commit_id_linenr = $linenr if ($line =~ /\bcommit\s*$/i); } # Check for added, moved or deleted files From b81f67c562dcaf7bbb6aa3a07e9dc6558f6432bd Mon Sep 17 00:00:00 2001 From: Andrew Morton <akpm@linux-foundation.org> Date: Tue, 24 Aug 2021 09:59:59 +1000 Subject: [PATCH 341/365] checkpatch-improve-git_commit_id-test-fix add missing && Cc: Denis Efremov <efremov@linux.com> Cc: Dwaipayan Ray <dwaipayanray1@gmail.com> Cc: Joe Perches <joe@perches.com> Cc: Lukas Bulwahn <lukas.bulwahn@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au> --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 7c33ffafa3f0c..c27d2312cfc30 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3181,7 +3181,7 @@ sub process { # possible SHA-1 matches. # A commit match can span multiple lines so this block attempts to find a # complete typical commit on a maximum of 3 lines - if ($perl_version_ok + if ($perl_version_ok && $in_commit_log && !$commit_log_possible_stack_dump && $line !~ /^\s*(?:Link|Patchwork|http|https|BugLink|base-commit):/i && $line !~ /^This reverts commit [0-9a-f]{7,40}/ && From 5d9d50f0a6031094a720c309afa80bf9294b27b9 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Tue, 24 Aug 2021 10:00:00 +1000 Subject: [PATCH 342/365] fs/epoll: use a per-cpu counter for user's watches count This counter tracks the number of watches a user has, to compare against the 'max_user_watches' limit. This causes a scalability bottleneck on SPECjbb2015 on large systems as there is only one user. Changing to a per-cpu counter increases throughput of the benchmark by about 30% on a 16-socket, > 1000 thread system. Link: https://lkml.kernel.org/r/20210802032013.2751916-1-npiggin@gmail.com Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Reported-by: Anton Blanchard <anton@ozlabs.org> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au> --- fs/eventpoll.c | 18 ++++++++++-------- include/linux/sched/user.h | 3 ++- kernel/user.c | 9 +++++++++ 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 1e596e1d0bba8..648ed77f41644 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -723,7 +723,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) */ call_rcu(&epi->rcu, epi_rcu_free); - atomic_long_dec(&ep->user->epoll_watches); + percpu_counter_dec(&ep->user->epoll_watches); return 0; } @@ -1439,7 +1439,6 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, { int error, pwake = 0; __poll_t revents; - long user_watches; struct epitem *epi; struct ep_pqueue epq; struct eventpoll *tep = NULL; @@ -1449,11 +1448,15 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, lockdep_assert_irqs_enabled(); - user_watches = atomic_long_read(&ep->user->epoll_watches); - if (unlikely(user_watches >= max_user_watches)) + if (unlikely(percpu_counter_compare(&ep->user->epoll_watches, + max_user_watches) >= 0)) return -ENOSPC; - if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL))) + percpu_counter_inc(&ep->user->epoll_watches); + + if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL))) { + percpu_counter_dec(&ep->user->epoll_watches); return -ENOMEM; + } /* Item initialization follow here ... */ INIT_LIST_HEAD(&epi->rdllink); @@ -1466,17 +1469,16 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, mutex_lock_nested(&tep->mtx, 1); /* Add the current item to the list of active epoll hook for this file */ if (unlikely(attach_epitem(tfile, epi) < 0)) { - kmem_cache_free(epi_cache, epi); if (tep) mutex_unlock(&tep->mtx); + kmem_cache_free(epi_cache, epi); + percpu_counter_dec(&ep->user->epoll_watches); return -ENOMEM; } if (full_check && !tep) list_file(tfile); - atomic_long_inc(&ep->user->epoll_watches); - /* * Add the current item to the RB tree. All RB tree operations are * protected by "mtx", and ep_insert() is called with "mtx" held. diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h index 2462f7d07695b..00ed419dd4641 100644 --- a/include/linux/sched/user.h +++ b/include/linux/sched/user.h @@ -4,6 +4,7 @@ #include <linux/uidgid.h> #include <linux/atomic.h> +#include <linux/percpu_counter.h> #include <linux/refcount.h> #include <linux/ratelimit.h> @@ -13,7 +14,7 @@ struct user_struct { refcount_t __count; /* reference count */ #ifdef CONFIG_EPOLL - atomic_long_t epoll_watches; /* The number of file descriptors currently watched */ + struct percpu_counter epoll_watches; /* The number of file descriptors currently watched */ #endif unsigned long unix_inflight; /* How many files in flight in unix sockets */ atomic_long_t pipe_bufs; /* how many pages are allocated in pipe buffers */ diff --git a/kernel/user.c b/kernel/user.c index c82399c1618a6..a2673f940506e 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -138,6 +138,7 @@ static void free_user(struct user_struct *up, unsigned long flags) { uid_hash_remove(up); spin_unlock_irqrestore(&uidhash_lock, flags); + percpu_counter_destroy(&up->epoll_watches); kmem_cache_free(uid_cachep, up); } @@ -185,6 +186,10 @@ struct user_struct *alloc_uid(kuid_t uid) new->uid = uid; refcount_set(&new->__count, 1); + if (percpu_counter_init(&new->epoll_watches, 0, GFP_KERNEL)) { + kmem_cache_free(uid_cachep, new); + return NULL; + } ratelimit_state_init(&new->ratelimit, HZ, 100); ratelimit_set_flags(&new->ratelimit, RATELIMIT_MSG_ON_RELEASE); @@ -195,6 +200,7 @@ struct user_struct *alloc_uid(kuid_t uid) spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); if (up) { + percpu_counter_destroy(&new->epoll_watches); kmem_cache_free(uid_cachep, new); } else { uid_hash_insert(new, hashent); @@ -216,6 +222,9 @@ static int __init uid_cache_init(void) for(n = 0; n < UIDHASH_SZ; ++n) INIT_HLIST_HEAD(uidhash_table + n); + if (percpu_counter_init(&root_user.epoll_watches, 0, GFP_KERNEL)) + panic("percpu cpunter alloc failed"); + /* Insert the root user immediately (init already runs as root) */ spin_lock_irq(&uidhash_lock); uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID)); From c10e07398ef04d754a957430322b17f1acdbcddd Mon Sep 17 00:00:00 2001 From: Randy Dunlap <rdunlap@infradead.org> Date: Tue, 24 Aug 2021 10:00:00 +1000 Subject: [PATCH 343/365] fs-epoll-use-a-per-cpu-counter-for-users-watches-count-fix fix build errors in kernel/user.c when CONFIG_EPOLL=n Also fix typo: "cpunter" -"counter" in a panic message. [npiggin@gmail.com: move ifdefs into wrapper functions, slightly improve panic message] Link: https://lkml.kernel.org/r/1628051945.fens3r99ox.astroid@bobo.none Fixes: e75b89477811 ("fs/epoll: use a per-cpu counter for user's watches count") Signed-off-by: Randy Dunlap <rdunlap@infradead.org> Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Cc: Mark Brown <broonie@kernel.org> Cc: Guenter Roeck <linux@roeck-us.net> Cc: Stephen Rothwell <sfr@canb.auug.org.au> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au> --- kernel/user.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/kernel/user.c b/kernel/user.c index a2673f940506e..57d53568cb33c 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -129,6 +129,21 @@ static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent) return NULL; } +static int user_epoll_alloc(struct user_struct *up) +{ +#ifdef CONFIG_EPOLL + return percpu_counter_init(&up->epoll_watches, 0, GFP_KERNEL); +#endif + return 0; +} + +static void user_epoll_free(struct user_struct *up) +{ +#ifdef CONFIG_EPOLL + percpu_counter_destroy(&up->epoll_watches); +#endif +} + /* IRQs are disabled and uidhash_lock is held upon function entry. * IRQ state (as stored in flags) is restored and uidhash_lock released * upon function exit. @@ -138,7 +153,7 @@ static void free_user(struct user_struct *up, unsigned long flags) { uid_hash_remove(up); spin_unlock_irqrestore(&uidhash_lock, flags); - percpu_counter_destroy(&up->epoll_watches); + user_epoll_free(up); kmem_cache_free(uid_cachep, up); } @@ -186,7 +201,7 @@ struct user_struct *alloc_uid(kuid_t uid) new->uid = uid; refcount_set(&new->__count, 1); - if (percpu_counter_init(&new->epoll_watches, 0, GFP_KERNEL)) { + if (user_epoll_alloc(new)) { kmem_cache_free(uid_cachep, new); return NULL; } @@ -200,7 +215,7 @@ struct user_struct *alloc_uid(kuid_t uid) spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); if (up) { - percpu_counter_destroy(&new->epoll_watches); + user_epoll_free(new); kmem_cache_free(uid_cachep, new); } else { uid_hash_insert(new, hashent); @@ -222,8 +237,8 @@ static int __init uid_cache_init(void) for(n = 0; n < UIDHASH_SZ; ++n) INIT_HLIST_HEAD(uidhash_table + n); - if (percpu_counter_init(&root_user.epoll_watches, 0, GFP_KERNEL)) - panic("percpu cpunter alloc failed"); + if (user_epoll_alloc(&root_user)) + panic("root_user epoll percpu counter alloc failed"); /* Insert the root user immediately (init already runs as root) */ spin_lock_irq(&uidhash_lock); From 4fbc3ac1e771e1388ef947579012d3ec3bcafb27 Mon Sep 17 00:00:00 2001 From: Andrew Morton <akpm@linux-foundation.org> Date: Tue, 24 Aug 2021 10:00:00 +1000 Subject: [PATCH 344/365] fs-epoll-use-a-per-cpu-counter-for-users-watches-count-fix-fix tweak user_epoll_alloc(), per Guenter Link: https://lkml.kernel.org/r/20210804191421.GA1900577@roeck-us.net Cc: Guenter Roeck <linux@roeck-us.net> Cc: Mark Brown <broonie@kernel.org> Cc: Nicholas Piggin <npiggin@gmail.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Stephen Rothwell <sfr@canb.auug.org.au> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au> --- kernel/user.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/user.c b/kernel/user.c index 57d53568cb33c..e2cf8c22b539a 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -133,8 +133,9 @@ static int user_epoll_alloc(struct user_struct *up) { #ifdef CONFIG_EPOLL return percpu_counter_init(&up->epoll_watches, 0, GFP_KERNEL); -#endif +#else return 0; +#endif } static void user_epoll_free(struct user_struct *up) From 069062768f8458dbec1d0bdb5bd0acc35a2a9c91 Mon Sep 17 00:00:00 2001 From: yangerkun <yangerkun@huawei.com> Date: Tue, 24 Aug 2021 10:00:00 +1000 Subject: [PATCH 345/365] ramfs: fix mount source show for ramfs ramfs_parse_param does not parse the key "source", and will convert -ENOPARAM to 0. This will skip calling vfs_parse_fs_param_source() in vfs_parse_fs_param(), which leads to always presenting "none" as the mount source for ramfs. Fix it by parsing "source" in ramfs_parse_param(). Link: https://lkml.kernel.org/r/20210811122811.2288041-1-yangerkun@huawei.com Signed-off-by: yangerkun <yangerkun@huawei.com> Cc: Jan Kara <jack@suse.cz> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Yu Kuai <yukuai3@huawei.com> Cc: ErKun Yang <yangerkun@huawei.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au> --- fs/ramfs/inode.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 65e7e56005b8f..0d7f5f655fd8b 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -202,6 +202,10 @@ static int ramfs_parse_param(struct fs_context *fc, struct fs_parameter *param) struct ramfs_fs_info *fsi = fc->s_fs_info; int opt; + opt = vfs_parse_fs_param_source(fc, param); + if (opt != -ENOPARAM) + return opt; + opt = fs_parse(fc, ramfs_fs_parameters, param, &result); if (opt < 0) { /* From ef2f1a4d5c71fa44aed7de74e6f5cd6439483cf7 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes <linux@rasmusvillemoes.dk> Date: Tue, 24 Aug 2021 10:00:00 +1000 Subject: [PATCH 346/365] init: move usermodehelper_enable() to populate_rootfs() Currently, usermodehelper is enabled right before PID1 starts going through the initcalls. However, any call of a usermodehelper from a pure_, core_, postcore_, arch_, subsys_ or fs_ initcall is futile, as there is no filesystem contents yet. Up until commit e7cb072eb988 ("init/initramfs.c: do unpacking asynchronously"), such calls, whether via some request_module(), a legacy uevent "/sbin/hotplug" notification or something else, would just fail silently with (presumably) -ENOENT from kernel_execve(). However, that commit introduced the wait_for_initramfs() synchronization hook which must be called from the usermodehelper exec path right before the kernel_execve, in order that request_module() et al done from *after* rootfs_initcall() time (i.e. device_ and late_ initcalls) would continue to find a populated initramfs as they used to. Any call of wait_for_initramfs() done before the unpacking has been scheduled (i.e. before rootfs_initcall time) must just return immediately [and let the caller find an empty file system] in order not to deadlock the machine. I mistakenly thought, and my limited testing confirmed, that there were no such calls, so I added a pr_warn_once() in wait_for_initramfs(). It turns out that one can indeed hit request_module() as well as kobject_uevent_env() during those early init calls, leading to a user-visible warning in the kernel log emitted consistently for certain configurations. We could just remove the pr_warn_once(), but I think it's better to postpone enabling the usermodehelper framework until there is at least some chance of finding the executable. That is also a little more efficient in that a lot of work done in umh.c will be elided. However, it does change the error seen by those early callers from -ENOENT to -EBUSY, so there is a risk of a regression if any caller care about the exact error value. Link: https://lkml.kernel.org/r/20210728134638.329060-1-linux@rasmusvillemoes.dk Fixes: e7cb072eb988 ("init/initramfs.c: do unpacking asynchronously") Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk> Reported-by: Alexander Egorenkov <egorenar@linux.ibm.com> Reported-by: Bruno Goncalves <bgoncalv@redhat.com> Reported-by: Heiner Kallweit <hkallweit1@gmail.com> Cc: Luis Chamberlain <mcgrof@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au> --- init/initramfs.c | 2 ++ init/main.c | 1 - init/noinitramfs.c | 2 ++ 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/init/initramfs.c b/init/initramfs.c index af27abc596436..a842c05447456 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -15,6 +15,7 @@ #include <linux/mm.h> #include <linux/namei.h> #include <linux/init_syscalls.h> +#include <linux/umh.h> static ssize_t __init xwrite(struct file *file, const char *p, size_t count, loff_t *pos) @@ -727,6 +728,7 @@ static int __init populate_rootfs(void) { initramfs_cookie = async_schedule_domain(do_populate_rootfs, NULL, &initramfs_domain); + usermodehelper_enable(); if (!initramfs_async) wait_for_initramfs(); return 0; diff --git a/init/main.c b/init/main.c index 8d97aba78c3ad..90733a916791f 100644 --- a/init/main.c +++ b/init/main.c @@ -1392,7 +1392,6 @@ static void __init do_basic_setup(void) driver_init(); init_irq_proc(); do_ctors(); - usermodehelper_enable(); do_initcalls(); } diff --git a/init/noinitramfs.c b/init/noinitramfs.c index 3d62b07f3bb9c..d1d26b93d25cd 100644 --- a/init/noinitramfs.c +++ b/init/noinitramfs.c @@ -10,6 +10,7 @@ #include <linux/kdev_t.h> #include <linux/syscalls.h> #include <linux/init_syscalls.h> +#include <linux/umh.h> /* * Create a simple rootfs that is similar to the default initramfs @@ -18,6 +19,7 @@ static int __init default_rootfs(void) { int err; + usermodehelper_enable(); err = init_mkdir("/dev", 0755); if (err < 0) goto out; From 82d95a07f64fa9530e3f4ad492bae24dc96c8b4f Mon Sep 17 00:00:00 2001 From: Kefeng Wang <wangkefeng.wang@huawei.com> Date: Tue, 24 Aug 2021 10:00:01 +1000 Subject: [PATCH 347/365] trap: cleanup trap_init() There are some empty trap_init() definitions in different ARCHs, Introduce a new weak trap_init() function to clean them up. Link: https://lkml.kernel.org/r/20210812123602.76356-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com> Acked-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk> [arm32] Acked-by: Vineet Gupta <vgupt@kernel.org> [arc] Acked-by: Michael Ellerman <mpe@ellerman.id.au> [powerpc] Cc: Yoshinori Sato <ysato@users.sourceforge.jp> Cc: Ley Foon Tan <ley.foon.tan@intel.com> Cc: Jonas Bonn <jonas@southpole.se> Cc: Stefan Kristiansson <stefan.kristiansson@saunalahti.fi> Cc: Stafford Horne <shorne@gmail.com> Cc: James E.J. Bottomley <James.Bottomley@HansenPartnership.com> Cc: Helge Deller <deller@gmx.de> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Paul Mackerras <paulus@samba.org> Cc: Paul Walmsley <palmerdabbelt@google.com> Cc: Jeff Dike <jdike@addtoit.com> Cc: Richard Weinberger <richard@nod.at> Cc: Anton Ivanov <anton.ivanov@cambridgegreys.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au> --- arch/arc/kernel/traps.c | 5 ----- arch/arm/kernel/traps.c | 5 ----- arch/h8300/kernel/traps.c | 4 ---- arch/hexagon/kernel/traps.c | 4 ---- arch/nds32/kernel/traps.c | 5 ----- arch/nios2/kernel/traps.c | 5 ----- arch/openrisc/kernel/traps.c | 5 ----- arch/parisc/kernel/traps.c | 4 ---- arch/powerpc/kernel/traps.c | 5 ----- arch/riscv/kernel/traps.c | 5 ----- arch/um/kernel/trap.c | 4 ---- init/main.c | 2 ++ 12 files changed, 2 insertions(+), 51 deletions(-) diff --git a/arch/arc/kernel/traps.c b/arch/arc/kernel/traps.c index 57235e5c0cea9..6b83e3f2b41c3 100644 --- a/arch/arc/kernel/traps.c +++ b/arch/arc/kernel/traps.c @@ -20,11 +20,6 @@ #include <asm/unaligned.h> #include <asm/kprobes.h> -void __init trap_init(void) -{ - return; -} - void die(const char *str, struct pt_regs *regs, unsigned long address) { show_kernel_fault_diag(str, regs, address); diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 64308e3a5d0c4..e9b4f2b49bd8c 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -781,11 +781,6 @@ void abort(void) panic("Oops failed to kill thread"); } -void __init trap_init(void) -{ - return; -} - #ifdef CONFIG_KUSER_HELPERS static void __init kuser_init(void *vectors) { diff --git a/arch/h8300/kernel/traps.c b/arch/h8300/kernel/traps.c index 5d8b969cd8f34..bdbe988d8dbcf 100644 --- a/arch/h8300/kernel/traps.c +++ b/arch/h8300/kernel/traps.c @@ -39,10 +39,6 @@ void __init base_trap_init(void) { } -void __init trap_init(void) -{ -} - asmlinkage void set_esp0(unsigned long ssp) { current->thread.esp0 = ssp; diff --git a/arch/hexagon/kernel/traps.c b/arch/hexagon/kernel/traps.c index 904134b37232f..edfc35dafeb19 100644 --- a/arch/hexagon/kernel/traps.c +++ b/arch/hexagon/kernel/traps.c @@ -28,10 +28,6 @@ #define TRAP_SYSCALL 1 #define TRAP_DEBUG 0xdb -void __init trap_init(void) -{ -} - #ifdef CONFIG_GENERIC_BUG /* Maybe should resemble arch/sh/kernel/traps.c ?? */ int is_valid_bugaddr(unsigned long addr) diff --git a/arch/nds32/kernel/traps.c b/arch/nds32/kernel/traps.c index ee0d9ae192a50..f06421c645aff 100644 --- a/arch/nds32/kernel/traps.c +++ b/arch/nds32/kernel/traps.c @@ -183,11 +183,6 @@ void __pgd_error(const char *file, int line, unsigned long val) } extern char *exception_vector, *exception_vector_end; -void __init trap_init(void) -{ - return; -} - void __init early_trap_init(void) { unsigned long ivb = 0; diff --git a/arch/nios2/kernel/traps.c b/arch/nios2/kernel/traps.c index b172da4eb1a95..596986a74a26d 100644 --- a/arch/nios2/kernel/traps.c +++ b/arch/nios2/kernel/traps.c @@ -105,11 +105,6 @@ void show_stack(struct task_struct *task, unsigned long *stack, printk("%s\n", loglvl); } -void __init trap_init(void) -{ - /* Nothing to do here */ -} - /* Breakpoint handler */ asmlinkage void breakpoint_c(struct pt_regs *fp) { diff --git a/arch/openrisc/kernel/traps.c b/arch/openrisc/kernel/traps.c index 4d61333c26232..aa1e709405acd 100644 --- a/arch/openrisc/kernel/traps.c +++ b/arch/openrisc/kernel/traps.c @@ -231,11 +231,6 @@ void unhandled_exception(struct pt_regs *regs, int ea, int vector) die("Oops", regs, 9); } -void __init trap_init(void) -{ - /* Nothing needs to be done */ -} - asmlinkage void do_trap(struct pt_regs *regs, unsigned long address) { force_sig_fault(SIGTRAP, TRAP_BRKPT, (void __user *)regs->pc); diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c index 8d8441d4562aa..747c328fb8862 100644 --- a/arch/parisc/kernel/traps.c +++ b/arch/parisc/kernel/traps.c @@ -859,7 +859,3 @@ void __init early_trap_init(void) initialize_ivt(&fault_vector_20); } - -void __init trap_init(void) -{ -} diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index d56254f05e174..7819231242b2b 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -2215,11 +2215,6 @@ DEFINE_INTERRUPT_HANDLER(kernel_bad_stack) die("Bad kernel stack pointer", regs, SIGABRT); } -void __init trap_init(void) -{ -} - - #ifdef CONFIG_PPC_EMULATED_STATS #define WARN_EMULATED_SETUP(type) .type = { .name = #type } diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c index 0a98fd0ddfe90..0daaa3e4630d4 100644 --- a/arch/riscv/kernel/traps.c +++ b/arch/riscv/kernel/traps.c @@ -199,11 +199,6 @@ int is_valid_bugaddr(unsigned long pc) } #endif /* CONFIG_GENERIC_BUG */ -/* stvec & scratch is already set from head.S */ -void __init trap_init(void) -{ -} - #ifdef CONFIG_VMAP_STACK static DEFINE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack)__aligned(16); diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index ad12f78bda7e4..3198c47673879 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -311,7 +311,3 @@ void winch(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) { do_IRQ(WINCH_IRQ, regs); } - -void trap_init(void) -{ -} diff --git a/init/main.c b/init/main.c index 90733a916791f..4a52392abd029 100644 --- a/init/main.c +++ b/init/main.c @@ -777,6 +777,8 @@ void __init __weak poking_init(void) { } void __init __weak pgtable_cache_init(void) { } +void __init __weak trap_init(void) { } + bool initcall_debug; core_param(initcall_debug, initcall_debug, bool, 0644); From 4cd8e7febb4d3e5d51e3630cb296c415edca9b3b Mon Sep 17 00:00:00 2001 From: Andrew Halaney <ahalaney@redhat.com> Date: Tue, 24 Aug 2021 10:00:01 +1000 Subject: [PATCH 348/365] init/main.c: silence some -Wunused-parameter warnings There are a bunch of callbacks with unused arguments, go ahead and silence those so "make KCFLAGS=-W init/main.o" is a little quieter. Here's a little sample: init/main.c:182:43: warning: unused parameter 'str' [-Wunused-parameter] static int __init set_reset_devices(char *str) Link: https://lkml.kernel.org/r/20210519162341.1275452-1-ahalaney@redhat.com Signed-off-by: Andrew Halaney <ahalaney@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au> --- init/main.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/init/main.c b/init/main.c index 4a52392abd029..daa4ad6da40b6 100644 --- a/init/main.c +++ b/init/main.c @@ -181,7 +181,7 @@ EXPORT_SYMBOL_GPL(static_key_initialized); unsigned int reset_devices; EXPORT_SYMBOL(reset_devices); -static int __init set_reset_devices(char *str) +static int __init set_reset_devices(char *str __always_unused) { reset_devices = 1; return 1; @@ -231,13 +231,13 @@ static bool __init obsolete_checksetup(char *line) unsigned long loops_per_jiffy = (1<<12); EXPORT_SYMBOL(loops_per_jiffy); -static int __init debug_kernel(char *str) +static int __init debug_kernel(char *str __always_unused) { console_loglevel = CONSOLE_LOGLEVEL_DEBUG; return 0; } -static int __init quiet_kernel(char *str) +static int __init quiet_kernel(char *str __always_unused) { console_loglevel = CONSOLE_LOGLEVEL_QUIET; return 0; @@ -476,7 +476,7 @@ static void __init setup_boot_config(void) get_boot_config_from_initrd(NULL, NULL); } -static int __init warn_bootconfig(char *str) +static int __init warn_bootconfig(char *str __always_unused) { pr_warn("WARNING: 'bootconfig' found on the kernel command line but CONFIG_BOOT_CONFIG is not set.\n"); return 0; @@ -501,7 +501,8 @@ static void __init repair_env_string(char *param, char *val) /* Anything after -- gets handed straight to init. */ static int __init set_init_arg(char *param, char *val, - const char *unused, void *arg) + const char *unused __always_unused, + void *arg __always_unused) { unsigned int i; @@ -526,7 +527,8 @@ static int __init set_init_arg(char *param, char *val, * unused parameters (modprobe will find them in /proc/cmdline). */ static int __init unknown_bootoption(char *param, char *val, - const char *unused, void *arg) + const char *unused __always_unused, + void *arg __always_unused) { size_t len = strlen(param); @@ -721,7 +723,8 @@ noinline void __ref rest_init(void) /* Check for early params. */ static int __init do_early_param(char *param, char *val, - const char *unused, void *arg) + const char *unused __always_unused, + void *arg __always_unused) { const struct obs_kernel_param *p; @@ -1341,8 +1344,10 @@ static const char *initcall_level_names[] __initdata = { "late", }; -static int __init ignore_unknown_bootoption(char *param, char *val, - const char *unused, void *arg) +static int __init ignore_unknown_bootoption(char *param __always_unused, + char *val __always_unused, + const char *unused __always_unused, + void *arg __always_unused) { return 0; } @@ -1479,7 +1484,7 @@ void __weak free_initmem(void) free_initmem_default(POISON_FREE_INITMEM); } -static int __ref kernel_init(void *unused) +static int __ref kernel_init(void *unused __always_unused) { int ret; From 09215c5b28591a3ed8823e899e081b164b7fc1fc Mon Sep 17 00:00:00 2001 From: Nanyong Sun <sunnanyong@huawei.com> Date: Tue, 24 Aug 2021 10:00:01 +1000 Subject: [PATCH 349/365] nilfs2: fix memory leak in nilfs_sysfs_create_device_group Patch series "nilfs2: fix incorrect usage of kobject". This patchset from Nanyong Sun fixes memory leak issues and a NULL pointer dereference issue caused by incorrect usage of kboject in nilfs2 sysfs implementation. This patch (of 6): Reported by syzkaller: BUG: memory leak unreferenced object 0xffff888100ca8988 (size 8): comm "syz-executor.1", pid 1930, jiffies 4294745569 (age 18.052s) hex dump (first 8 bytes): 6c 6f 6f 70 31 00 ff ff loop1... backtrace: [<000000009d9e0ac4>] slab_alloc_node mm/slub.c:2972 [inline] [<000000009d9e0ac4>] slab_alloc mm/slub.c:2980 [inline] [<000000009d9e0ac4>] __kmalloc_track_caller+0x164/0x330 mm/slub.c:4644 [<00000000b1825477>] kstrdup+0x36/0x70 mm/util.c:60 [<00000000fa081499>] kstrdup_const+0x35/0x60 mm/util.c:83 [<0000000024d13570>] kvasprintf_const+0xf1/0x180 lib/kasprintf.c:48 [<0000000024b69715>] kobject_set_name_vargs+0x56/0x150 lib/kobject.c:289 [<000000003fedac3d>] kobject_add_varg lib/kobject.c:384 [inline] [<000000003fedac3d>] kobject_init_and_add+0xc9/0x150 lib/kobject.c:473 [<000000002795bd99>] nilfs_sysfs_create_device_group+0x150/0x7d0 fs/nilfs2/sysfs.c:986 [<00000000567fa12d>] init_nilfs+0xa21/0xea0 fs/nilfs2/the_nilfs.c:637 [<00000000082e7458>] nilfs_fill_super fs/nilfs2/super.c:1046 [inline] [<00000000082e7458>] nilfs_mount+0x7b4/0xe80 fs/nilfs2/super.c:1316 [<00000000adc3fd88>] legacy_get_tree+0x105/0x210 fs/fs_context.c:592 [<00000000a98c45b8>] vfs_get_tree+0x8e/0x2d0 fs/super.c:1498 [<00000000e96282d3>] do_new_mount fs/namespace.c:2905 [inline] [<00000000e96282d3>] path_mount+0xf9b/0x1990 fs/namespace.c:3235 [<000000003d2eb1b0>] do_mount+0xea/0x100 fs/namespace.c:3248 [<00000000e1ce771a>] __do_sys_mount fs/namespace.c:3456 [inline] [<00000000e1ce771a>] __se_sys_mount fs/namespace.c:3433 [inline] [<00000000e1ce771a>] __x64_sys_mount+0x14b/0x1f0 fs/namespace.c:3433 [<000000007c7f81e8>] do_syscall_x64 arch/x86/entry/common.c:50 [inline] [<000000007c7f81e8>] do_syscall_64+0x3b/0x90 arch/x86/entry/common.c:80 [<00000000fd23ff06>] entry_SYSCALL_64_after_hwframe+0x44/0xae If kobject_init_and_add return with error, then the cleanup of kobject is needed because memory may be allocated in kobject_init_and_add without freeing. And the place of cleanup_dev_kobject should use kobject_put to free the memory associated with the kobject. As the section "Kobject removal" of "Documentation/core-api/kobject.rst" says, kobject_del() just makes the kobject "invisible", but it is not cleaned up. And no more cleanup will do after cleanup_dev_kobject, so kobject_put is needed here. Link: https://lkml.kernel.org/r/1625651306-10829-1-git-send-email-konishi.ryusuke@gmail.com Link: https://lkml.kernel.org/r/1625651306-10829-2-git-send-email-konishi.ryusuke@gmail.com Reported-by: Hulk Robot <hulkci@huawei.com> Link: https://lkml.kernel.org/r/20210629022556.3985106-2-sunnanyong@huawei.com Signed-off-by: Nanyong Sun <sunnanyong@huawei.com> Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au> --- fs/nilfs2/sysfs.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c index 68e8d61e28dd5..d2d8ea89937ae 100644 --- a/fs/nilfs2/sysfs.c +++ b/fs/nilfs2/sysfs.c @@ -986,7 +986,7 @@ int nilfs_sysfs_create_device_group(struct super_block *sb) err = kobject_init_and_add(&nilfs->ns_dev_kobj, &nilfs_dev_ktype, NULL, "%s", sb->s_id); if (err) - goto free_dev_subgroups; + goto cleanup_dev_kobject; err = nilfs_sysfs_create_mounted_snapshots_group(nilfs); if (err) @@ -1023,9 +1023,7 @@ int nilfs_sysfs_create_device_group(struct super_block *sb) nilfs_sysfs_delete_mounted_snapshots_group(nilfs); cleanup_dev_kobject: - kobject_del(&nilfs->ns_dev_kobj); - -free_dev_subgroups: + kobject_put(&nilfs->ns_dev_kobj); kfree(nilfs->ns_dev_subgroups); failed_create_device_group: From 7a93b66e4b71634d250ef23dea7008c0428aeba6 Mon Sep 17 00:00:00 2001 From: Nanyong Sun <sunnanyong@huawei.com> Date: Tue, 24 Aug 2021 10:00:01 +1000 Subject: [PATCH 350/365] nilfs2: fix NULL pointer in nilfs_##name##_attr_release In nilfs_##name##_attr_release, kobj->parent should not be referenced because it is a NULL pointer. The release() method of kobject is always called in kobject_put(kobj), in the implementation of kobject_put(), the kobj->parent will be assigned as NULL before call the release() method. So just use kobj to get the subgroups, which is more efficient and can fix a NULL pointer reference problem. Link: https://lkml.kernel.org/r/20210629022556.3985106-3-sunnanyong@huawei.com Link: https://lkml.kernel.org/r/1625651306-10829-3-git-send-email-konishi.ryusuke@gmail.com Signed-off-by: Nanyong Sun <sunnanyong@huawei.com> Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au> --- fs/nilfs2/sysfs.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c index d2d8ea89937ae..ec85ac53720d2 100644 --- a/fs/nilfs2/sysfs.c +++ b/fs/nilfs2/sysfs.c @@ -51,11 +51,9 @@ static const struct sysfs_ops nilfs_##name##_attr_ops = { \ #define NILFS_DEV_INT_GROUP_TYPE(name, parent_name) \ static void nilfs_##name##_attr_release(struct kobject *kobj) \ { \ - struct nilfs_sysfs_##parent_name##_subgroups *subgroups; \ - struct the_nilfs *nilfs = container_of(kobj->parent, \ - struct the_nilfs, \ - ns_##parent_name##_kobj); \ - subgroups = nilfs->ns_##parent_name##_subgroups; \ + struct nilfs_sysfs_##parent_name##_subgroups *subgroups = container_of(kobj, \ + struct nilfs_sysfs_##parent_name##_subgroups, \ + sg_##name##_kobj); \ complete(&subgroups->sg_##name##_kobj_unregister); \ } \ static struct kobj_type nilfs_##name##_ktype = { \ From 2ef34726ec35b0b25aa4dc5804399cafbf7eba3a Mon Sep 17 00:00:00 2001 From: Nanyong Sun <sunnanyong@huawei.com> Date: Tue, 24 Aug 2021 10:00:01 +1000 Subject: [PATCH 351/365] nilfs2: fix memory leak in nilfs_sysfs_create_##name##_group If kobject_init_and_add return with error, kobject_put() is needed here to avoid memory leak, because kobject_init_and_add may return error without freeing the memory associated with the kobject it allocated. Link: https://lkml.kernel.org/r/20210629022556.3985106-4-sunnanyong@huawei.com Link: https://lkml.kernel.org/r/1625651306-10829-4-git-send-email-konishi.ryusuke@gmail.com Signed-off-by: Nanyong Sun <sunnanyong@huawei.com> Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au> --- fs/nilfs2/sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c index ec85ac53720d2..6305e4ef7e39b 100644 --- a/fs/nilfs2/sysfs.c +++ b/fs/nilfs2/sysfs.c @@ -79,8 +79,8 @@ static int nilfs_sysfs_create_##name##_group(struct the_nilfs *nilfs) \ err = kobject_init_and_add(kobj, &nilfs_##name##_ktype, parent, \ #name); \ if (err) \ - return err; \ - return 0; \ + kobject_put(kobj); \ + return err; \ } \ static void nilfs_sysfs_delete_##name##_group(struct the_nilfs *nilfs) \ { \ From 46accddc68b58c7e6509e63937105b02708681af Mon Sep 17 00:00:00 2001 From: Nanyong Sun <sunnanyong@huawei.com> Date: Tue, 24 Aug 2021 10:00:02 +1000 Subject: [PATCH 352/365] nilfs2: fix memory leak in nilfs_sysfs_delete_##name##_group The kobject_put() should be used to cleanup the memory associated with the kobject instead of kobject_del. See the section "Kobject removal" of "Documentation/core-api/kobject.rst". Link: https://lkml.kernel.org/r/20210629022556.3985106-5-sunnanyong@huawei.com Link: https://lkml.kernel.org/r/1625651306-10829-5-git-send-email-konishi.ryusuke@gmail.com Signed-off-by: Nanyong Sun <sunnanyong@huawei.com> Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au> --- fs/nilfs2/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c index 6305e4ef7e39b..d989e6500bd7a 100644 --- a/fs/nilfs2/sysfs.c +++ b/fs/nilfs2/sysfs.c @@ -84,7 +84,7 @@ static int nilfs_sysfs_create_##name##_group(struct the_nilfs *nilfs) \ } \ static void nilfs_sysfs_delete_##name##_group(struct the_nilfs *nilfs) \ { \ - kobject_del(&nilfs->ns_##parent_name##_subgroups->sg_##name##_kobj); \ + kobject_put(&nilfs->ns_##parent_name##_subgroups->sg_##name##_kobj); \ } /************************************************************************ From f31bdb75f5b877c8c67f8df0688867807eb3a23b Mon Sep 17 00:00:00 2001 From: Nanyong Sun <sunnanyong@huawei.com> Date: Tue, 24 Aug 2021 10:00:02 +1000 Subject: [PATCH 353/365] nilfs2: fix memory leak in nilfs_sysfs_create_snapshot_group If kobject_init_and_add returns with error, kobject_put() is needed here to avoid memory leak, because kobject_init_and_add may return error without freeing the memory associated with the kobject it allocated. Link: https://lkml.kernel.org/r/20210629022556.3985106-6-sunnanyong@huawei.com Link: https://lkml.kernel.org/r/1625651306-10829-6-git-send-email-konishi.ryusuke@gmail.com Signed-off-by: Nanyong Sun <sunnanyong@huawei.com> Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au> --- fs/nilfs2/sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c index d989e6500bd7a..5ba87573ad3bc 100644 --- a/fs/nilfs2/sysfs.c +++ b/fs/nilfs2/sysfs.c @@ -195,9 +195,9 @@ int nilfs_sysfs_create_snapshot_group(struct nilfs_root *root) } if (err) - return err; + kobject_put(&root->snapshot_kobj); - return 0; + return err; } void nilfs_sysfs_delete_snapshot_group(struct nilfs_root *root) From a065e7e695790c9e75ef6027962bfc93d64c57dc Mon Sep 17 00:00:00 2001 From: Nanyong Sun <sunnanyong@huawei.com> Date: Tue, 24 Aug 2021 10:00:02 +1000 Subject: [PATCH 354/365] nilfs2: fix memory leak in nilfs_sysfs_delete_snapshot_group kobject_put() should be used to cleanup the memory associated with the kobject instead of kobject_del(). See the section "Kobject removal" of "Documentation/core-api/kobject.rst". Link: https://lkml.kernel.org/r/20210629022556.3985106-7-sunnanyong@huawei.com Link: https://lkml.kernel.org/r/1625651306-10829-7-git-send-email-konishi.ryusuke@gmail.com Signed-off-by: Nanyong Sun <sunnanyong@huawei.com> Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au> --- fs/nilfs2/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c index 5ba87573ad3bc..62f8a7ac19c85 100644 --- a/fs/nilfs2/sysfs.c +++ b/fs/nilfs2/sysfs.c @@ -202,7 +202,7 @@ int nilfs_sysfs_create_snapshot_group(struct nilfs_root *root) void nilfs_sysfs_delete_snapshot_group(struct nilfs_root *root) { - kobject_del(&root->snapshot_kobj); + kobject_put(&root->snapshot_kobj); } /************************************************************************ From 3e79eb1b76aac59777043c79533ba88d20667d4b Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" <gustavoars@kernel.org> Date: Tue, 24 Aug 2021 10:00:02 +1000 Subject: [PATCH 355/365] hfsplus: fix out-of-bounds warnings in __hfsplus_setxattr Fix the following out-of-bounds warnings by enclosing structure members file and finder into new struct info: fs/hfsplus/xattr.c:300:5: warning: 'memcpy' offset [65, 80] from the object at 'entry' is out of the bounds of referenced subobject 'user_info' with type 'struct DInfo' at offset 48 [-Warray-bounds] fs/hfsplus/xattr.c:313:5: warning: 'memcpy' offset [65, 80] from the object at 'entry' is out of the bounds of referenced subobject 'user_info' with type 'struct FInfo' at offset 48 [-Warray-bounds] Refactor the code by making it more "structured." Also, this helps with the ongoing efforts to enable -Warray-bounds and makes the code clearer and avoid confusing the compiler. Matthew said: : The offending line is this: : : - memcpy(&entry.file.user_info, value, : + memcpy(&entry.file.info, value, : file_finderinfo_len); : : what it's trying to do is copy two structs which are adjacent to each : other in a single call to memcpy(). gcc legitimately complains that : the memcpy to this struct overruns the bounds of the struct. What : Gustavo has done here is introduce a new struct that contains the two : structs, and now gcc is happy that the memcpy doesn't overrun the : length of this containing struct. Link: https://github.com/KSPP/linux/issues/109 Link: https://lkml.kernel.org/r/20210330145226.GA207011@embeddedor Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org> Reported-by: kernel test robot <lkp@intel.com> Cc: Matthew Wilcox <willy@infradead.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au> --- fs/hfsplus/catalog.c | 16 ++++++++-------- fs/hfsplus/dir.c | 4 ++-- fs/hfsplus/hfsplus_raw.h | 12 ++++++++---- fs/hfsplus/xattr.c | 18 ++++++++---------- 4 files changed, 26 insertions(+), 24 deletions(-) diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c index 35472cba750e1..9cdc6550b468e 100644 --- a/fs/hfsplus/catalog.c +++ b/fs/hfsplus/catalog.c @@ -124,7 +124,7 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, hfsplus_cat_set_perms(inode, &folder->permissions); if (inode == sbi->hidden_dir) /* invisible and namelocked */ - folder->user_info.frFlags = cpu_to_be16(0x5000); + folder->info.user.frFlags = cpu_to_be16(0x5000); return sizeof(*folder); } else { struct hfsplus_cat_file *file; @@ -142,14 +142,14 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, if (cnid == inode->i_ino) { hfsplus_cat_set_perms(inode, &file->permissions); if (S_ISLNK(inode->i_mode)) { - file->user_info.fdType = + file->info.user.fdType = cpu_to_be32(HFSP_SYMLINK_TYPE); - file->user_info.fdCreator = + file->info.user.fdCreator = cpu_to_be32(HFSP_SYMLINK_CREATOR); } else { - file->user_info.fdType = + file->info.user.fdType = cpu_to_be32(sbi->type); - file->user_info.fdCreator = + file->info.user.fdCreator = cpu_to_be32(sbi->creator); } if (HFSPLUS_FLG_IMMUTABLE & @@ -158,11 +158,11 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED); } else { - file->user_info.fdType = + file->info.user.fdType = cpu_to_be32(HFSP_HARDLINK_TYPE); - file->user_info.fdCreator = + file->info.user.fdCreator = cpu_to_be32(HFSP_HFSPLUS_CREATOR); - file->user_info.fdFlags = + file->info.user.fdFlags = cpu_to_be16(0x100); file->create_date = HFSPLUS_I(sbi->hidden_dir)->create_date; diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 84714bbccc123..135279a19b559 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -73,9 +73,9 @@ static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry, goto fail; } cnid = be32_to_cpu(entry.file.id); - if (entry.file.user_info.fdType == + if (entry.file.info.user.fdType == cpu_to_be32(HFSP_HARDLINK_TYPE) && - entry.file.user_info.fdCreator == + entry.file.info.user.fdCreator == cpu_to_be32(HFSP_HFSPLUS_CREATOR) && HFSPLUS_SB(sb)->hidden_dir && (entry.file.create_date == diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h index 456e87aec7fd7..005a043bc7eed 100644 --- a/fs/hfsplus/hfsplus_raw.h +++ b/fs/hfsplus/hfsplus_raw.h @@ -260,8 +260,10 @@ struct hfsplus_cat_folder { __be32 access_date; __be32 backup_date; struct hfsplus_perm permissions; - struct DInfo user_info; - struct DXInfo finder_info; + struct { + struct DInfo user; + struct DXInfo finder; + } info; __be32 text_encoding; __be32 subfolders; /* Subfolder count in HFSX. Reserved in HFS+. */ } __packed; @@ -294,8 +296,10 @@ struct hfsplus_cat_file { __be32 access_date; __be32 backup_date; struct hfsplus_perm permissions; - struct FInfo user_info; - struct FXInfo finder_info; + struct { + struct FInfo user; + struct FXInfo finder; + } info; __be32 text_encoding; u32 reserved2; diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index e2855ceefd394..b21811baab0f2 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -261,10 +261,8 @@ int __hfsplus_setxattr(struct inode *inode, const char *name, struct hfs_find_data cat_fd; hfsplus_cat_entry entry; u16 cat_entry_flags, cat_entry_type; - u16 folder_finderinfo_len = sizeof(struct DInfo) + - sizeof(struct DXInfo); - u16 file_finderinfo_len = sizeof(struct FInfo) + - sizeof(struct FXInfo); + u16 folder_finderinfo_len = sizeof(entry.folder.info); + u16 file_finderinfo_len = sizeof(entry.file.info); if ((!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) || @@ -296,7 +294,7 @@ int __hfsplus_setxattr(struct inode *inode, const char *name, sizeof(hfsplus_cat_entry)); if (be16_to_cpu(entry.type) == HFSPLUS_FOLDER) { if (size == folder_finderinfo_len) { - memcpy(&entry.folder.user_info, value, + memcpy(&entry.folder.info, value, folder_finderinfo_len); hfs_bnode_write(cat_fd.bnode, &entry, cat_fd.entryoffset, @@ -309,7 +307,7 @@ int __hfsplus_setxattr(struct inode *inode, const char *name, } } else if (be16_to_cpu(entry.type) == HFSPLUS_FILE) { if (size == file_finderinfo_len) { - memcpy(&entry.file.user_info, value, + memcpy(&entry.file.info, value, file_finderinfo_len); hfs_bnode_write(cat_fd.bnode, &entry, cat_fd.entryoffset, @@ -462,14 +460,14 @@ static ssize_t hfsplus_getxattr_finder_info(struct inode *inode, if (entry_type == HFSPLUS_FOLDER) { hfs_bnode_read(fd.bnode, folder_finder_info, fd.entryoffset + - offsetof(struct hfsplus_cat_folder, user_info), + offsetof(struct hfsplus_cat_folder, info.user), folder_rec_len); memcpy(value, folder_finder_info, folder_rec_len); res = folder_rec_len; } else if (entry_type == HFSPLUS_FILE) { hfs_bnode_read(fd.bnode, file_finder_info, fd.entryoffset + - offsetof(struct hfsplus_cat_file, user_info), + offsetof(struct hfsplus_cat_file, info.user), file_rec_len); memcpy(value, file_finder_info, file_rec_len); res = file_rec_len; @@ -630,14 +628,14 @@ static ssize_t hfsplus_listxattr_finder_info(struct dentry *dentry, len = sizeof(struct DInfo) + sizeof(struct DXInfo); hfs_bnode_read(fd.bnode, folder_finder_info, fd.entryoffset + - offsetof(struct hfsplus_cat_folder, user_info), + offsetof(struct hfsplus_cat_folder, info.user), len); found_bit = find_first_bit((void *)folder_finder_info, len*8); } else if (entry_type == HFSPLUS_FILE) { len = sizeof(struct FInfo) + sizeof(struct FXInfo); hfs_bnode_read(fd.bnode, file_finder_info, fd.entryoffset + - offsetof(struct hfsplus_cat_file, user_info), + offsetof(struct hfsplus_cat_file, info.user), len); found_bit = find_first_bit((void *)file_finder_info, len*8); } else { From ef6b0168142e30d20f0e8d10072b28c2332eaacd Mon Sep 17 00:00:00 2001 From: David Oberhollenzer <david.oberhollenzer@sigma-star.at> Date: Tue, 24 Aug 2021 10:00:02 +1000 Subject: [PATCH 356/365] fs/coredump.c: log if a core dump is aborted due to changed file permissions For obvious security reasons, a core dump is aborted if the filesystem cannot preserve ownership or permissions of the dump file. This affects filesystems like e.g. vfat, but also something like a 9pfs share in a Qemu test setup, running as a regular user, depending on the security model used. In those cases, the result is an empty core file and a confused user. To hopefully safe other people a lot of time figuring out the cause, this patch adds a simple log message for those specific cases. Link: https://lkml.kernel.org/r/20210701233151.102720-1-david.oberhollenzer@sigma-star.at Signed-off-by: David Oberhollenzer <david.oberhollenzer@sigma-star.at> Cc: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au> --- fs/coredump.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index 07afb5ddb1c4e..681c90e6c7fda 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -782,10 +782,17 @@ void do_coredump(const kernel_siginfo_t *siginfo) * filesystem. */ mnt_userns = file_mnt_user_ns(cprm.file); - if (!uid_eq(i_uid_into_mnt(mnt_userns, inode), current_fsuid())) + if (!uid_eq(i_uid_into_mnt(mnt_userns, inode), + current_fsuid())) { + pr_info_ratelimited("Core dump to |%s aborted: cannot preserve file owner\n", + cn.corename); goto close_fail; - if ((inode->i_mode & 0677) != 0600) + } + if ((inode->i_mode & 0677) != 0600) { + pr_info_ratelimited("Core dump to |%s aborted: cannot preserve file permissions\n", + cn.corename); goto close_fail; + } if (!(cprm.file->f_mode & FMODE_CAN_WRITE)) goto close_fail; if (do_truncate(mnt_userns, cprm.file->f_path.dentry, From e7b81ca105f96a1bf1323cc2fbd42849c158a1aa Mon Sep 17 00:00:00 2001 From: Andrew Morton <akpm@linux-foundation.org> Date: Tue, 24 Aug 2021 10:00:02 +1000 Subject: [PATCH 357/365] log-if-a-core-dump-is-aborted-due-to-changed-file-permissions-fix s/|%s/%s/ in printk text Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: David Oberhollenzer <david.oberhollenzer@sigma-star.at> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au> --- fs/coredump.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index 681c90e6c7fda..4b3c75732c97d 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -784,12 +784,12 @@ void do_coredump(const kernel_siginfo_t *siginfo) mnt_userns = file_mnt_user_ns(cprm.file); if (!uid_eq(i_uid_into_mnt(mnt_userns, inode), current_fsuid())) { - pr_info_ratelimited("Core dump to |%s aborted: cannot preserve file owner\n", + pr_info_ratelimited("Core dump to %s aborted: cannot preserve file owner\n", cn.corename); goto close_fail; } if ((inode->i_mode & 0677) != 0600) { - pr_info_ratelimited("Core dump to |%s aborted: cannot preserve file permissions\n", + pr_info_ratelimited("Core dump to %s aborted: cannot preserve file permissions\n", cn.corename); goto close_fail; } From 6b33bde8d1f8079f6c5b429caa2f1cfc39764526 Mon Sep 17 00:00:00 2001 From: QiuXi <qiuxi1@huawei.com> Date: Tue, 24 Aug 2021 10:00:03 +1000 Subject: [PATCH 358/365] coredump: fix memleak in dump_vma_snapshot() dump_vma_snapshot() allocs memory for *vma_meta, when dump_vma_snapshot() returns -EFAULT, the memory will be leaked, so we free it correctly. Link: https://lkml.kernel.org/r/20210810020441.62806-1-qiuxi1@huawei.com Fixes: a07279c9a8cd7 ("binfmt_elf, binfmt_elf_fdpic: use a VMA list snapshot") Signed-off-by: QiuXi <qiuxi1@huawei.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Jann Horn <jannh@google.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au> --- fs/coredump.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/coredump.c b/fs/coredump.c index 4b3c75732c97d..3224dee44d30e 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -1134,8 +1134,10 @@ int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count, mmap_write_unlock(mm); - if (WARN_ON(i != *vma_count)) + if (WARN_ON(i != *vma_count)) { + kvfree(*vma_meta); return -EFAULT; + } *vma_data_size_ptr = vma_data_size; return 0; From 085601486dde8c2a66fcd82c06a73e429c32b058 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Tue, 24 Aug 2021 10:00:03 +1000 Subject: [PATCH 359/365] kernel/fork.c: unexport get_{mm,task}_exe_file Only used by core code and the tomoyo which can't be a module either. Link: https://lkml.kernel.org/r/20210820095430.445242-1-hch@lst.de Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au> --- kernel/fork.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index abed9591f82c0..25f4abd2377f4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1238,7 +1238,6 @@ struct file *get_mm_exe_file(struct mm_struct *mm) rcu_read_unlock(); return exe_file; } -EXPORT_SYMBOL(get_mm_exe_file); /** * get_task_exe_file - acquire a reference to the task's executable file @@ -1261,7 +1260,6 @@ struct file *get_task_exe_file(struct task_struct *task) task_unlock(task); return exe_file; } -EXPORT_SYMBOL(get_task_exe_file); /** * get_task_mm - acquire a reference to the task's mm From 53884bf416a698f63f30cc23a9f5afa64055f72a Mon Sep 17 00:00:00 2001 From: Takahiro Itazuri <itazur@amazon.com> Date: Tue, 24 Aug 2021 10:00:03 +1000 Subject: [PATCH 360/365] pid: cleanup the stale comment mentioning pidmap_init(). pidmap_init() has already been replaced with pid_idr_init() in the commit 95846ecf9dac ("pid: replace pid bitmap implementation with IDR API"). Cleanup the stale comment which still mentions it. Link: https://lkml.kernel.org/r/20210714120713.19825-1-itazur@amazon.com Signed-off-by: Takahiro Itazuri <itazur@amazon.com> Cc: Kuniyuki Iwashima <kuniyu@amazon.co.jp> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au> --- include/linux/threads.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/threads.h b/include/linux/threads.h index 18d5a74bcc3dd..c34173e6c5f18 100644 --- a/include/linux/threads.h +++ b/include/linux/threads.h @@ -38,7 +38,7 @@ * Define a minimum number of pids per cpu. Heuristically based * on original pid max of 32k for 32 cpus. Also, increase the * minimum settable value for pid_max on the running system based - * on similar defaults. See kernel/pid.c:pidmap_init() for details. + * on similar defaults. See kernel/pid.c:pid_idr_init() for details. */ #define PIDS_PER_CPU_DEFAULT 1024 #define PIDS_PER_CPU_MIN 8 From 847d3bd12b61af37e9e4200981b61e6128af4223 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov <gorcunov@gmail.com> Date: Tue, 24 Aug 2021 10:00:03 +1000 Subject: [PATCH 361/365] prctl: allow to setup brk for et_dyn executables Keno Fischer reported that when a binray loaded via ld-linux-x the prctl(PR_SET_MM_MAP) doesn't allow to setup brk value because it lays before mm:end_data. For example a test program shows | # ~/t | | start_code 401000 | end_code 401a15 | start_stack 7ffce4577dd0 | start_data 403e10 | end_data 40408c | start_brk b5b000 | sbrk(0) b5b000 and when executed via ld-linux | # /lib64/ld-linux-x86-64.so.2 ~/t | | start_code 7fc25b0a4000 | end_code 7fc25b0c4524 | start_stack 7fffcc6b2400 | start_data 7fc25b0ce4c0 | end_data 7fc25b0cff98 | start_brk 55555710c000 | sbrk(0) 55555710c000 This of course prevent criu from restoring such programs. Looking into how kernel operates with brk/start_brk inside brk() syscall I don't see any problem if we allow to setup brk/start_brk without checking for end_data. Even if someone pass some weird address here on a purpose then the worst possible result will be an unexpected unmapping of existing vma (own vma, since prctl works with the callers memory) but test for RLIMIT_DATA is still valid and a user won't be able to gain more memory in case of expanding VMAs via new values shipped with prctl call. Link: https://lkml.kernel.org/r/20210121221207.GB2174@grain Fixes: bbdc6076d2e5 ("binfmt_elf: move brk out of mmap when doing direct loader exec") Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com> Reported-by: Keno Fischer <keno@juliacomputing.com> Acked-by: Andrey Vagin <avagin@gmail.com> Tested-by: Andrey Vagin <avagin@gmail.com> Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Kirill Tkhai <ktkhai@virtuozzo.com> Cc: Eric W. Biederman <ebiederm@xmission.com> Cc: Pavel Tikhomirov <ptikhomirov@virtuozzo.com> Cc: Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au> --- kernel/sys.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/kernel/sys.c b/kernel/sys.c index ef1a78f5d71c7..6ec50924b5176 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1959,13 +1959,6 @@ static int validate_prctl_map_addr(struct prctl_mm_map *prctl_map) error = -EINVAL; - /* - * @brk should be after @end_data in traditional maps. - */ - if (prctl_map->start_brk <= prctl_map->end_data || - prctl_map->brk <= prctl_map->end_data) - goto out; - /* * Neither we should allow to override limits if they set. */ From 734024e09b8015fc040a59eea4dbd88f02f8563d Mon Sep 17 00:00:00 2001 From: Zenghui Yu <yuzenghui@huawei.com> Date: Tue, 24 Aug 2021 10:00:03 +1000 Subject: [PATCH 362/365] configs: remove the obsolete CONFIG_INPUT_POLLDEV This CONFIG option was removed in commit 278b13ce3a89 ("Input: remove input_polled_dev implementation") so there's no point to keep it in defconfigs any longer. Get rid of the leftover for all arches. Link: https://lkml.kernel.org/r/20210726074741.1062-1-yuzenghui@huawei.com Signed-off-by: Zenghui Yu <yuzenghui@huawei.com> Cc: Dmitry Torokhov <dmitry.torokhov@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au> --- arch/arm/configs/dove_defconfig | 1 - arch/arm/configs/pxa_defconfig | 1 - arch/mips/configs/lemote2f_defconfig | 1 - arch/mips/configs/pic32mzda_defconfig | 1 - arch/mips/configs/rt305x_defconfig | 1 - arch/mips/configs/xway_defconfig | 1 - arch/parisc/configs/generic-32bit_defconfig | 1 - arch/x86/configs/i386_defconfig | 1 - arch/x86/configs/x86_64_defconfig | 1 - 9 files changed, 9 deletions(-) diff --git a/arch/arm/configs/dove_defconfig b/arch/arm/configs/dove_defconfig index b935162a8bbaf..33074fdab2ea3 100644 --- a/arch/arm/configs/dove_defconfig +++ b/arch/arm/configs/dove_defconfig @@ -56,7 +56,6 @@ CONFIG_ATA=y CONFIG_SATA_MV=y CONFIG_NETDEVICES=y CONFIG_MV643XX_ETH=y -CONFIG_INPUT_POLLDEV=y # CONFIG_INPUT_MOUSEDEV is not set CONFIG_INPUT_EVDEV=y # CONFIG_KEYBOARD_ATKBD is not set diff --git a/arch/arm/configs/pxa_defconfig b/arch/arm/configs/pxa_defconfig index 363f1b1b08e38..58f4834289e63 100644 --- a/arch/arm/configs/pxa_defconfig +++ b/arch/arm/configs/pxa_defconfig @@ -284,7 +284,6 @@ CONFIG_RT2800USB=m CONFIG_MWIFIEX=m CONFIG_MWIFIEX_SDIO=m CONFIG_INPUT_FF_MEMLESS=m -CONFIG_INPUT_POLLDEV=y CONFIG_INPUT_MATRIXKMAP=y CONFIG_INPUT_MOUSEDEV=m CONFIG_INPUT_MOUSEDEV_SCREEN_X=640 diff --git a/arch/mips/configs/lemote2f_defconfig b/arch/mips/configs/lemote2f_defconfig index aaf9d5e0aa2ce..791894c4d8fbc 100644 --- a/arch/mips/configs/lemote2f_defconfig +++ b/arch/mips/configs/lemote2f_defconfig @@ -116,7 +116,6 @@ CONFIG_8139TOO=y CONFIG_R8169=y CONFIG_USB_USBNET=m CONFIG_USB_NET_CDC_EEM=m -CONFIG_INPUT_POLLDEV=m CONFIG_INPUT_EVDEV=y # CONFIG_MOUSE_PS2_ALPS is not set # CONFIG_MOUSE_PS2_LOGIPS2PP is not set diff --git a/arch/mips/configs/pic32mzda_defconfig b/arch/mips/configs/pic32mzda_defconfig index 63fe2da1b37fc..fd567247adc74 100644 --- a/arch/mips/configs/pic32mzda_defconfig +++ b/arch/mips/configs/pic32mzda_defconfig @@ -34,7 +34,6 @@ CONFIG_SCSI_CONSTANTS=y CONFIG_SCSI_SCAN_ASYNC=y # CONFIG_SCSI_LOWLEVEL is not set CONFIG_INPUT_LEDS=m -CONFIG_INPUT_POLLDEV=y CONFIG_INPUT_MOUSEDEV=m CONFIG_INPUT_EVDEV=y CONFIG_INPUT_EVBUG=m diff --git a/arch/mips/configs/rt305x_defconfig b/arch/mips/configs/rt305x_defconfig index fec5851c164be..eb359db15dba5 100644 --- a/arch/mips/configs/rt305x_defconfig +++ b/arch/mips/configs/rt305x_defconfig @@ -90,7 +90,6 @@ CONFIG_PPPOE=m CONFIG_PPP_ASYNC=m CONFIG_ISDN=y CONFIG_INPUT=m -CONFIG_INPUT_POLLDEV=m # CONFIG_KEYBOARD_ATKBD is not set # CONFIG_INPUT_MOUSE is not set CONFIG_INPUT_MISC=y diff --git a/arch/mips/configs/xway_defconfig b/arch/mips/configs/xway_defconfig index 9abbc0debc2ae..eeb689f715cb9 100644 --- a/arch/mips/configs/xway_defconfig +++ b/arch/mips/configs/xway_defconfig @@ -96,7 +96,6 @@ CONFIG_PPPOE=m CONFIG_PPP_ASYNC=m CONFIG_ISDN=y CONFIG_INPUT=m -CONFIG_INPUT_POLLDEV=m # CONFIG_KEYBOARD_ATKBD is not set # CONFIG_INPUT_MOUSE is not set CONFIG_INPUT_MISC=y diff --git a/arch/parisc/configs/generic-32bit_defconfig b/arch/parisc/configs/generic-32bit_defconfig index 7611d48c599e0..dd14e31313255 100644 --- a/arch/parisc/configs/generic-32bit_defconfig +++ b/arch/parisc/configs/generic-32bit_defconfig @@ -111,7 +111,6 @@ CONFIG_PPP_BSDCOMP=m CONFIG_PPP_DEFLATE=m CONFIG_PPPOE=m # CONFIG_WLAN is not set -CONFIG_INPUT_POLLDEV=y CONFIG_KEYBOARD_HIL_OLD=m CONFIG_KEYBOARD_HIL=m CONFIG_MOUSE_SERIAL=y diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 9c9c4a888b1db..e81885384f604 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -156,7 +156,6 @@ CONFIG_FORCEDETH=y CONFIG_8139TOO=y # CONFIG_8139TOO_PIO is not set CONFIG_R8169=y -CONFIG_INPUT_POLLDEV=y CONFIG_INPUT_EVDEV=y CONFIG_INPUT_JOYSTICK=y CONFIG_INPUT_TABLET=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index b60bd2d860349..e8a7a0af2bdaa 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -148,7 +148,6 @@ CONFIG_SKY2=y CONFIG_FORCEDETH=y CONFIG_8139TOO=y CONFIG_R8169=y -CONFIG_INPUT_POLLDEV=y CONFIG_INPUT_EVDEV=y CONFIG_INPUT_JOYSTICK=y CONFIG_INPUT_TABLET=y From 74a9015e74a0304f0df69d21c1fc34de8509f591 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn <lukas.bulwahn@gmail.com> Date: Tue, 24 Aug 2021 10:00:04 +1000 Subject: [PATCH 363/365] Kconfig.debug: drop selecting non-existing HARDLOCKUP_DETECTOR_ARCH Commit 05a4a9527931 ("kernel/watchdog: split up config options") adds a new config HARDLOCKUP_DETECTOR, which selects the non-existing config HARDLOCKUP_DETECTOR_ARCH. Hence, ./scripts/checkkconfigsymbols.py warns: HARDLOCKUP_DETECTOR_ARCH Referencing files: lib/Kconfig.debug Simply drop selecting the non-existing HARDLOCKUP_DETECTOR_ARCH. Link: https://lkml.kernel.org/r/20210806115618.22088-1-lukas.bulwahn@gmail.com Fixes: 05a4a9527931 ("kernel/watchdog: split up config options") Signed-off-by: Lukas Bulwahn <lukas.bulwahn@gmail.com> Cc: Nicholas Piggin <npiggin@gmail.com> Cc: Masahiro Yamada <masahiroy@kernel.org> Cc: Babu Moger <babu.moger@oracle.com> Cc: Don Zickus <dzickus@redhat.com> Cc: Randy Dunlap <rdunlap@infradead.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au> --- lib/Kconfig.debug | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index ea630a43d3de2..5b911dd2fd7c8 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1062,7 +1062,6 @@ config HARDLOCKUP_DETECTOR depends on HAVE_HARDLOCKUP_DETECTOR_PERF || HAVE_HARDLOCKUP_DETECTOR_ARCH select LOCKUP_DETECTOR select HARDLOCKUP_DETECTOR_PERF if HAVE_HARDLOCKUP_DETECTOR_PERF - select HARDLOCKUP_DETECTOR_ARCH if HAVE_HARDLOCKUP_DETECTOR_ARCH help Say Y here to enable the kernel to act as a watchdog to detect hard lockups. From eedfcedc7315a8b69769eda5e6ae61ff7963025f Mon Sep 17 00:00:00 2001 From: Greg Thelen <gthelen@google.com> Date: Tue, 24 Aug 2021 10:00:04 +1000 Subject: [PATCH 364/365] selftests/memfd: remove unused variable Commit 544029862cbb ("selftests/memfd: add tests for F_SEAL_FUTURE_WRITE seal") added an unused variable to mfd_assert_reopen_fd(). Delete the unused variable. Link: https://lkml.kernel.org/r/20210702045509.1517643-1-gthelen@google.com Fixes: 544029862cbb ("selftests/memfd: add tests for F_SEAL_FUTURE_WRITE seal") Signed-off-by: Greg Thelen <gthelen@google.com> Cc: Shuah Khan <shuah@kernel.org> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: "Joel Fernandes (Google)" <joel@joelfernandes.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au> --- tools/testing/selftests/memfd/memfd_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c index 74baab83fec35..192a2899bae8f 100644 --- a/tools/testing/selftests/memfd/memfd_test.c +++ b/tools/testing/selftests/memfd/memfd_test.c @@ -56,7 +56,7 @@ static int mfd_assert_new(const char *name, loff_t sz, unsigned int flags) static int mfd_assert_reopen_fd(int fd_in) { - int r, fd; + int fd; char path[100]; sprintf(path, "/proc/self/fd/%d", fd_in); From 6326eb4a3915757f57be6294ae327a8484a64a4e Mon Sep 17 00:00:00 2001 From: Rafael Aquini <aquini@redhat.com> Date: Tue, 24 Aug 2021 10:00:04 +1000 Subject: [PATCH 365/365] ipc: replace costly bailout check in sysvipc_find_ipc() sysvipc_find_ipc() was left with a costly way to check if the offset position fed to it is bigger than the total number of IPC IDs in use. So much so that the time it takes to iterate over /proc/sysvipc/* files grows exponentially for a custom benchmark that creates "N" SYSV shm segments and then times the read of /proc/sysvipc/shm (milliseconds): 12 msecs to read 1024 segs from /proc/sysvipc/shm 18 msecs to read 2048 segs from /proc/sysvipc/shm 65 msecs to read 4096 segs from /proc/sysvipc/shm 325 msecs to read 8192 segs from /proc/sysvipc/shm 1303 msecs to read 16384 segs from /proc/sysvipc/shm 5182 msecs to read 32768 segs from /proc/sysvipc/shm The root problem lies with the loop that computes the total amount of ids in use to check if the "pos" feeded to sysvipc_find_ipc() grew bigger than "ids->in_use". That is a quite inneficient way to get to the maximum index in the id lookup table, specially when that value is already provided by struct ipc_ids.max_idx. This patch follows up on the optimization introduced via commit 15df03c879836 ("sysvipc: make get_maxid O(1) again") and gets rid of the aforementioned costly loop replacing it by a simpler checkpoint based on ipc_get_maxidx() returned value, which allows for a smooth linear increase in time complexity for the same custom benchmark: 2 msecs to read 1024 segs from /proc/sysvipc/shm 2 msecs to read 2048 segs from /proc/sysvipc/shm 4 msecs to read 4096 segs from /proc/sysvipc/shm 9 msecs to read 8192 segs from /proc/sysvipc/shm 19 msecs to read 16384 segs from /proc/sysvipc/shm 39 msecs to read 32768 segs from /proc/sysvipc/shm Link: https://lkml.kernel.org/r/20210809203554.1562989-1-aquini@redhat.com Signed-off-by: Rafael Aquini <aquini@redhat.com> Acked-by: Davidlohr Bueso <dbueso@suse.de> Cc: Manfred Spraul <manfred@colorfullife.com> Cc: Waiman Long <llong@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au> --- ipc/util.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/ipc/util.c b/ipc/util.c index 0027e47626b7b..d48d8cfa1f3fa 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -788,21 +788,13 @@ struct pid_namespace *ipc_seq_pid_ns(struct seq_file *s) static struct kern_ipc_perm *sysvipc_find_ipc(struct ipc_ids *ids, loff_t pos, loff_t *new_pos) { - struct kern_ipc_perm *ipc; - int total, id; - - total = 0; - for (id = 0; id < pos && total < ids->in_use; id++) { - ipc = idr_find(&ids->ipcs_idr, id); - if (ipc != NULL) - total++; - } + struct kern_ipc_perm *ipc = NULL; + int max_idx = ipc_get_maxidx(ids); - ipc = NULL; - if (total >= ids->in_use) + if (max_idx == -1 || pos > max_idx) goto out; - for (; pos < ipc_mni; pos++) { + for (; pos <= max_idx; pos++) { ipc = idr_find(&ids->ipcs_idr, pos); if (ipc != NULL) { rcu_read_lock();