From a4cfc0942b2d79dee5da2caaa23839b2b09640be Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 27 Jan 2022 16:25:37 +1100 Subject: [PATCH 001/154] include/linux/sysctl.h: fix register_sysctl_mount_point() return type The CONFIG_SYSCTL=n stub returns the wrong type. Fixes: ee9efac48a082 ("sysctl: add helper to register a sysctl mount point") Reported-by: kernel test robot Cc: Luis Chamberlain Cc: Tong Zhang Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/sysctl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 180adf7da785c..6353d6db69b2e 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -265,7 +265,7 @@ static inline struct ctl_table_header *register_sysctl_table(struct ctl_table * return NULL; } -static inline struct sysctl_header *register_sysctl_mount_point(const char *path) +static inline struct ctl_table_header *register_sysctl_mount_point(const char *path) { return NULL; } From 36d89e59b024104d539a0d0f032e784c238e9ba1 Mon Sep 17 00:00:00 2001 From: Tong Zhang Date: Thu, 27 Jan 2022 16:25:38 +1100 Subject: [PATCH 002/154] binfmt_misc: fix crash when load/unload module We should unregister the table upon module unload otherwise something horrible will happen when we load binfmt_misc module again. Also note that we should keep value returned by register_sysctl_mount_point() and release it later, otherwise it will leak. Also, per Christian's comment, to fully restore the old behavior that won't break userspace the check(binfmt_misc_header) should be eliminated. reproduce: modprobe binfmt_misc modprobe -r binfmt_misc modprobe binfmt_misc modprobe -r binfmt_misc modprobe binfmt_misc [ 18.032038] Call Trace: [ 18.032108] [ 18.032169] dump_stack_lvl+0x34/0x44 [ 18.032273] __register_sysctl_table+0x6f4/0x720 [ 18.032397] ? preempt_count_sub+0xf/0xb0 [ 18.032508] ? 
0xffffffffc0040000 [ 18.032600] init_misc_binfmt+0x2d/0x1000 [binfmt_misc] [ 18.042520] binfmt_misc: Failed to create fs/binfmt_misc sysctl mount point modprobe: can't load module binfmt_misc (kernel/fs/binfmt_misc.ko): Cannot allocate memory [ 18.063549] binfmt_misc: Failed to create fs/binfmt_misc sysctl mount point [ 18.204779] BUG: unable to handle page fault for address: fffffbfff8004802 Link: https://lkml.kernel.org/r/20220124181812.1869535-2-ztong0001@gmail.com Fixes: 3ba442d5331f ("fs: move binfmt_misc sysctl to its own file") Signed-off-by: Tong Zhang Co-developed-by: Christian Brauner Cc: Eric Biederman Cc: Kees Cook Cc: Luis Chamberlain Cc: Iurii Zaikin Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/binfmt_misc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index ddea6acbddde5..c07f35719ee3f 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -817,20 +817,20 @@ static struct file_system_type bm_fs_type = { }; MODULE_ALIAS_FS("binfmt_misc"); +static struct ctl_table_header *binfmt_misc_header; + static int __init init_misc_binfmt(void) { int err = register_filesystem(&bm_fs_type); if (!err) insert_binfmt(&misc_format); - if (!register_sysctl_mount_point("fs/binfmt_misc")) { - pr_warn("Failed to create fs/binfmt_misc sysctl mount point"); - return -ENOMEM; - } + binfmt_misc_header = register_sysctl_mount_point("fs/binfmt_misc"); return 0; } static void __exit exit_misc_binfmt(void) { + unregister_sysctl_table(binfmt_misc_header); unregister_binfmt(&misc_format); unregister_filesystem(&bm_fs_type); } From 7215153677f9a2e8ad13d3ce6c30c98c4ea0a162 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 27 Jan 2022 16:25:38 +1100 Subject: [PATCH 003/154] ia64: make IA64_MCA_RECOVERY bool instead of tristate In linux-next, IA64_MCA_RECOVERY uses the (new) function make_task_dead(), which is not exported for use by modules. Instead of exporting it for one user, convert IA64_MCA_RECOVERY to be a bool Kconfig symbol. In a config file from "kernel test robot " for a different problem, this linker error was exposed when CONFIG_IA64_MCA_RECOVERY=m. Fixes this build error: ERROR: modpost: "make_task_dead" [arch/ia64/kernel/mca_recovery.ko] undefined! Link: https://lkml.kernel.org/r/20220124213129.29306-1-rdunlap@infradead.org Fixes: 0e25498f8cd4 ("exit: Add and use make_task_dead.") Signed-off-by: Randy Dunlap Suggested-by: Christoph Hellwig Cc: "Eric W. Biederman" Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/ia64/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 703952819e10e..a7e01573abd83 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -318,7 +318,7 @@ config ARCH_PROC_KCORE_TEXT depends on PROC_KCORE config IA64_MCA_RECOVERY - tristate "MCA recovery from errors other than TLB." + bool "MCA recovery from errors other than TLB." config IA64_PALINFO tristate "/proc/pal support" From 45c147b8117673edf02cf91f67f21110d86aa5fc Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Thu, 27 Jan 2022 16:25:38 +1100 Subject: [PATCH 004/154] memory-failure: fetch compound_head after pgmap_pfn_valid() memory_failure_dev_pagemap() at the moment assumes base pages (e.g. dax_lock_page()). For devmap with compound pages fetch the compound_head in case a tail page memory failure is being handled. 
Currently this is a nop, but in the advent of compound pages in dev_pagemap it allows memory_failure_dev_pagemap() to keep working. Link: https://lkml.kernel.org/r/20211202204422.26777-2-joao.m.martins@oracle.com Reported-by: Jane Chu Signed-off-by: Joao Martins Reviewed-by: Naoya Horiguchi Reviewed-by: Dan Williams Reviewed-by: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memory-failure.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 14ae5c18e7766..97a9ed8f87a96 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1595,6 +1595,12 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, goto out; } + /* + * Pages instantiated by device-dax (not filesystem-dax) + * may be compound pages. + */ + page = compound_head(page); + /* * Prevent the inode from being freed while we are interrogating * the address_space, typically this would be handled by From 08268ce6965631213fd9087f67d4bf1c334f6867 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 27 Jan 2022 16:25:39 +1100 Subject: [PATCH 005/154] mm: page->mapping folio->mapping should have the same offset As with the other members of folio, the offset of page->mapping and folio->mapping must be the same. The compile-time check was inadvertently removed during development. Add it back. [willy@infradead.org: changelog redo] Link: https://lkml.kernel.org/r/20220104011734.21714-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/mm_types.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 9db36dc5d4cf8..5140e5feb4866 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -261,6 +261,7 @@ static_assert(sizeof(struct page) == sizeof(struct folio)); static_assert(offsetof(struct page, pg) == offsetof(struct folio, fl)) FOLIO_MATCH(flags, flags); FOLIO_MATCH(lru, lru); +FOLIO_MATCH(mapping, mapping); FOLIO_MATCH(compound_head, lru); FOLIO_MATCH(index, index); FOLIO_MATCH(private, private); From 6068aa71c80984a843871d2d435ebeefc24364db Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Thu, 27 Jan 2022 16:25:39 +1100 Subject: [PATCH 006/154] tools/testing/scatterlist: Add missing defines The cited commits replaced preemptible with pagefault_disabled and flush_kernel_dcache_page with flush_dcache_page respectively, hence need to update the corresponding defines in the test. 
Link: https://lkml.kernel.org/r/20220118082105.1737320-1-maorg@nvidia.com Fixes: 723aca208516 ("mm/scatterlist: replace the !preemptible warning in sg_miter_stop()") Fixes: 0e84f5dbf8d6 ("scatterlist: replace flush_kernel_dcache_page with flush_dcache_page") Signed-off-by: Maor Gottlieb Tested-by: Sebastian Andrzej Siewior Cc: Thomas Gleixner Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- tools/testing/scatterlist/linux/mm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/scatterlist/linux/mm.h b/tools/testing/scatterlist/linux/mm.h index 16ec895bbe5ff..5bd9e6e806254 100644 --- a/tools/testing/scatterlist/linux/mm.h +++ b/tools/testing/scatterlist/linux/mm.h @@ -74,7 +74,7 @@ static inline unsigned long page_to_phys(struct page *page) __UNIQUE_ID(min1_), __UNIQUE_ID(min2_), \ x, y) -#define preemptible() (1) +#define pagefault_disabled() (0) static inline void *kmap(struct page *page) { @@ -127,6 +127,7 @@ kmalloc_array(unsigned int n, unsigned int size, unsigned int flags) #define kmemleak_free(a) #define PageSlab(p) (0) +#define flush_dcache_page(p) #define MAX_ERRNO 4095 From aa75aef7d13b5357b0dc4e79c7f94d884efc3652 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 27 Jan 2022 16:25:39 +1100 Subject: [PATCH 007/154] kasan: test: fix compatibility with FORTIFY_SOURCE With CONFIG_FORTIFY_SOURCE enabled, string functions will also perform dynamic checks using __builtin_object_size(ptr), which when failed will panic the kernel. Because the KASAN test deliberately performs out-of-bounds operations, the kernel panics with FORTIFY_SOURCE, for example: | kernel BUG at lib/string_helpers.c:910! | invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI | CPU: 1 PID: 137 Comm: kunit_try_catch Tainted: G B 5.16.0-rc3+ #3 | Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014 | RIP: 0010:fortify_panic+0x19/0x1b | ... | Call Trace: | | kmalloc_oob_in_memset.cold+0x16/0x16 | ... Fix it by also hiding `ptr` from the optimizer, which will ensure that __builtin_object_size() does not return a valid size, preventing fortified string functions from panicking. 
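The mechanism in miniature, as a userspace sketch (illustrative only, not the kernel code; the macro below models the kernel's OPTIMIZER_HIDE_VAR with an equivalent GCC asm barrier, the buffer is made up, and it needs optimization, e.g. -O2, for the builtin to see through the pointer):

#include <stdio.h>

/* an empty asm that "rewrites" the variable, so the compiler loses
 * track of which object it points to */
#define OPTIMIZER_HIDE_VAR(var) __asm__ volatile("" : "+r" (var))

int main(void)
{
	char buf[16];
	char *p = buf;

	/* known object: reports 16, so a fortified memset(p, 0, 32)
	 * would be rejected at this point */
	printf("%zu\n", __builtin_object_size(p, 0));

	OPTIMIZER_HIDE_VAR(p);

	/* unknown object: reports (size_t)-1, so fortify checks are
	 * skipped and only KASAN observes the out-of-bounds access */
	printf("%zu\n", __builtin_object_size(p, 0));
	return 0;
}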
Link: https://lkml.kernel.org/r/20220124160744.1244685-1-elver@google.com Signed-off-by: Marco Elver Reported-by: Nico Pache Reviewed-by: Nico Pache Reviewed-by: Andrey Konovalov Reviewed-by: Kees Cook Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Brendan Higgins Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- lib/test_kasan.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 847cdbefab461..26a5c9007653a 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -492,6 +492,7 @@ static void kmalloc_oob_in_memset(struct kunit *test) ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + OPTIMIZER_HIDE_VAR(ptr); OPTIMIZER_HIDE_VAR(size); KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr, 0, size + KASAN_GRANULE_SIZE)); @@ -515,6 +516,7 @@ static void kmalloc_memmove_negative_size(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); memset((char *)ptr, 0, 64); + OPTIMIZER_HIDE_VAR(ptr); OPTIMIZER_HIDE_VAR(invalid_size); KUNIT_EXPECT_KASAN_FAIL(test, memmove((char *)ptr, (char *)ptr + 4, invalid_size)); @@ -531,6 +533,7 @@ static void kmalloc_memmove_invalid_size(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); memset((char *)ptr, 0, 64); + OPTIMIZER_HIDE_VAR(ptr); KUNIT_EXPECT_KASAN_FAIL(test, memmove((char *)ptr, (char *)ptr + 4, invalid_size)); kfree(ptr); @@ -893,6 +896,7 @@ static void kasan_memchr(struct kunit *test) ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + OPTIMIZER_HIDE_VAR(ptr); OPTIMIZER_HIDE_VAR(size); KUNIT_EXPECT_KASAN_FAIL(test, kasan_ptr_result = memchr(ptr, '1', size + 1)); @@ -919,6 +923,7 @@ static void kasan_memcmp(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); memset(arr, 0, sizeof(arr)); + OPTIMIZER_HIDE_VAR(ptr); OPTIMIZER_HIDE_VAR(size); KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = memcmp(ptr, arr, size+1)); From e75b66508e336390291ca8160ce4ec195c6955f5 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Thu, 27 Jan 2022 16:25:39 +1100 Subject: [PATCH 008/154] mm, kasan: use compare-exchange operation to set KASAN page tag It has been reported that the tag setting operation on newly-allocated pages can cause the page flags to be corrupted when performed concurrently with other flag updates as a result of the use of non-atomic operations. Fix the problem by using a compare-exchange loop to update the tag. 
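The update pattern, as a self-contained C11 sketch (an assumed illustration: the tag shift and mask values are invented, but the try_cmpxchg loop in the page_kasan_tag_set() hunk below has the same shape):

#include <stdatomic.h>
#include <stdint.h>

#define TAG_MASK	0xffUL
#define TAG_SHIFT	48	/* illustrative position of the tag bits */

/* lock-free read-modify-write: only the tag bits change, and a
 * concurrent update to other flag bits simply makes the CAS retry
 * with the freshly observed value */
static void set_tag(_Atomic unsigned long *flags, uint8_t tag)
{
	unsigned long old = atomic_load(flags);
	unsigned long new;

	do {
		new = old & ~(TAG_MASK << TAG_SHIFT);
		new |= (unsigned long)tag << TAG_SHIFT;
	} while (!atomic_compare_exchange_weak(flags, &old, new));
}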
Link: https://lkml.kernel.org/r/20220120020148.1632253-1-pcc@google.com Link: https://linux-review.googlesource.com/id/I456b24a2b9067d93968d43b4bb3351c0cec63101 Fixes: 2813b9c02962 ("kasan, mm, arm64: tag non slab memory allocated via pagealloc") Signed-off-by: Peter Collingbourne Cc: Andrey Konovalov Cc: Peter Zijlstra Cc: Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/mm.h | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index e1a84b1e6787e..213cc569b1922 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1506,11 +1506,18 @@ static inline u8 page_kasan_tag(const struct page *page) static inline void page_kasan_tag_set(struct page *page, u8 tag) { - if (kasan_enabled()) { - tag ^= 0xff; - page->flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT); - page->flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; - } + unsigned long old_flags, flags; + + if (!kasan_enabled()) + return; + + tag ^= 0xff; + old_flags = READ_ONCE(page->flags); + do { + flags = old_flags; + flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT); + flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; + } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags))); } static inline void page_kasan_tag_reset(struct page *page) From 167dea2c7e0aab2ad588a2a1fb6476ca92b948c8 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 27 Jan 2022 16:25:40 +1100 Subject: [PATCH 009/154] psi: fix "no previous prototype" warnings when CONFIG_CGROUPS=n When CONFIG_CGROUPS is disabled psi code generates the following warnings: kernel/sched/psi.c:1112:21: warning: no previous prototype for 'psi_trigger_create' [-Wmissing-prototypes] 1112 | struct psi_trigger *psi_trigger_create(struct psi_group *group, | ^~~~~~~~~~~~~~~~~~ kernel/sched/psi.c:1182:6: warning: no previous prototype for 'psi_trigger_destroy' [-Wmissing-prototypes] 1182 | void psi_trigger_destroy(struct psi_trigger *t) | ^~~~~~~~~~~~~~~~~~~ kernel/sched/psi.c:1249:10: warning: no previous prototype for 'psi_trigger_poll' [-Wmissing-prototypes] 1249 | __poll_t psi_trigger_poll(void **trigger_ptr, | ^~~~~~~~~~~~~~~~ Change declarations of these functions in the header to provide the prototypes even when they are unused. 
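The general header pattern, as a tiny sketch with invented names (the psi.h hunk that follows applies the same rule):

/* foo.h: declare unconditionally, so every definition in foo.c has a
 * visible prototype and -Wmissing-prototypes stays quiet */
int foo_op(int arg);

#ifdef CONFIG_FOO_EXTRA
/* declarations whose callers are config-specific can stay guarded */
int foo_extra(int arg);
#endif

Only the declarations need to be unconditional; the definitions themselves can still be compiled out.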
Link: https://lkml.kernel.org/r/20220119223940.787748-2-surenb@google.com Fixes: 0e94682b73bf ("psi: introduce psi monitor") Signed-off-by: Suren Baghdasaryan Reported-by: kernel test robot Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/psi.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/include/linux/psi.h b/include/linux/psi.h index f8ce53bfdb2a2..7f7d1d88c3bbd 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -25,18 +25,17 @@ void psi_memstall_enter(unsigned long *flags); void psi_memstall_leave(unsigned long *flags); int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res); - -#ifdef CONFIG_CGROUPS -int psi_cgroup_alloc(struct cgroup *cgrp); -void psi_cgroup_free(struct cgroup *cgrp); -void cgroup_move_task(struct task_struct *p, struct css_set *to); - struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf, size_t nbytes, enum psi_res res); void psi_trigger_destroy(struct psi_trigger *t); __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file, poll_table *wait); + +#ifdef CONFIG_CGROUPS +int psi_cgroup_alloc(struct cgroup *cgrp); +void psi_cgroup_free(struct cgroup *cgrp); +void cgroup_move_task(struct task_struct *p, struct css_set *to); #endif #else /* CONFIG_PSI */ From 432237c8f1d18dbcf95a659b202d0a7c96f0a8d5 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 27 Jan 2022 16:25:40 +1100 Subject: [PATCH 010/154] psi: fix "defined but not used" warnings when CONFIG_PROC_FS=n When CONFIG_PROC_FS is disabled psi code generates the following warnings: kernel/sched/psi.c:1364:30: warning: 'psi_cpu_proc_ops' defined but not used [-Wunused-const-variable=] 1364 | static const struct proc_ops psi_cpu_proc_ops = { | ^~~~~~~~~~~~~~~~ kernel/sched/psi.c:1355:30: warning: 'psi_memory_proc_ops' defined but not used [-Wunused-const-variable=] 1355 | static const struct proc_ops psi_memory_proc_ops = { | ^~~~~~~~~~~~~~~~~~~ kernel/sched/psi.c:1346:30: warning: 'psi_io_proc_ops' defined but not used [-Wunused-const-variable=] 1346 | static const struct proc_ops psi_io_proc_ops = { | ^~~~~~~~~~~~~~~ Make definitions of these structures and related functions conditional on CONFIG_PROC_FS config. 
Link: https://lkml.kernel.org/r/20220119223940.787748-3-surenb@google.com Fixes: 0e94682b73bf ("psi: introduce psi monitor") Signed-off-by: Suren Baghdasaryan Reported-by: kernel test robot Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- kernel/sched/psi.c | 79 ++++++++++++++++++++++++---------------------- 1 file changed, 41 insertions(+), 38 deletions(-) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index c137c4d6983ec..e143581788497 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1082,44 +1082,6 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) return 0; } -static int psi_io_show(struct seq_file *m, void *v) -{ - return psi_show(m, &psi_system, PSI_IO); -} - -static int psi_memory_show(struct seq_file *m, void *v) -{ - return psi_show(m, &psi_system, PSI_MEM); -} - -static int psi_cpu_show(struct seq_file *m, void *v) -{ - return psi_show(m, &psi_system, PSI_CPU); -} - -static int psi_open(struct file *file, int (*psi_show)(struct seq_file *, void *)) -{ - if (file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE)) - return -EPERM; - - return single_open(file, psi_show, NULL); -} - -static int psi_io_open(struct inode *inode, struct file *file) -{ - return psi_open(file, psi_io_show); -} - -static int psi_memory_open(struct inode *inode, struct file *file) -{ - return psi_open(file, psi_memory_show); -} - -static int psi_cpu_open(struct inode *inode, struct file *file) -{ - return psi_open(file, psi_cpu_show); -} - struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf, size_t nbytes, enum psi_res res) { @@ -1278,6 +1240,45 @@ __poll_t psi_trigger_poll(void **trigger_ptr, return ret; } +#ifdef CONFIG_PROC_FS +static int psi_io_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_IO); +} + +static int psi_memory_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_MEM); +} + +static int psi_cpu_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_CPU); +} + +static int psi_open(struct file *file, int (*psi_show)(struct seq_file *, void *)) +{ + if (file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE)) + return -EPERM; + + return single_open(file, psi_show, NULL); +} + +static int psi_io_open(struct inode *inode, struct file *file) +{ + return psi_open(file, psi_io_show); +} + +static int psi_memory_open(struct inode *inode, struct file *file) +{ + return psi_open(file, psi_memory_show); +} + +static int psi_cpu_open(struct inode *inode, struct file *file) +{ + return psi_open(file, psi_cpu_show); +} + static ssize_t psi_write(struct file *file, const char __user *user_buf, size_t nbytes, enum psi_res res) { @@ -1392,3 +1393,5 @@ static int __init psi_proc_init(void) return 0; } module_init(psi_proc_init); + +#endif /* CONFIG_PROC_FS */ From 74df67e444ec01fc09eb4b6432dfa23499d2eb11 Mon Sep 17 00:00:00 2001 From: Alexey Makhalov Date: Thu, 27 Jan 2022 16:25:40 +1100 Subject: [PATCH 011/154] mm: fix panic in __alloc_pages There is a kernel panic caused by pcpu_alloc_pages() passing offlined and uninitialized node to alloc_pages_node() leading to panic by NULL dereferencing uninitialized NODE_DATA(nid). CPU2 has been hot-added BUG: unable to handle page fault for address: 0000000000001608 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] SMP PTI CPU: 0 PID: 1 Comm: systemd Tainted: G E 5.15.0-rc7+ #11 Hardware name: VMware, Inc. 
VMware7,1/440BX Desktop Reference Platform, BIOS VMW RIP: 0010:__alloc_pages+0x127/0x290 Code: 4c 89 f0 5b 41 5c 41 5d 41 5e 41 5f 5d c3 44 89 e0 48 8b 55 b8 c1 e8 0c 83 e0 01 88 45 d0 4c 89 c8 48 85 d2 0f 85 1a 01 00 00 <45> 3b 41 08 0f 82 10 01 00 00 48 89 45 c0 48 8b 00 44 89 e2 81 e2 RSP: 0018:ffffc900006f3bc8 EFLAGS: 00010246 RAX: 0000000000001600 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000cc2 RBP: ffffc900006f3c18 R08: 0000000000000001 R09: 0000000000001600 R10: ffffc900006f3a40 R11: ffff88813c9fffe8 R12: 0000000000000cc2 R13: 0000000000000000 R14: 0000000000000001 R15: 0000000000000cc2 FS: 00007f27ead70500(0000) GS:ffff88807ce00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000001608 CR3: 000000000582c003 CR4: 00000000001706b0 Call Trace: pcpu_alloc_pages.constprop.0+0xe4/0x1c0 pcpu_populate_chunk+0x33/0xb0 pcpu_alloc+0x4d3/0x6f0 __alloc_percpu_gfp+0xd/0x10 alloc_mem_cgroup_per_node_info+0x54/0xb0 mem_cgroup_alloc+0xed/0x2f0 mem_cgroup_css_alloc+0x33/0x2f0 css_create+0x3a/0x1f0 cgroup_apply_control_enable+0x12b/0x150 cgroup_mkdir+0xdd/0x110 kernfs_iop_mkdir+0x4f/0x80 vfs_mkdir+0x178/0x230 do_mkdirat+0xfd/0x120 __x64_sys_mkdir+0x47/0x70 ? syscall_exit_to_user_mode+0x21/0x50 do_syscall_64+0x43/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae The panic can easily be reproduced by disabling the udev rule that automatically onlines hot-added CPUs, and then hot-adding a CPU that belongs to a memoryless node (a NUMA node with CPUs only). Hot-adding a CPU and a memoryless node does not bring the node online; a memoryless node is onlined only when its CPU is onlined. A node can be in one of the following states: 1. not present (nid == NUMA_NO_NODE) 2. present, but offline (nid > NUMA_NO_NODE, node_online(nid) == 0, NODE_DATA(nid) == NULL) 3. present and online (nid > NUMA_NO_NODE, node_online(nid) > 0, NODE_DATA(nid) != NULL) The percpu code performs allocations for all possible CPUs. The issue happens when it serves a hot-added but not yet onlined CPU whose node is in the 2nd state. Such a node is not ready to use, so fall back to numa_mem_id(). Link: https://lkml.kernel.org/r/20211108202325.20304-1-amakhalov@vmware.com Signed-off-by: Alexey Makhalov Reviewed-by: David Hildenbrand Acked-by: Dennis Zhou Cc: David Hildenbrand Cc: Michal Hocko Cc: Oscar Salvador Cc: Tejun Heo Cc: Christoph Lameter Cc: Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/percpu-vm.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 2054c9213c433..f58d73c927892 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -84,15 +84,19 @@ static int pcpu_alloc_pages(struct pcpu_chunk *chunk, gfp_t gfp) { unsigned int cpu, tcpu; - int i; + int i, nid; gfp |= __GFP_HIGHMEM; for_each_possible_cpu(cpu) { + nid = cpu_to_node(cpu); + if (nid == NUMA_NO_NODE || !node_online(nid)) + nid = numa_mem_id(); + for (i = page_start; i < page_end; i++) { struct page **pagep = &pages[pcpu_page_idx(cpu, i)]; - *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0); + *pagep = alloc_pages_node(nid, gfp, 0); if (!*pagep) goto err; } From a9aefa14dd0bfff1be260369b2171b53872c72d2 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Thu, 27 Jan 2022 16:25:40 +1100 Subject: [PATCH 012/154] jbd2: export jbd2_journal_[grab|put]_journal_head Patch series "ocfs2: fix a deadlock case". This fixes a deadlock case in ocfs2.
We first export the jbd2 symbols jbd2_journal_[grab|put]_journal_head as preparation and later use them in ocfs2 instead of jbd_[lock|unlock]_bh_journal_head to fix the deadlock. This patch (of 2): This exports the symbols jbd2_journal_[grab|put]_journal_head, which will be used by other modules, e.g. ocfs2. Link: https://lkml.kernel.org/r/20220121071205.100648-2-joseph.qi@linux.alibaba.com Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Cc: Andreas Dilger Cc: Gautham Ananthakrishna Cc: Saeed Mirzamohammadi Cc: "Theodore Ts'o" Cc: Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/jbd2/journal.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index f13d548e4a7fd..bf108d4493d21 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -2972,6 +2972,7 @@ struct journal_head *jbd2_journal_grab_journal_head(struct buffer_head *bh) jbd_unlock_bh_journal_head(bh); return jh; } +EXPORT_SYMBOL(jbd2_journal_grab_journal_head); static void __journal_remove_journal_head(struct buffer_head *bh) { @@ -3024,6 +3025,7 @@ void jbd2_journal_put_journal_head(struct journal_head *jh) jbd_unlock_bh_journal_head(bh); } } +EXPORT_SYMBOL(jbd2_journal_put_journal_head); /* * Initialize jbd inode head From 6546667cb3d98cf4ec7ceaf19ceb650ca6cb22e6 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Thu, 27 Jan 2022 16:25:41 +1100 Subject: [PATCH 013/154] ocfs2: fix a deadlock when commit trans commit 6f1b228529ae introduces a regression which can deadlock as follows:

Task1:                               Task2:
jbd2_journal_commit_transaction      ocfs2_test_bg_bit_allocatable
  spin_lock(&jh->b_state_lock)         jbd_lock_bh_journal_head
  __jbd2_journal_remove_checkpoint     spin_lock(&jh->b_state_lock)
  jbd2_journal_put_journal_head
    jbd_lock_bh_journal_head

Task1 and Task2 lock bh->b_state and jh->b_state_lock in different orders, which finally results in a deadlock. So use jbd2_journal_[grab|put]_journal_head instead in ocfs2_test_bg_bit_allocatable() to fix it.
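The lock inversion in miniature, as a hypothetical userspace sketch (pthread mutexes standing in for the bh journal-head lock and jh->b_state_lock; not the kernel code):

#include <pthread.h>

static pthread_mutex_t bh_lock = PTHREAD_MUTEX_INITIALIZER; /* jbd_lock_bh_journal_head() */
static pthread_mutex_t jh_lock = PTHREAD_MUTEX_INITIALIZER; /* jh->b_state_lock */

static void *task1(void *arg)	/* jbd2 commit path: jh_lock, then bh_lock */
{
	pthread_mutex_lock(&jh_lock);
	pthread_mutex_lock(&bh_lock);	/* blocks if task2 already holds bh_lock */
	pthread_mutex_unlock(&bh_lock);
	pthread_mutex_unlock(&jh_lock);
	return NULL;
}

static void *task2(void *arg)	/* old ocfs2 path: bh_lock, then jh_lock */
{
	pthread_mutex_lock(&bh_lock);
	pthread_mutex_lock(&jh_lock);	/* blocks if task1 holds jh_lock: ABBA deadlock */
	pthread_mutex_unlock(&jh_lock);
	pthread_mutex_unlock(&bh_lock);
	return NULL;
}

The fix removes the inverted path: jbd2_journal_grab_journal_head() takes and releases the bh lock internally, so ocfs2 never holds it while acquiring jh->b_state_lock.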
Link: https://lkml.kernel.org/r/20220121071205.100648-3-joseph.qi@linux.alibaba.com Fixes: 6f1b228529ae ("ocfs2: fix race between searching chunks and release journal_head from buffer_head") Signed-off-by: Joseph Qi Reported-by: Gautham Ananthakrishna Reported-by: Saeed Mirzamohammadi Cc: "Theodore Ts'o" Cc: Andreas Dilger Cc: Cc: Changwei Ge Cc: Gang He Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Cc: Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/ocfs2/suballoc.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 481017e1dac5a..166c8918c825a 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1251,26 +1251,23 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, { struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; struct journal_head *jh; - int ret = 1; + int ret; if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap)) return 0; - if (!buffer_jbd(bg_bh)) + jh = jbd2_journal_grab_journal_head(bg_bh); + if (!jh) return 1; - jbd_lock_bh_journal_head(bg_bh); - if (buffer_jbd(bg_bh)) { - jh = bh2jh(bg_bh); - spin_lock(&jh->b_state_lock); - bg = (struct ocfs2_group_desc *) jh->b_committed_data; - if (bg) - ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap); - else - ret = 1; - spin_unlock(&jh->b_state_lock); - } - jbd_unlock_bh_journal_head(bg_bh); + spin_lock(&jh->b_state_lock); + bg = (struct ocfs2_group_desc *) jh->b_committed_data; + if (bg) + ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap); + else + ret = 1; + spin_unlock(&jh->b_state_lock); + jbd2_journal_put_journal_head(jh); return ret; } From 6dc6ead0442ec548315b387fe67d92ae856e3f2c Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 27 Jan 2022 16:25:41 +1100 Subject: [PATCH 014/154] mm/gup.c: fix invalid page pointer returned with FOLL_PIN gups MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Alex reported an invalid page pointer returned with pin_user_pages_remote() from vfio after upstream commit 4b6c33b32296 ("vfio/type1: Prepare for batched pinning with struct vfio_batch"). This problem breaks NVIDIA vfio mdev. It turns out that it's not the fault of the vfio commit; however, after vfio switched to a full page buffer to store the page pointers, it started to expose the problem more easily. The problem is that for VM_PFNMAP vmas we should normally fail with an -EFAULT, and then vfio will carry on to handle the MMIO regions. However, when the bug triggered, follow_page_mask() returned -EEXIST for such a page, which jumps over the current page, leaving that entry in **pages untouched. The caller is not aware of this, hence the caller will reference the page as usual even though the pointer data can be anything. We had that -EEXIST logic since commit 1027e4436b6a ("mm: make GUP handle pfn mapping unless FOLL_GET is requested") which seems very reasonable. It could be that when we reworked GUP with FOLL_PIN we overlooked that special path in commit 3faa52c03f44 ("mm/gup: track FOLL_PIN pages"), even if that commit rightfully touched up follow_devmap_pud() on checking FOLL_PIN when it needs to return an -EEXIST. While at it, add another WARN_ON_ONCE() at the -EEXIST handling to make sure we mustn't have **pages set when reaching there, because otherwise it means the caller will try to read garbage right after __get_user_pages() returns.
Attaching the Fixes to the FOLL_PIN rework commit, as it happened later than 1027e4436b6a. Link: https://lkml.kernel.org/r/20220125033700.69705-1-peterx@redhat.com Fixes: 3faa52c03f44 ("mm/gup: track FOLL_PIN pages") Signed-off-by: Peter Xu Reported-by: Alex Williamson Debugged-by: Alex Williamson Cc: Andrea Arcangeli Cc: Andrew Morton Cc: Jan Kara Cc: Jérôme Glisse Cc: John Hubbard Cc: Kirill A. Shutemov Cc: Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/gup.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/mm/gup.c b/mm/gup.c index f0af462ac1e2b..8ebc04058e972 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -440,7 +440,7 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, pte_t *pte, unsigned int flags) { /* No page to get reference */ - if (flags & FOLL_GET) + if (flags & (FOLL_GET | FOLL_PIN)) return -EFAULT; if (flags & FOLL_TOUCH) { @@ -1181,7 +1181,13 @@ static long __get_user_pages(struct mm_struct *mm, /* * Proper page table entry exists, but no corresponding * struct page. + * + * Warn if we jumped over even with a valid **pages. + * It shouldn't trigger in practise, but when there's + * buggy returns on -EEXIST we'll warn before returning + * an invalid page pointer in the array. */ + WARN_ON_ONCE(pages); goto next_page; } else if (IS_ERR(page)) { ret = PTR_ERR(page); From 3fc9f72fc9d69cebae9fd0de68137492728c8272 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 27 Jan 2022 16:25:41 +1100 Subject: [PATCH 015/154] fs/proc: task_mmu.c: don't read mapcount for migration entry syzbot reported the below BUG: kernel BUG at include/linux/page-flags.h:785! invalid opcode: 0000 [#1] PREEMPT SMP KASAN CPU: 1 PID: 4392 Comm: syz-executor560 Not tainted 5.16.0-rc6-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:PageDoubleMap include/linux/page-flags.h:785 [inline] RIP: 0010:__page_mapcount+0x2d2/0x350 mm/util.c:744 Code: e8 d3 16 d1 ff 48 c7 c6 c0 00 b6 89 48 89 ef e8 94 4e 04 00 0f 0b e8 bd 16 d1 ff 48 c7 c6 60 01 b6 89 48 89 ef e8 7e 4e 04 00 <0f> 0b e8 a7 16 d1 ff 48 c7 c6 a0 01 b6 89 4c 89 f7 e8 68 4e 04 00 RSP: 0018:ffffc90002b6f7b8 EFLAGS: 00010293 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: ffff888019619d00 RSI: ffffffff81a68c12 RDI: 0000000000000003 RBP: ffffea0001bdc2c0 R08: 0000000000000029 R09: 00000000ffffffff R10: ffffffff8903e29f R11: 00000000ffffffff R12: 00000000ffffffff R13: 00000000ffffea00 R14: ffffc90002b6fb30 R15: ffffea0001bd8001 FS: 00007faa2aefd700(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fff7e663318 CR3: 0000000018c6e000 CR4: 00000000003506e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: page_mapcount include/linux/mm.h:837 [inline] smaps_account+0x470/0xb10 fs/proc/task_mmu.c:466 smaps_pte_entry fs/proc/task_mmu.c:538 [inline] smaps_pte_range+0x611/0x1250 fs/proc/task_mmu.c:601 walk_pmd_range mm/pagewalk.c:128 [inline] walk_pud_range mm/pagewalk.c:205 [inline] walk_p4d_range mm/pagewalk.c:240 [inline] walk_pgd_range mm/pagewalk.c:277 [inline] __walk_page_range+0xe23/0x1ea0 mm/pagewalk.c:379 walk_page_vma+0x277/0x350 mm/pagewalk.c:530 smap_gather_stats.part.0+0x148/0x260 fs/proc/task_mmu.c:768 smap_gather_stats fs/proc/task_mmu.c:741 [inline] show_smap+0xc6/0x440 fs/proc/task_mmu.c:822 seq_read_iter+0xbb0/0x1240 fs/seq_file.c:272 
seq_read+0x3e0/0x5b0 fs/seq_file.c:162 vfs_read+0x1b5/0x600 fs/read_write.c:479 ksys_read+0x12d/0x250 fs/read_write.c:619 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7faa2af6c969 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 11 15 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007faa2aefd288 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 00007faa2aff4418 RCX: 00007faa2af6c969 RDX: 0000000000002025 RSI: 0000000020000100 RDI: 0000000000000003 RBP: 00007faa2aff4410 R08: 00007faa2aefd700 R09: 0000000000000000 R10: 00007faa2aefd700 R11: 0000000000000246 R12: 00007faa2afc20ac R13: 00007fff7e6632bf R14: 00007faa2aefd400 R15: 0000000000022000 Modules linked in: ---[ end trace 24ec93ff95e4ac3d ]--- RIP: 0010:PageDoubleMap include/linux/page-flags.h:785 [inline] RIP: 0010:__page_mapcount+0x2d2/0x350 mm/util.c:744 Code: e8 d3 16 d1 ff 48 c7 c6 c0 00 b6 89 48 89 ef e8 94 4e 04 00 0f 0b e8 bd 16 d1 ff 48 c7 c6 60 01 b6 89 48 89 ef e8 7e 4e 04 00 <0f> 0b e8 a7 16 d1 ff 48 c7 c6 a0 01 b6 89 4c 89 f7 e8 68 4e 04 00 RSP: 0018:ffffc90002b6f7b8 EFLAGS: 00010293 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: ffff888019619d00 RSI: ffffffff81a68c12 RDI: 0000000000000003 RBP: ffffea0001bdc2c0 R08: 0000000000000029 R09: 00000000ffffffff R10: ffffffff8903e29f R11: 00000000ffffffff R12: 00000000ffffffff R13: 00000000ffffea00 R14: ffffc90002b6fb30 R15: ffffea0001bd8001 FS: 00007faa2aefd700(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fff7e663318 CR3: 0000000018c6e000 CR4: 00000000003506e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 The reproducer was trying to read /proc/$PID/smaps while calling MADV_FREE at the same time. MADV_FREE may split THPs if it is called on a partial THP. That may trigger the below race:

CPU A                           CPU B
-----                           -----
smaps walk:                     MADV_FREE:
page_mapcount()
  PageCompound()
                                split_huge_page()
  page = compound_head(page)
  PageDoubleMap(page)

When PageDoubleMap() is called, this page is no longer a tail page of a THP, so the BUG is triggered. This could be fixed by elevating the refcount of the page before calling page_mapcount(), but that would prevent counting migration entries, and it seems like overkill because the race can only happen when the PMD is split, at which point all PTE entries of tail pages are actually migration entries, and smaps_account() does treat migration entries as mapcount == 1, as Kirill pointed out. Add a new parameter to smaps_account() to indicate that the entry is a migration entry, and then skip calling page_mapcount(). Don't skip getting the mapcount for device private entries, since they do track references with mapcount. Link: https://lkml.kernel.org/r/20220120202805.3369-1-shy828301@gmail.com Fixes: b1d4d9e0cbd0 ("proc/smaps: carefully handle migration entries") Signed-off-by: Yang Shi Reported-by: syzbot+1f52b3a18d5633fa7f82@syzkaller.appspotmail.com Cc: "Kirill A.
Shutemov" Cc: Jann Horn Cc: Matthew Wilcox Cc: Alexey Dobriyan Cc: Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/proc/task_mmu.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 18f8c3acbb85e..2bb567014d77b 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -440,7 +440,8 @@ static void smaps_page_accumulate(struct mem_size_stats *mss, } static void smaps_account(struct mem_size_stats *mss, struct page *page, - bool compound, bool young, bool dirty, bool locked) + bool compound, bool young, bool dirty, bool locked, + bool migration) { int i, nr = compound ? compound_nr(page) : 1; unsigned long size = nr * PAGE_SIZE; @@ -467,8 +468,12 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, * page_count(page) == 1 guarantees the page is mapped exactly once. * If any subpage of the compound page mapped with PTE it would elevate * page_count(). + * + * Treated regular migration entries as mapcount == 1 without reading + * mapcount since calling page_mapcount() for migration entries is + * racy against THP splitting. */ - if (page_count(page) == 1) { + if ((page_count(page) == 1) || migration) { smaps_page_accumulate(mss, page, size, size << PSS_SHIFT, dirty, locked, true); return; @@ -517,6 +522,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, struct vm_area_struct *vma = walk->vma; bool locked = !!(vma->vm_flags & VM_LOCKED); struct page *page = NULL; + bool migration = false; if (pte_present(*pte)) { page = vm_normal_page(vma, addr, *pte); @@ -536,8 +542,11 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, } else { mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT; } - } else if (is_pfn_swap_entry(swpent)) + } else if (is_pfn_swap_entry(swpent)) { + if (is_migration_entry(swpent)) + migration = true; page = pfn_swap_entry_to_page(swpent); + } } else { smaps_pte_hole_lookup(addr, walk); return; @@ -546,7 +555,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, if (!page) return; - smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte), locked); + smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte), + locked, migration); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -557,6 +567,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, struct vm_area_struct *vma = walk->vma; bool locked = !!(vma->vm_flags & VM_LOCKED); struct page *page = NULL; + bool migration = false; if (pmd_present(*pmd)) { /* FOLL_DUMP will return -EFAULT on huge zero page */ @@ -564,8 +575,10 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, } else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) { swp_entry_t entry = pmd_to_swp_entry(*pmd); - if (is_migration_entry(entry)) + if (is_migration_entry(entry)) { + migration = true; page = pfn_swap_entry_to_page(entry); + } } if (IS_ERR_OR_NULL(page)) return; @@ -577,7 +590,9 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, /* pass */; else mss->file_thp += HPAGE_PMD_SIZE; - smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), locked); + + smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), + locked, migration); } #else static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, From 95b50010de45c154da39429ab34bd0873d3e79c7 Mon Sep 17 00:00:00 2001 From: Chen Wandun Date: Thu, 27 Jan 2022 16:25:41 +1100 Subject: [PATCH 016/154] Revert "mm/page_isolation: unset migratetype directly for non Buddy page" This reverts 
commit 721fb891ad0b3956d5c168b2931e3e5e4fb7ca40. commit 721fb891ad0b ("mm/page_isolation: unset migratetype directly for non Buddy page") can make memory that should be in the buddy allocator disappear by mistake. move_freepages_block() moves all pages in the pageblock, not just the pages indicated by the input parameter, so if the input page is not in the buddy allocator but other pages in the pageblock are, those pages fall out of control. Link: https://lkml.kernel.org/r/20220126024436.13921-1-chenwandun@huawei.com Fixes: 721fb891ad0b ("mm/page_isolation: unset migratetype directly for non Buddy page") Signed-off-by: Chen Wandun Reported-by: "kernelci.org bot" Acked-by: David Hildenbrand Tested-by: Dong Aisheng Tested-by: Francesco Dolcini Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_isolation.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 6a0ddda6b3c53..f67c4c70f17f6 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -115,7 +115,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) * onlining - just onlined memory won't immediately be considered for * allocation. */ - if (!isolated_page && PageBuddy(page)) { + if (!isolated_page) { nr_pages = move_freepages_block(zone, page, migratetype, NULL); __mod_zone_freepage_state(zone, nr_pages, migratetype); } From fc52d73958038e483b8058fc69cfbea241f55db2 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Thu, 27 Jan 2022 16:25:42 +1100 Subject: [PATCH 017/154] mm/debug_vm_pgtable: remove pte entry from the page table Patch series "page table check fixes and cleanups", v4. Two fixes: mm/debug_vm_pgtable: remove pte entry from the page table - remove a pte entry from the page table at the end of debug_vm_pgtable pte test mm/khugepaged: unify collapse pmd clear, flush and free mm/page_table_check: check entries at pmd levels - check pmd level in page_table_check for PTE regular entries prior to freeing. repro.c: https://gist.github.com/soleen/fdcd501d5df103976245fe84e9535087 config: https://gist.github.com/soleen/8a56f923c2fea9ce9c75b4e2517d4162 qemu_script: https://gist.github.com/soleen/f4be4795826b7ab1a51ae659582e179c base image: https://storage.googleapis.com/syzkaller/wheezy.img https://storage.googleapis.com/syzkaller/wheezy.img.key Small cleanup: mm/page_table_check: use unsigned long for page counters and cleanup This patch (of 4): The pte entry that is used in pte_advanced_tests() is never removed from the page table at the end of the test. The issue is detected by page_table_check; to reproduce it, compile the kernel with the following configs: CONFIG_DEBUG_VM_PGTABLE=y CONFIG_PAGE_TABLE_CHECK=y CONFIG_PAGE_TABLE_CHECK_ENFORCED=y During boot the following BUG is printed: [ 2.262821] debug_vm_pgtable: [debug_vm_pgtable ]: Validating architecture page table helpers [ 2.276826] ------------[ cut here ]------------ [ 2.280426] kernel BUG at mm/page_table_check.c:162! [ 2.284118] invalid opcode: 0000 [#1] PREEMPT SMP PTI [ 2.287787] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.16.0-11413-g2c271fe77d52 #3 [ 2.293226] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.15.0-0-g2dd4b9b3f840-prebuilt.qemu.org 04/01/2014 ... The entry should be properly removed from the page table before the page is released to the free list.
Link: https://lkml.kernel.org/r/20220126183637.1840960-2-pasha.tatashin@soleen.com Fixes: a5c3b9ffb0f4 ("mm/debug_vm_pgtable: add tests validating advanced arch page table helpers") Signed-off-by: Pasha Tatashin Reviewed-by: Zi Yan Tested-by: Zi Yan Acked-by: David Rientjes Cc: Paul Turner Cc: Wei Xu Cc: Greg Thelen Cc: Ingo Molnar Cc: Will Deacon Cc: Mike Rapoport Cc: Dave Hansen Cc: H. Peter Anvin Cc: Aneesh Kumar K.V Cc: Jiri Slaby Cc: Muchun Song Cc: Hugh Dickins Cc: Anshuman Khandual Cc: [5.9+] Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/debug_vm_pgtable.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index a7ac97c76762c..db2abd9e415b7 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -171,6 +171,8 @@ static void __init pte_advanced_tests(struct pgtable_debug_args *args) ptep_test_and_clear_young(args->vma, args->vaddr, args->ptep); pte = ptep_get(args->ptep); WARN_ON(pte_young(pte)); + + ptep_get_and_clear_full(args->mm, args->vaddr, args->ptep, 1); } static void __init pte_savedwrite_tests(struct pgtable_debug_args *args) From 98a350eaa350cf8ad7a3e9f71acfc6737eb6ef8e Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Thu, 27 Jan 2022 16:25:42 +1100 Subject: [PATCH 018/154] mm/page_table_check: use unsigned long for page counters and cleanup For consistency, use "unsigned long" for all page counters. Also, reduce code duplication by calling __page_table_check_*_clear() from __page_table_check_*_set() functions. Link: https://lkml.kernel.org/r/20220126183637.1840960-3-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Reviewed-by: Wei Xu Acked-by: David Rientjes Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Dave Hansen Cc: Greg Thelen Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jiri Slaby Cc: Mike Rapoport Cc: Muchun Song Cc: Paul Turner Cc: Will Deacon Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_table_check.c | 35 +++++++---------------------------- 1 file changed, 7 insertions(+), 28 deletions(-) diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 7504e7caa2a1a..c61d7ebe13b1e 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -86,8 +86,8 @@ static void page_table_check_clear(struct mm_struct *mm, unsigned long addr, { struct page_ext *page_ext; struct page *page; + unsigned long i; bool anon; - int i; if (!pfn_valid(pfn)) return; @@ -121,8 +121,8 @@ static void page_table_check_set(struct mm_struct *mm, unsigned long addr, { struct page_ext *page_ext; struct page *page; + unsigned long i; bool anon; - int i; if (!pfn_valid(pfn)) return; @@ -152,10 +152,10 @@ static void page_table_check_set(struct mm_struct *mm, unsigned long addr, void __page_table_check_zero(struct page *page, unsigned int order) { struct page_ext *page_ext = lookup_page_ext(page); - int i; + unsigned long i; BUG_ON(!page_ext); - for (i = 0; i < (1 << order); i++) { + for (i = 0; i < (1ul << order); i++) { struct page_table_check *ptc = get_page_table_check(page_ext); BUG_ON(atomic_read(&ptc->anon_map_count)); @@ -206,17 +206,10 @@ EXPORT_SYMBOL(__page_table_check_pud_clear); void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { - pte_t old_pte; - if (&init_mm == mm) return; - old_pte = *ptep; - if (pte_user_accessible_page(old_pte)) { - page_table_check_clear(mm, addr, pte_pfn(old_pte), - PAGE_SIZE >> PAGE_SHIFT); - } - + __page_table_check_pte_clear(mm, addr, *ptep); if 
(pte_user_accessible_page(pte)) { page_table_check_set(mm, addr, pte_pfn(pte), PAGE_SIZE >> PAGE_SHIFT, @@ -228,17 +221,10 @@ EXPORT_SYMBOL(__page_table_check_pte_set); void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { - pmd_t old_pmd; - if (&init_mm == mm) return; - old_pmd = *pmdp; - if (pmd_user_accessible_page(old_pmd)) { - page_table_check_clear(mm, addr, pmd_pfn(old_pmd), - PMD_PAGE_SIZE >> PAGE_SHIFT); - } - + __page_table_check_pmd_clear(mm, addr, *pmdp); if (pmd_user_accessible_page(pmd)) { page_table_check_set(mm, addr, pmd_pfn(pmd), PMD_PAGE_SIZE >> PAGE_SHIFT, @@ -250,17 +236,10 @@ EXPORT_SYMBOL(__page_table_check_pmd_set); void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud) { - pud_t old_pud; - if (&init_mm == mm) return; - old_pud = *pudp; - if (pud_user_accessible_page(old_pud)) { - page_table_check_clear(mm, addr, pud_pfn(old_pud), - PUD_PAGE_SIZE >> PAGE_SHIFT); - } - + __page_table_check_pud_clear(mm, addr, *pudp); if (pud_user_accessible_page(pud)) { page_table_check_set(mm, addr, pud_pfn(pud), PUD_PAGE_SIZE >> PAGE_SHIFT, From 701fedb49f6310b916b007fc29b4ed754bf5b05e Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Thu, 27 Jan 2022 16:25:42 +1100 Subject: [PATCH 019/154] mm/khugepaged: unify collapse pmd clear, flush and free Unify the code that flushes, clears pmd entry, and frees the PTE table level into a new function collapse_and_free_pmd(). This clean-up is useful as in the next patch we will add another call to this function to iterate through PTE prior to freeing the level for page table check. Link: https://lkml.kernel.org/r/20220126183637.1840960-4-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Acked-by: David Rientjes Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Dave Hansen Cc: Greg Thelen Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jiri Slaby Cc: Mike Rapoport Cc: Muchun Song Cc: Paul Turner Cc: Wei Xu Cc: Will Deacon Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/khugepaged.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 35f14d0a00a6c..30e59e4af2721 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1416,6 +1416,19 @@ static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm, return 0; } +static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ + spinlock_t *ptl; + pmd_t pmd; + + ptl = pmd_lock(vma->vm_mm, pmdp); + pmd = pmdp_collapse_flush(vma, addr, pmdp); + spin_unlock(ptl); + mm_dec_nr_ptes(mm); + pte_free(mm, pmd_pgtable(pmd)); +} + /** * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at * address haddr. 
@@ -1433,7 +1446,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) struct vm_area_struct *vma = find_vma(mm, haddr); struct page *hpage; pte_t *start_pte, *pte; - pmd_t *pmd, _pmd; + pmd_t *pmd; spinlock_t *ptl; int count = 0; int i; @@ -1509,12 +1522,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) } /* step 4: collapse pmd */ - ptl = pmd_lock(vma->vm_mm, pmd); - _pmd = pmdp_collapse_flush(vma, haddr, pmd); - spin_unlock(ptl); - mm_dec_nr_ptes(mm); - pte_free(mm, pmd_pgtable(_pmd)); - + collapse_and_free_pmd(mm, vma, haddr, pmd); drop_hpage: unlock_page(hpage); put_page(hpage); @@ -1552,7 +1560,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) struct vm_area_struct *vma; struct mm_struct *mm; unsigned long addr; - pmd_t *pmd, _pmd; + pmd_t *pmd; i_mmap_lock_write(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { @@ -1591,14 +1599,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * reverse order. Trylock is a way to avoid deadlock. */ if (mmap_write_trylock(mm)) { - if (!khugepaged_test_exit(mm)) { - spinlock_t *ptl = pmd_lock(mm, pmd); - /* assume page table is clear */ - _pmd = pmdp_collapse_flush(vma, addr, pmd); - spin_unlock(ptl); - mm_dec_nr_ptes(mm); - pte_free(mm, pmd_pgtable(_pmd)); - } + if (!khugepaged_test_exit(mm)) + collapse_and_free_pmd(mm, vma, addr, pmd); mmap_write_unlock(mm); } else { /* Try again later */ From b76b03526dd4ca3ded5d8f90d1610adb96bd2a6d Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Thu, 27 Jan 2022 16:25:42 +1100 Subject: [PATCH 020/154] mm/page_table_check: check entries at pmd levels syzbot detected a case where the page table counters were not properly updated. syzkaller login: ------------[ cut here ]------------ kernel BUG at mm/page_table_check.c:162! invalid opcode: 0000 [#1] PREEMPT SMP KASAN CPU: 0 PID: 3099 Comm: pasha Not tainted 5.16.0+ #48 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIO4 RIP: 0010:__page_table_check_zero+0x159/0x1a0 Code: 7d 3a b2 ff 45 39 f5 74 2a e8 43 38 b2 ff 4d 85 e4 01 RSP: 0018:ffff888010667418 EFLAGS: 00010293 RAX: 0000000000000000 RBX: 0000000000000001 RCX: 0000000000 RDX: ffff88800cea8680 RSI: ffffffff81becaf9 RDI: 0000000003 RBP: ffff888010667450 R08: 0000000000000001 R09: 0000000000 R10: ffffffff81becaab R11: 0000000000000001 R12: ffff888008 R13: 0000000000000001 R14: 0000000000000200 R15: dffffc0000 FS: 0000000000000000(0000) GS:ffff888035e00000(0000) knlG0 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007ffd875cad00 CR3: 00000000094ce000 CR4: 0000000000 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000 Call Trace: free_pcp_prepare+0x3be/0xaa0 free_unref_page+0x1c/0x650 ? trace_hardirqs_on+0x6a/0x1d0 free_compound_page+0xec/0x130 free_transhuge_page+0x1be/0x260 __put_compound_page+0x90/0xd0 release_pages+0x54c/0x1060 ? filemap_remove_folio+0x161/0x210 ? lock_downgrade+0x720/0x720 ? __put_page+0x150/0x150 ? filemap_free_folio+0x164/0x350 __pagevec_release+0x7c/0x110 shmem_undo_range+0x85e/0x1250 ... The repro involved having a huge page that is split due to uprobe event temporarily replacing one of the pages in the huge page. Later the huge page was combined again, but the counters were off, as the PTE level was not properly updated. Make sure that when PMD is cleared and prior to freeing the level the PTEs are updated. 
Link: https://lkml.kernel.org/r/20220126183637.1840960-5-pasha.tatashin@soleen.com Fixes: df4e817b7108 ("mm: page table check") Signed-off-by: Pasha Tatashin Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Dave Hansen Cc: David Rientjes Cc: Greg Thelen Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jiri Slaby Cc: Mike Rapoport Cc: Muchun Song Cc: Paul Turner Cc: Wei Xu Cc: Will Deacon Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/page_table_check.h | 18 ++++++++++++++++++ mm/khugepaged.c | 3 +++ mm/page_table_check.c | 21 +++++++++++++++++++++ 3 files changed, 42 insertions(+) diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index 38cace1da7b69..e88bbe37727b0 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -26,6 +26,8 @@ void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd); void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud); +void __page_table_check_pmd_clear_full(struct mm_struct *mm, unsigned long addr, + pmd_t pmd); static inline void page_table_check_alloc(struct page *page, unsigned int order) { @@ -100,6 +102,16 @@ static inline void page_table_check_pud_set(struct mm_struct *mm, __page_table_check_pud_set(mm, addr, pudp, pud); } +static inline void page_table_check_pmd_clear_full(struct mm_struct *mm, + unsigned long addr, + pmd_t pmd) +{ + if (static_branch_likely(&page_table_check_disabled)) + return; + + __page_table_check_pmd_clear_full(mm, addr, pmd); +} + #else static inline void page_table_check_alloc(struct page *page, unsigned int order) @@ -143,5 +155,11 @@ static inline void page_table_check_pud_set(struct mm_struct *mm, { } +static inline void page_table_check_pmd_clear_full(struct mm_struct *mm, + unsigned long addr, + pmd_t pmd) +{ +} + #endif /* CONFIG_PAGE_TABLE_CHECK */ #endif /* __LINUX_PAGE_TABLE_CHECK_H */ diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 30e59e4af2721..d84977c6dc0dd 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -1422,10 +1423,12 @@ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *v spinlock_t *ptl; pmd_t pmd; + mmap_assert_write_locked(mm); ptl = pmd_lock(vma->vm_mm, pmdp); pmd = pmdp_collapse_flush(vma, addr, pmdp); spin_unlock(ptl); mm_dec_nr_ptes(mm); + page_table_check_pmd_clear_full(mm, addr, pmd); pte_free(mm, pmd_pgtable(pmd)); } diff --git a/mm/page_table_check.c b/mm/page_table_check.c index c61d7ebe13b1e..251f95a808b4f 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -247,3 +247,24 @@ void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr, } } EXPORT_SYMBOL(__page_table_check_pud_set); + +void __page_table_check_pmd_clear_full(struct mm_struct *mm, unsigned long addr, + pmd_t pmd) +{ + if (&init_mm == mm) + return; + + if (!pmd_bad(pmd) && !pmd_leaf(pmd)) { + pte_t *ptep = pte_offset_map(&pmd, addr); + unsigned long i; + + pte_unmap(ptep); + for (i = 0; i < PTRS_PER_PTE; i++) { + __page_table_check_pte_clear(mm, addr, *ptep); + addr += PAGE_SIZE; + ptep++; + } + } else { + __page_table_check_pmd_clear(mm, addr, pmd); + } +} From d7d3b81d5f7d02fa66bfbf36df15d85cef261ab2 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 27 Jan 2022 16:25:43 +1100 Subject: [PATCH 021/154] coredump: also dump first pages of non-executable ELF libraries When I rewrote the VMA dumping logic for 
coredumps, I changed it to recognize ELF library mappings based on the file being executable instead of the mapping having an ELF header. But it turns out distros ship many ELF libraries as non-executable, so the heuristic goes wrong... Restore the old behavior where FILTER(ELF_HEADERS) dumps the first page of any offset-0 readable mapping that starts with the ELF magic. This fix is technically layer-breaking a bit, because it checks for something ELF-specific in fs/coredump.c; but since we probably want to share this between standard ELF and FDPIC ELF anyway, I guess it's fine? And this also keeps the change small for backporting. Link: https://lkml.kernel.org/r/20220126025739.2014888-1-jannh@google.com Fixes: 429a22e776a2 ("coredump: rework elf/elf_fdpic vma_dump_size() into common helper") Signed-off-by: Jann Horn Reported-by: Bill Messmer Cc: "Eric W. Biederman" Cc: Al Viro Cc: Randy Dunlap Cc: Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/coredump.c | 39 ++++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index 1c060c0a2d72f..b73817712dd25 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -980,6 +981,8 @@ static bool always_dump_vma(struct vm_area_struct *vma) return false; } +#define DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER 1 + /* * Decide how much of @vma's contents should be included in a core dump. */ @@ -1039,9 +1042,20 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, * dump the first page to aid in determining what was mapped here. */ if (FILTER(ELF_HEADERS) && - vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ) && - (READ_ONCE(file_inode(vma->vm_file)->i_mode) & 0111) != 0) - return PAGE_SIZE; + vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ)) { + if ((READ_ONCE(file_inode(vma->vm_file)->i_mode) & 0111) != 0) + return PAGE_SIZE; + + /* + * ELF libraries aren't always executable. + * We'll want to check whether the mapping starts with the ELF + * magic, but not now - we're holding the mmap lock, + * so copy_from_user() doesn't work here. + * Use a placeholder instead, and fix it up later in + * dump_vma_snapshot(). + */ + return DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER; + } #undef FILTER @@ -1116,8 +1130,6 @@ int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count, m->end = vma->vm_end; m->flags = vma->vm_flags; m->dump_size = vma_dump_size(vma, cprm->mm_flags); - - vma_data_size += m->dump_size; } mmap_write_unlock(mm); @@ -1127,6 +1139,23 @@ int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count, return -EFAULT; } + for (i = 0; i < *vma_count; i++) { + struct core_vma_metadata *m = (*vma_meta) + i; + + if (m->dump_size == DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER) { + char elfmag[SELFMAG]; + + if (copy_from_user(elfmag, (void __user *)m->start, SELFMAG) || + memcmp(elfmag, ELFMAG, SELFMAG) != 0) { + m->dump_size = 0; + } else { + m->dump_size = PAGE_SIZE; + } + } + + vma_data_size += m->dump_size; + } + *vma_data_size_ptr = vma_data_size; return 0; } From 79e1d419b000cd81b58b0eb61cc825a04b7b9f33 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 27 Jan 2022 16:25:43 +1100 Subject: [PATCH 022/154] mm/pgtable: define pte_index so that preprocessor could recognize it Since commit 974b9b2c68f3 ("mm: consolidate pte_index() and pte_offset_*() definitions") pte_index is a static inline and there is no define for it that can be recognized by the preprocessor.
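The detection idiom, as a minimal sketch (the #ifdef consumer below is illustrative of the kind of check mm/memory.c performs):

/* a static inline by itself is invisible to the preprocessor... */
static inline unsigned long pte_index(unsigned long address)
{
	return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
}
/* ...so advertise it with a self-referential define */
#define pte_index pte_index

#ifdef pte_index
/* batched fast path, compiled only when pte_index is known to exist */
#endif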
As a result, vm_insert_pages() uses a slower loop over vm_insert_page() instead of insert_pages(), which amortizes the cost of spinlock operations when inserting multiple pages. Link: https://lkml.kernel.org/r/20220111145457.20748-1-rppt@kernel.org Fixes: 974b9b2c68f3 ("mm: consolidate pte_index() and pte_offset_*() definitions") Signed-off-by: Mike Rapoport Reported-by: Christian Dietrich Reviewed-by: Khalid Aziz Cc: Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/pgtable.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index bc8713a76e034..f4f4077b97aab 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -62,6 +62,7 @@ static inline unsigned long pte_index(unsigned long address) { return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); } +#define pte_index pte_index #ifndef pmd_index static inline unsigned long pmd_index(unsigned long address) From 4b207470fe8e42f61c08b3b9aebfd7f1c58d74e8 Mon Sep 17 00:00:00 2001 From: Toshiki Fukasawa Date: Thu, 27 Jan 2022 16:25:43 +1100 Subject: [PATCH 023/154] /proc/kpageflags: prevent an integer overflow in stable_page_flags() stable_page_flags() returns kpageflags info in u64, but it uses "1 << KPF_*" internally, which is evaluated as an int. This type mismatch causes no visible problem now, but it will if you set bit 32 or more, as done in a subsequent patch. So use BIT_ULL in order to avoid future overflow issues. Link: http://lkml.kernel.org/r/20190725023100.31141-2-t-fukasawa@vx.jp.nec.com Signed-off-by: Toshiki Fukasawa Cc: Michal Hocko Cc: Dan Williams Cc: Alexey Dobriyan Cc: Christoph Hellwig Cc: Naoya Horiguchi Cc: Junichi Nomura Cc: Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/proc/page.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/fs/proc/page.c b/fs/proc/page.c index 9f1077d94cde1..265f4fca15e29 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -115,7 +115,7 @@ u64 stable_page_flags(struct page *page) * it differentiates a memory hole from a page with no flags */ if (!page) - return 1 << KPF_NOPAGE; + return BIT_ULL(KPF_NOPAGE); k = page->flags; u = 0; @@ -127,22 +127,22 @@ u64 stable_page_flags(struct page *page) * simple test in page_mapped() is not enough.
*/ if (!PageSlab(page) && page_mapped(page)) - u |= 1 << KPF_MMAP; + u |= BIT_ULL(KPF_MMAP); if (PageAnon(page)) - u |= 1 << KPF_ANON; + u |= BIT_ULL(KPF_ANON); if (PageKsm(page)) - u |= 1 << KPF_KSM; + u |= BIT_ULL(KPF_KSM); /* * compound pages: export both head/tail info * they together define a compound page's start/end pos and order */ if (PageHead(page)) - u |= 1 << KPF_COMPOUND_HEAD; + u |= BIT_ULL(KPF_COMPOUND_HEAD); if (PageTail(page)) - u |= 1 << KPF_COMPOUND_TAIL; + u |= BIT_ULL(KPF_COMPOUND_TAIL); if (PageHuge(page)) - u |= 1 << KPF_HUGE; + u |= BIT_ULL(KPF_HUGE); /* * PageTransCompound can be true for non-huge compound pages (slab * pages or pages allocated by drivers with __GFP_COMP) because it @@ -153,14 +153,13 @@ u64 stable_page_flags(struct page *page) struct page *head = compound_head(page); if (PageLRU(head) || PageAnon(head)) - u |= 1 << KPF_THP; + u |= BIT_ULL(KPF_THP); else if (is_huge_zero_page(head)) { - u |= 1 << KPF_ZERO_PAGE; - u |= 1 << KPF_THP; + u |= BIT_ULL(KPF_ZERO_PAGE); + u |= BIT_ULL(KPF_THP); } } else if (is_zero_pfn(page_to_pfn(page))) - u |= 1 << KPF_ZERO_PAGE; - + u |= BIT_ULL(KPF_ZERO_PAGE); /* * Caveats on high order pages: page->_refcount will only be set @@ -168,23 +167,23 @@ u64 stable_page_flags(struct page *page) * SLOB won't set PG_slab at all on compound pages. */ if (PageBuddy(page)) - u |= 1 << KPF_BUDDY; + u |= BIT_ULL(KPF_BUDDY); else if (page_count(page) == 0 && is_free_buddy_page(page)) - u |= 1 << KPF_BUDDY; + u |= BIT_ULL(KPF_BUDDY); if (PageOffline(page)) - u |= 1 << KPF_OFFLINE; + u |= BIT_ULL(KPF_OFFLINE); if (PageTable(page)) - u |= 1 << KPF_PGTABLE; + u |= BIT_ULL(KPF_PGTABLE); if (page_is_idle(page)) - u |= 1 << KPF_IDLE; + u |= BIT_ULL(KPF_IDLE); u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); if (PageTail(page) && PageSlab(compound_head(page))) - u |= 1 << KPF_SLAB; + u |= BIT_ULL(KPF_SLAB); u |= kpf_copy_bit(k, KPF_ERROR, PG_error); u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty); @@ -197,7 +196,7 @@ u64 stable_page_flags(struct page *page) u |= kpf_copy_bit(k, KPF_RECLAIM, PG_reclaim); if (PageSwapCache(page)) - u |= 1 << KPF_SWAPCACHE; + u |= BIT_ULL(KPF_SWAPCACHE); u |= kpf_copy_bit(k, KPF_SWAPBACKED, PG_swapbacked); u |= kpf_copy_bit(k, KPF_UNEVICTABLE, PG_unevictable); From 153695851e2a9e231ec2bd940bf7fa29e12a52e3 Mon Sep 17 00:00:00 2001 From: Toshiki Fukasawa Date: Thu, 27 Jan 2022 16:25:43 +1100 Subject: [PATCH 024/154] /proc/kpageflags: do not use uninitialized struct pages A kernel panic was observed during reading /proc/kpageflags for first few pfns allocated by pmem namespace: BUG: unable to handle page fault for address: fffffffffffffffe [ 114.495280] #PF: supervisor read access in kernel mode [ 114.495738] #PF: error_code(0x0000) - not-present page [ 114.496203] PGD 17120e067 P4D 17120e067 PUD 171210067 PMD 0 [ 114.496713] Oops: 0000 [#1] SMP PTI [ 114.497037] CPU: 9 PID: 1202 Comm: page-types Not tainted 5.3.0-rc1 #1 [ 114.497621] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.0-0-g63451fca13-prebuilt.qemu-project.org 04/01/2014 [ 114.498706] RIP: 0010:stable_page_flags+0x27/0x3f0 [ 114.499142] Code: 82 66 90 66 66 66 66 90 48 85 ff 0f 84 d1 03 00 00 41 54 55 48 89 fd 53 48 8b 57 08 48 8b 1f 48 8d 42 ff 83 e2 01 48 0f 44 c7 <48> 8b 00 f6 c4 02 0f 84 57 03 00 00 45 31 e4 48 8b 55 08 48 89 ef [ 114.500788] RSP: 0018:ffffa5e601a0fe60 EFLAGS: 00010202 [ 114.501373] RAX: fffffffffffffffe RBX: ffffffffffffffff RCX: 0000000000000000 [ 114.502009] RDX: 
0000000000000001 RSI: 00007ffca13a7310 RDI: ffffd07489000000 [ 114.502637] RBP: ffffd07489000000 R08: 0000000000000001 R09: 0000000000000000 [ 114.503270] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000240000 [ 114.503896] R13: 0000000000080000 R14: 00007ffca13a7310 R15: ffffa5e601a0ff08 [ 114.504530] FS: 00007f0266c7f540(0000) GS:ffff962dbbac0000(0000) knlGS:0000000000000000 [ 114.505245] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 114.505754] CR2: fffffffffffffffe CR3: 000000023a204000 CR4: 00000000000006e0 [ 114.506401] Call Trace: [ 114.506660] kpageflags_read+0xb1/0x130 [ 114.507051] proc_reg_read+0x39/0x60 [ 114.507387] vfs_read+0x8a/0x140 [ 114.507686] ksys_pread64+0x61/0xa0 [ 114.508021] do_syscall_64+0x5f/0x1a0 [ 114.508372] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 114.508844] RIP: 0033:0x7f0266ba426b The reason for the panic is that stable_page_flags() which parses the page flags uses uninitialized struct pages reserved by the ZONE_DEVICE driver. Earlier approach to fix this was discussed here: https://marc.info/?l=linux-mm&m=152964770000672&w=2 This is another approach. To avoid using the uninitialized struct page, immediately return with KPF_RESERVED at the beginning of stable_page_flags() if the page is reserved by ZONE_DEVICE driver. Dan said: : The nvdimm implementation uses vmem_altmap to arrange for the 'struct : page' array to be allocated from a reservation of a pmem namespace. A : namespace in this mode contains an info-block that consumes the first : 8K of the namespace capacity, capacity designated for page mapping, : capacity for padding the start of data to optionally 4K, 2MB, or 1GB : (on x86), and then the namespace data itself. The implementation : specifies a section aligned (now sub-section aligned) address to : arch_add_memory() to establish the linear mapping to map the metadata, : and then vmem_altmap indicates to memmap_init_zone() which pfns : represent data. The implementation only specifies enough 'struct page' : capacity for pfn_to_page() to operate on the data space, not the : namespace metadata space. : : The proposal to validate ZONE_DEVICE pfns against the altmap seems the : right approach to me. 
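For reference, a minimal userspace sketch (illustrative only, not part of this patch) of the access pattern that hit this crash: /proc/kpageflags exports one little-endian u64 of KPF_* bits per page frame, read at offset pfn * 8, which is what the page-types tool does under the hood. Reading it requires root.

	/* kpf.c - print the kpageflags word for one pfn (sketch) */
	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>

	int main(int argc, char **argv)
	{
		uint64_t flags;
		unsigned long pfn;
		int fd;

		if (argc < 2)
			return 1;
		pfn = strtoul(argv[1], NULL, 0);
		fd = open("/proc/kpageflags", O_RDONLY);
		if (fd < 0)
			return 1;
		/* One u64 of flag bits per pfn, at offset pfn * 8. */
		if (pread(fd, &flags, sizeof(flags), pfn * sizeof(flags))
				!= sizeof(flags))
			return 1;
		/* With this patch, a pfn reserved by a ZONE_DEVICE driver
		 * reports only KPF_RESERVED (bit 32) instead of oopsing. */
		printf("pfn %lu: flags %#llx\n", pfn, (unsigned long long)flags);
		close(fd);
		return 0;
	}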
Link: http://lkml.kernel.org/r/20190725023100.31141-3-t-fukasawa@vx.jp.nec.com Signed-off-by: Toshiki Fukasawa Cc: Alexey Dobriyan Cc: Christoph Hellwig Cc: Dan Williams Cc: Junichi Nomura Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/proc/page.c | 3 +++ include/linux/memremap.h | 6 ++++++ mm/memremap.c | 20 ++++++++++++++++++++ 3 files changed, 29 insertions(+) diff --git a/fs/proc/page.c b/fs/proc/page.c index 265f4fca15e29..4dcbcd506cb6e 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -117,6 +117,9 @@ u64 stable_page_flags(struct page *page) if (!page) return BIT_ULL(KPF_NOPAGE); + if (pfn_zone_device_reserved(page_to_pfn(page))) + return BIT_ULL(KPF_RESERVED); + k = page->flags; u = 0; diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 1fafcc38acbad..eea1b5cf25716 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -130,6 +130,7 @@ static inline unsigned long pgmap_vmemmap_nr(struct dev_pagemap *pgmap) } #ifdef CONFIG_ZONE_DEVICE +bool pfn_zone_device_reserved(unsigned long pfn); void *memremap_pages(struct dev_pagemap *pgmap, int nid); void memunmap_pages(struct dev_pagemap *pgmap); void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap); @@ -142,6 +143,11 @@ unsigned long vmem_altmap_offset(struct vmem_altmap *altmap); void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns); unsigned long memremap_compat_align(void); #else +static inline bool pfn_zone_device_reserved(unsigned long pfn) +{ + return false; +} + static inline void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) { diff --git a/mm/memremap.c b/mm/memremap.c index 6aa5f0c2d11fd..d2a72cf2ff831 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -115,6 +115,26 @@ static unsigned long pfn_len(struct dev_pagemap *pgmap, unsigned long range_id) pfn_first(pgmap, range_id)) >> pgmap->vmemmap_shift; } +/* + * This returns true if the page is reserved by ZONE_DEVICE driver. + */ +bool pfn_zone_device_reserved(unsigned long pfn) +{ + struct dev_pagemap *pgmap; + struct vmem_altmap *altmap; + bool ret = false; + + pgmap = get_dev_pagemap(pfn, NULL); + if (!pgmap) + return ret; + altmap = pgmap_altmap(pgmap); + if (altmap && pfn < (altmap->base_pfn + altmap->reserve)) + ret = true; + put_dev_pagemap(pgmap); + + return ret; +} + #define for_each_device_pfn(pfn, map, i) \ for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); \ pfn = pfn_next(map, pfn)) From 4f80aaf537d31b63f9a312971e4544b4ab317781 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Thu, 27 Jan 2022 16:25:44 +1100 Subject: [PATCH 025/154] procfs: prevent unprivileged processes accessing fdinfo dir MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The file permissions on the fdinfo dir were changed from S_IRUSR|S_IXUSR to S_IRUGO|S_IXUGO, and a PTRACE_MODE_READ check was added for opening the fdinfo files [1]. However, the ptrace permission check was not added to the directory, allowing anyone to get the open FD numbers by reading the fdinfo directory. Add the missing ptrace permission check for opening the fdinfo directory. [1] https://lkml.kernel.org/r/20210308170651.919148-1-kaleshsingh@google.com Link: https://lkml.kernel.org/r/20210713162008.1056986-1-kaleshsingh@google.com Fixes: 7bc3fa0172a4 ("procfs: allow reading fdinfo with PTRACE_MODE_READ") Signed-off-by: Kalesh Singh Cc: Kees Cook Cc: Eric W.
Biederman Cc: Christian Brauner Cc: Christian König Cc: Suren Baghdasaryan Cc: Hridya Valsaraju Cc: Jann Horn Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/proc/fd.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 172c86270b312..913bef0d2a36c 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -72,7 +72,7 @@ static int seq_show(struct seq_file *m, void *v) return 0; } -static int seq_fdinfo_open(struct inode *inode, struct file *file) +static int proc_fdinfo_access_allowed(struct inode *inode) { bool allowed = false; struct task_struct *task = get_proc_task(inode); @@ -86,6 +86,16 @@ static int seq_fdinfo_open(struct inode *inode, struct file *file) if (!allowed) return -EACCES; + return 0; +} + +static int seq_fdinfo_open(struct inode *inode, struct file *file) +{ + int ret = proc_fdinfo_access_allowed(inode); + + if (ret) + return ret; + return single_open(file, seq_show, inode); } @@ -348,12 +358,23 @@ static int proc_readfdinfo(struct file *file, struct dir_context *ctx) proc_fdinfo_instantiate); } +static int proc_open_fdinfo(struct inode *inode, struct file *file) +{ + int ret = proc_fdinfo_access_allowed(inode); + + if (ret) + return ret; + + return 0; +} + const struct inode_operations proc_fdinfo_inode_operations = { .lookup = proc_lookupfdinfo, .setattr = proc_setattr, }; const struct file_operations proc_fdinfo_operations = { + .open = proc_open_fdinfo, .read = generic_read_dir, .iterate_shared = proc_readfdinfo, .llseek = generic_file_llseek, From 52e5096724012b54f65e2f3df49451021024deab Mon Sep 17 00:00:00 2001 From: Dongliang Mu Date: Thu, 27 Jan 2022 16:25:44 +1100 Subject: [PATCH 026/154] ntfs: add sanity check on allocation size ntfs_read_inode_mount() invokes ntfs_malloc_nofs() with a zero allocation size, which triggers a BUG in the __ntfs_malloc() function. Fix this by adding a sanity check on ni->attr_list_size. Link: https://lkml.kernel.org/r/20220120094914.47736-1-dzm91@hust.edu.cn Reported-by: syzbot+3c765c5248797356edaa@syzkaller.appspotmail.com Signed-off-by: Dongliang Mu Acked-by: Anton Altaparmakov Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/ntfs/inode.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 4474adb393ca8..517b71c73aa96 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -1881,6 +1881,10 @@ int ntfs_read_inode_mount(struct inode *vi) } /* Now allocate memory for the attribute list. */ ni->attr_list_size = (u32)ntfs_attr_size(a); + if (!ni->attr_list_size) { + ntfs_error(sb, "Attr_list_size is zero"); + goto put_err_out; + } ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size); if (!ni->attr_list) { ntfs_error(sb, "Not enough memory to allocate buffer " From 80344dfe34bccd55d726552f8a312c639856c992 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Thu, 27 Jan 2022 16:25:44 +1100 Subject: [PATCH 027/154] ocfs2: cleanup some return variables Simply return directly instead of assigning the return value to another variable.
Link: https://lkml.kernel.org/r/20220114021641.13927-1-joseph.qi@linux.alibaba.com Signed-off-by: Joseph Qi Reported-by: Zeal Robot Cc: Minghao Chi Cc: CGEL ZTE Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/ocfs2/file.c | 9 +++------ fs/ocfs2/stack_user.c | 18 ++++++------------ 2 files changed, 9 insertions(+), 18 deletions(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index fc5f780fa2355..24321c44cd42e 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -540,15 +540,12 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb, struct ocfs2_alloc_context *meta_ac, enum ocfs2_alloc_restarted *reason_ret) { - int ret; struct ocfs2_extent_tree et; ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh); - ret = ocfs2_add_clusters_in_btree(handle, &et, logical_offset, - clusters_to_add, mark_unwritten, - data_ac, meta_ac, reason_ret); - - return ret; + return ocfs2_add_clusters_in_btree(handle, &et, logical_offset, + clusters_to_add, mark_unwritten, + data_ac, meta_ac, reason_ret); } static int ocfs2_extend_allocation(struct inode *inode, u32 logical_start, diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 85a47621e0c07..a75e2b7d67f56 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -683,28 +683,22 @@ static int user_dlm_lock(struct ocfs2_cluster_connection *conn, void *name, unsigned int namelen) { - int ret; - if (!lksb->lksb_fsdlm.sb_lvbptr) lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb + sizeof(struct dlm_lksb); - ret = dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm, - flags|DLM_LKF_NODLCKWT, name, namelen, 0, - fsdlm_lock_ast_wrapper, lksb, - fsdlm_blocking_ast_wrapper); - return ret; + return dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm, + flags|DLM_LKF_NODLCKWT, name, namelen, 0, + fsdlm_lock_ast_wrapper, lksb, + fsdlm_blocking_ast_wrapper); } static int user_dlm_unlock(struct ocfs2_cluster_connection *conn, struct ocfs2_dlm_lksb *lksb, u32 flags) { - int ret; - - ret = dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid, - flags, &lksb->lksb_fsdlm, lksb); - return ret; + return dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid, + flags, &lksb->lksb_fsdlm, lksb); } static int user_dlm_lock_status(struct ocfs2_dlm_lksb *lksb) From b2ba5e19d762b9e4e545d96aa5156260e1a26f4a Mon Sep 17 00:00:00 2001 From: Gang He Date: Thu, 27 Jan 2022 16:25:44 +1100 Subject: [PATCH 028/154] ocfs2: reflink deadlock when clone file to the same directory simultaneously Running reflink from multiple nodes simultaneously to clone a file to the same directory can trigger a deadlock. For example, there is a three node ocfs2 cluster, each node mounts the ocfs2 file system at /mnt/shared, and the reflink command is run from each node repeatedly, like reflink "/mnt/shared/test" \ "/mnt/shared/.snapshots/test.`date +%m%d%H%M%S`.`hostname`" then the reflink command processes hang on each node, and you can no longer list this file system directory. The problematic reflink command process is blocked at one node, task:reflink state:D stack: 0 pid: 1283 ppid: 4154 Call Trace: __schedule+0x2fd/0x750 schedule+0x2f/0xa0 schedule_timeout+0x1cc/0x310 ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user] ? 0xffffffffc0e3e000 wait_for_completion+0xba/0x140 ? wake_up_q+0xa0/0xa0 __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2] ?
ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2] ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2] ocfs2_init_security_and_acl+0xbe/0x1d0 [ocfs2] ocfs2_reflink+0x436/0x4c0 [ocfs2] ? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2] ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2] ocfs2_ioctl+0x25e/0x670 [ocfs2] do_vfs_ioctl+0xa0/0x680 ksys_ioctl+0x70/0x80 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x5b/0x1e0 The other reflink command processes are blocked at other nodes, task:reflink state:D stack: 0 pid:29759 ppid: 4088 Call Trace: __schedule+0x2fd/0x750 schedule+0x2f/0xa0 schedule_timeout+0x1cc/0x310 ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user] ? 0xffffffffc0b19000 wait_for_completion+0xba/0x140 ? wake_up_q+0xa0/0xa0 __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2] ? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2] ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2] ocfs2_mv_orphaned_inode_to_new+0x87/0x7e0 [ocfs2] ocfs2_reflink+0x335/0x4c0 [ocfs2] ? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2] ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2] ocfs2_ioctl+0x25e/0x670 [ocfs2] do_vfs_ioctl+0xa0/0x680 ksys_ioctl+0x70/0x80 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x5b/0x1e0 or task:reflink state:D stack: 0 pid:18465 ppid: 4156 Call Trace: __schedule+0x302/0x940 ? usleep_range+0x80/0x80 schedule+0x46/0xb0 schedule_timeout+0xff/0x140 ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user] ? 0xffffffffc0c3b000 __wait_for_common+0xb9/0x170 __ocfs2_cluster_lock.constprop.0+0x1d6/0x860 [ocfs2] ? ocfs2_wait_for_recovery+0x49/0xd0 [ocfs2] ? ocfs2_inode_lock_full_nested+0x30f/0xa50 [ocfs2] ocfs2_inode_lock_full_nested+0x30f/0xa50 [ocfs2] ocfs2_inode_lock_tracker+0xf2/0x2b0 [ocfs2] ? dput+0x32/0x2f0 ocfs2_permission+0x45/0xe0 [ocfs2] inode_permission+0xcc/0x170 link_path_walk.part.0.constprop.0+0x2a2/0x380 ? path_init+0x2c1/0x3f0 path_parentat+0x3c/0x90 filename_parentat+0xc1/0x1d0 ? filename_lookup+0x138/0x1c0 filename_create+0x43/0x160 ocfs2_reflink_ioctl+0xe6/0x380 [ocfs2] ocfs2_ioctl+0x1ea/0x2c0 [ocfs2] ? do_sys_openat2+0x81/0x150 __x64_sys_ioctl+0x82/0xb0 do_syscall_64+0x61/0xb0 The deadlock is caused by acquiring the destination directory inode dlm lock multiple times in the ocfs2_reflink() path; instead, acquire this directory inode dlm lock once at the beginning and hold it until the end of the function.
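In outline, the locking change amounts to the following (a simplified sketch distilled from the diff below, with error handling omitted):

	/* Sketch of the reworked ocfs2_reflink() flow, not verbatim code. */
	error = ocfs2_create_inode_in_orphan(dir, &dir_bh, inode->i_mode,
					     &new_orphan_inode);
	/* ^ now takes the dir dlm lock and hands back dir_bh */
	...
	error = ocfs2_init_security_and_acl(dir, dir_bh, new_orphan_inode,
					    &new_dentry->d_name);
	/* ^ reuses the already-held lock instead of retaking it */
	error = ocfs2_mv_orphaned_inode_to_new(dir, dir_bh, new_orphan_inode,
					       new_dentry);
	/* ^ likewise */
	...
	if (dir_bh) {
		ocfs2_inode_unlock(dir, 1);	/* single unlock at the end */
		brelse(dir_bh);
	}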
Link: https://lkml.kernel.org/r/20210729110230.18983-1-ghe@suse.com Signed-off-by: Gang He Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/ocfs2/namei.c | 32 +++++++++++++------------------- fs/ocfs2/namei.h | 2 ++ fs/ocfs2/refcounttree.c | 15 +++++++++++---- fs/ocfs2/xattr.c | 12 +----------- fs/ocfs2/xattr.h | 1 + 5 files changed, 28 insertions(+), 34 deletions(-) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 2c46ff6ba4ea2..f8bbb22cc60ba 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -2489,6 +2489,7 @@ static int ocfs2_prep_new_orphaned_file(struct inode *dir, } int ocfs2_create_inode_in_orphan(struct inode *dir, + struct buffer_head **dir_bh, int mode, struct inode **new_inode) { @@ -2597,13 +2598,16 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, brelse(new_di_bh); - if (!status) - *new_inode = inode; - ocfs2_free_dir_lookup_result(&orphan_insert); - ocfs2_inode_unlock(dir, 1); - brelse(parent_di_bh); + if (!status) { + *new_inode = inode; + *dir_bh = parent_di_bh; + } else { + ocfs2_inode_unlock(dir, 1); + brelse(parent_di_bh); + } + return status; } @@ -2760,11 +2764,11 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, } int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, + struct buffer_head *dir_bh, struct inode *inode, struct dentry *dentry) { int status = 0; - struct buffer_head *parent_di_bh = NULL; handle_t *handle = NULL; struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); struct ocfs2_dinode *dir_di, *di; @@ -2778,14 +2782,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, (unsigned long long)OCFS2_I(dir)->ip_blkno, (unsigned long long)OCFS2_I(inode)->ip_blkno); - status = ocfs2_inode_lock(dir, &parent_di_bh, 1); - if (status < 0) { - if (status != -ENOENT) - mlog_errno(status); - return status; - } - - dir_di = (struct ocfs2_dinode *) parent_di_bh->b_data; + dir_di = (struct ocfs2_dinode *) dir_bh->b_data; if (!dir_di->i_links_count) { /* can't make a file in a deleted directory. */ status = -ENOENT; @@ -2798,7 +2795,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, goto leave; /* get a spot inside the dir. 
*/ - status = ocfs2_prepare_dir_for_insert(osb, dir, parent_di_bh, + status = ocfs2_prepare_dir_for_insert(osb, dir, dir_bh, dentry->d_name.name, dentry->d_name.len, &lookup); if (status < 0) { @@ -2862,7 +2859,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, ocfs2_journal_dirty(handle, di_bh); status = ocfs2_add_entry(handle, dentry, inode, - OCFS2_I(inode)->ip_blkno, parent_di_bh, + OCFS2_I(inode)->ip_blkno, dir_bh, &lookup); if (status < 0) { mlog_errno(status); @@ -2886,10 +2883,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, iput(orphan_dir_inode); leave: - ocfs2_inode_unlock(dir, 1); - brelse(di_bh); - brelse(parent_di_bh); brelse(orphan_dir_bh); ocfs2_free_dir_lookup_result(&lookup); diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h index 9cc891eb874e0..03a2c526e2c1b 100644 --- a/fs/ocfs2/namei.h +++ b/fs/ocfs2/namei.h @@ -24,6 +24,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, struct buffer_head *orphan_dir_bh, bool dio); int ocfs2_create_inode_in_orphan(struct inode *dir, + struct buffer_head **dir_bh, int mode, struct inode **new_inode); int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, @@ -32,6 +33,7 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, struct inode *inode, struct buffer_head *di_bh, int update_isize, loff_t end); int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, + struct buffer_head *dir_bh, struct inode *new_inode, struct dentry *new_dentry); diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 7f6355cbb5875..a9a0c7c37e8ed 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4250,7 +4250,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, { int error, had_lock; struct inode *inode = d_inode(old_dentry); - struct buffer_head *old_bh = NULL; + struct buffer_head *old_bh = NULL, *dir_bh = NULL; struct inode *new_orphan_inode = NULL; struct ocfs2_lock_holder oh; @@ -4258,7 +4258,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, return -EOPNOTSUPP; - error = ocfs2_create_inode_in_orphan(dir, inode->i_mode, + error = ocfs2_create_inode_in_orphan(dir, &dir_bh, inode->i_mode, &new_orphan_inode); if (error) { mlog_errno(error); @@ -4304,13 +4304,15 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, /* If the security isn't preserved, we need to re-initialize them. */ if (!preserve) { - error = ocfs2_init_security_and_acl(dir, new_orphan_inode, + error = ocfs2_init_security_and_acl(dir, dir_bh, + new_orphan_inode, &new_dentry->d_name); if (error) mlog_errno(error); } if (!error) { - error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode, + error = ocfs2_mv_orphaned_inode_to_new(dir, dir_bh, + new_orphan_inode, new_dentry); if (error) mlog_errno(error); @@ -4328,6 +4330,11 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, iput(new_orphan_inode); } + if (dir_bh) { + ocfs2_inode_unlock(dir, 1); + brelse(dir_bh); + } + return error; } diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index dd784eb0cd7c4..3f23e3a5018ce 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -7203,16 +7203,13 @@ int ocfs2_reflink_xattrs(struct inode *old_inode, /* * Initialize security and acl for a already created inode. * Used for reflink a non-preserve-security file. - * - * It uses common api like ocfs2_xattr_set, so the caller - * must not hold any lock expect i_mutex. 
*/ int ocfs2_init_security_and_acl(struct inode *dir, + struct buffer_head *dir_bh, struct inode *inode, const struct qstr *qstr) { int ret = 0; - struct buffer_head *dir_bh = NULL; ret = ocfs2_init_security_get(inode, dir, qstr, NULL); if (ret) { @@ -7220,17 +7217,10 @@ int ocfs2_init_security_and_acl(struct inode *dir, goto leave; } - ret = ocfs2_inode_lock(dir, &dir_bh, 0); - if (ret) { - mlog_errno(ret); - goto leave; - } ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL); if (ret) mlog_errno(ret); - ocfs2_inode_unlock(dir, 0); - brelse(dir_bh); leave: return ret; } diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index 00308b57f64f1..b27fd8ba00196 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -83,6 +83,7 @@ int ocfs2_reflink_xattrs(struct inode *old_inode, struct buffer_head *new_bh, bool preserve_security); int ocfs2_init_security_and_acl(struct inode *dir, + struct buffer_head *dir_bh, struct inode *inode, const struct qstr *qstr); #endif /* OCFS2_XATTR_H */ From 1d293c2fac4092509dcd30cc6eea451e5ad98c6e Mon Sep 17 00:00:00 2001 From: Wangyan Date: Thu, 27 Jan 2022 16:25:45 +1100 Subject: [PATCH 029/154] ocfs2: clear links count in ocfs2_mknod() if an error occurs In the following path, the inode cannot be wiped when an error happens. ocfs2_mkdir() ->ocfs2_mknod() ->ocfs2_mknod_locked() ->__ocfs2_mknod_locked() ->ocfs2_set_links_count() // i_links_count is 2 -> ... // an error occurs, goto roll_back or leave. ->ocfs2_commit_trans() ->iput(inode) ->evict() ->ocfs2_evict_inode() ->ocfs2_delete_inode() ->ocfs2_inode_lock() ->ocfs2_inode_lock_update() ->ocfs2_refresh_inode() ->set_nlink(); // inode->i_nlink is 2 now. /* if wipe is 0, it will goto bail_unlock_inode */ ->ocfs2_query_inode_wipe() ->if (inode->i_nlink) return; // wipe is 0. /* inode can not be wiped */ ->ocfs2_wipe_inode() So we need to clear the links count before the transaction is committed, as sketched below.
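For context, this is roughly the check the call chain above runs into (a paraphrase of ocfs2_query_inode_wipe(), not the verbatim kernel code; the real function takes more parameters and handles more cases):

	static void ocfs2_query_inode_wipe(struct inode *inode, int *wipe)
	{
		*wipe = 0;
		/* i_nlink is still 2 without this fix, so the orphan
		 * is never wiped; it is 0 with the fix, so it is. */
		if (inode->i_nlink)
			return;
		*wipe = 1;
	}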
Link: http://lkml.kernel.org/r/d8147c41-fb2b-bdf7-b660-1f3c8448c33f@huawei.com Signed-off-by: Yan Wang Reviewed-by: Jun Piao Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/ocfs2/namei.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index f8bbb22cc60ba..a5c975715aa05 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -453,8 +453,12 @@ static int ocfs2_mknod(struct user_namespace *mnt_userns, leave: if (status < 0 && did_quota_inode) dquot_free_inode(inode); - if (handle) + if (handle) { + if (status < 0 && new_fe_bh != NULL) + ocfs2_set_links_count((struct ocfs2_dinode *) + new_fe_bh->b_data, 0); ocfs2_commit_trans(osb, handle); + } ocfs2_inode_unlock(dir, 1); if (did_block_signals) @@ -598,6 +602,8 @@ static int __ocfs2_mknod_locked(struct inode *dir, leave: if (status < 0) { if (*new_fe_bh) { + if (fe) + ocfs2_set_links_count(fe, 0); brelse(*new_fe_bh); *new_fe_bh = NULL; } @@ -2027,8 +2033,12 @@ static int ocfs2_symlink(struct user_namespace *mnt_userns, ocfs2_clusters_to_bytes(osb->sb, 1)); if (status < 0 && did_quota_inode) dquot_free_inode(inode); - if (handle) + if (handle) { + if (status < 0 && new_fe_bh != NULL) + ocfs2_set_links_count((struct ocfs2_dinode *) + new_fe_bh->b_data, 0); ocfs2_commit_trans(osb, handle); + } ocfs2_inode_unlock(dir, 1); if (did_block_signals) From bfd348bfd9ea08e6b6585ad34233f0f549b51bbe Mon Sep 17 00:00:00 2001 From: Wangyan Date: Thu, 27 Jan 2022 16:25:45 +1100 Subject: [PATCH 030/154] ocfs2: fix ocfs2 corruption when iputting an inode In the following path, an error triggers a BUG(). ocfs2_mkdir() ->ocfs2_mknod() ->ocfs2_mknod_locked() ->__ocfs2_mknod_locked() //Assume inode->i_generation is genN. ->inode->i_generation = osb->s_next_generation++; // The inode lockres has been initialized. ->ocfs2_populate_inode() ->ocfs2_create_new_inode_locks() ->An error happens, the return value is non-zero // free the start_bit x in bg_blkno ->ocfs2_free_suballoc_bits() ->... /* Another process executes mkdir successfully at this point, and it occupies the start_bit x in bg_blkno which was freed before. Its inode->i_generation is genN + 1 */ ->iput(inode) ->evict() ->ocfs2_evict_inode() ->ocfs2_delete_inode() ->ocfs2_inode_lock() ->ocfs2_inode_lock_update() /* Bug on here, genN != genN + 1 */ ->mlog_bug_on_msg(inode->i_generation != le32_to_cpu(fe->i_generation)) So we should not reclaim the inode when inode->ip_inode_lockres has been initialized; it will be freed in iput().
Link: http://lkml.kernel.org/r/ef080ca3-5d74-e276-17a1-d9e7c7e662c9@huawei.com Fixes: b1529a41f777 ("ocfs2: should reclaim the inode if '__ocfs2_mknod_locked' returns an error") Signed-off-by: Yan Wang Reviewed-by: Jun Piao Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/ocfs2/namei.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index a5c975715aa05..0939186f010e3 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -640,7 +640,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, status = __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh, parent_fe_bh, handle, inode_ac, fe_blkno, suballoc_loc, suballoc_bit); - if (status < 0) { + if (status < 0 && !(OCFS2_I(inode)->ip_inode_lockres.l_flags & + OCFS2_LOCK_INITIALIZED)) { u64 bg_blkno = ocfs2_which_suballoc_group(fe_blkno, suballoc_bit); int tmp = ocfs2_free_suballoc_bits(handle, inode_ac->ac_inode, inode_ac->ac_bh, suballoc_bit, bg_blkno, 1); From 91ac7fcace9ea883d7c123b12752b05ec2b81282 Mon Sep 17 00:00:00 2001 From: Anthony Iliopoulos Date: Thu, 27 Jan 2022 16:25:45 +1100 Subject: [PATCH 031/154] mount: warn only once about timestamp range expiration Commit f8b92ba67c5d ("mount: Add mount warning for impending timestamp expiry") introduced a mount warning regarding filesystem timestamp limits, that is printed upon each writable mount or remount. This can result in a lot of unnecessary messages in the kernel log in setups where filesystems are being frequently remounted (or mounted multiple times). Avoid this by setting a superblock flag which indicates that the warning has been emitted at least once for any particular mount, as suggested in [1]. [1] https://lore.kernel.org/CAHk-=wim6VGnxQmjfK_tDg6fbHYKL4EFkmnTjVr9QnRqjDBAeA@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220119202934.26495-1-ailiop@suse.com Signed-off-by: Anthony Iliopoulos Reviewed-by: Christoph Hellwig Acked-by: Christian Brauner Reviewed-by: Darrick J. Wong Cc: Alexander Viro Cc: Deepa Dinamani Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/namespace.c | 2 ++ include/linux/fs.h | 1 + 2 files changed, 3 insertions(+) diff --git a/fs/namespace.c b/fs/namespace.c index 40b994a29e90d..a090cf92e5057 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2567,6 +2567,7 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount * struct super_block *sb = mnt->mnt_sb; if (!__mnt_is_readonly(mnt) && + (!(sb->s_iflags & SB_I_TS_EXPIRY_WARNED)) && (ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) { char *buf = (char *)__get_free_page(GFP_KERNEL); char *mntpath = buf ? 
d_path(mountpoint, buf, PAGE_SIZE) : ERR_PTR(-ENOMEM); @@ -2581,6 +2582,7 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount * tm.tm_year+1900, (unsigned long long)sb->s_time_max); free_page((unsigned long)buf); + sb->s_iflags |= SB_I_TS_EXPIRY_WARNED; } } diff --git a/include/linux/fs.h b/include/linux/fs.h index f3daaea165548..5c537cd9b0060 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1435,6 +1435,7 @@ extern int send_sigurg(struct fown_struct *fown); #define SB_I_SKIP_SYNC 0x00000100 /* Skip superblock at global sync */ #define SB_I_PERSB_BDI 0x00000200 /* has a per-sb bdi */ +#define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */ /* Possible states of 'frozen' field */ enum { From 64748cf67a6b75dde881e282529dbf6a22979786 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 27 Jan 2022 16:25:45 +1100 Subject: [PATCH 032/154] kasan, page_alloc: deduplicate should_skip_kasan_poison Patch series "kasan, vmalloc, arm64: add vmalloc tagging support for SW/HW_TAGS", v6. This patchset adds vmalloc tagging support for SW_TAGS and HW_TAGS KASAN modes. About half of patches are cleanups I went for along the way. None of them seem to be important enough to go through stable, so I decided not to split them out into separate patches/series. The patchset is partially based on an early version of the HW_TAGS patchset by Vincenzo that had vmalloc support. Thus, I added a Co-developed-by tag into a few patches. SW_TAGS vmalloc tagging support is straightforward. It reuses all of the generic KASAN machinery, but uses shadow memory to store tags instead of magic values. Naturally, vmalloc tagging requires adding a few kasan_reset_tag() annotations to the vmalloc code. HW_TAGS vmalloc tagging support stands out. HW_TAGS KASAN is based on Arm MTE, which can only assigns tags to physical memory. As a result, HW_TAGS KASAN only tags vmalloc() allocations, which are backed by page_alloc memory. It ignores vmap() and others. This patch (of 39): Currently, should_skip_kasan_poison() has two definitions: one for when CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled, one for when it's not. Instead of duplicating the checks, add a deferred_pages_enabled() helper and use it in a single should_skip_kasan_poison() definition. Also move should_skip_kasan_poison() closer to its caller and clarify all conditions in the comment. Link: https://lkml.kernel.org/r/cover.1643047180.git.andreyknvl@google.com Link: https://lkml.kernel.org/r/658b79f5fb305edaf7dc16bc52ea870d3220d4a8.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Vincenzo Frascino Cc: Catalin Marinas Cc: Will Deacon Cc: Mark Rutland Cc: Peter Collingbourne Cc: Evgenii Stepanov Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_alloc.c | 55 +++++++++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 22 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3589febc6d319..25d4f9ad35258 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -378,25 +378,9 @@ int page_group_by_mobility_disabled __read_mostly; */ static DEFINE_STATIC_KEY_TRUE(deferred_pages); -/* - * Calling kasan_poison_pages() only after deferred memory initialization - * has completed. 
Poisoning pages during deferred memory init will greatly - * lengthen the process and cause problem in large memory systems as the - * deferred pages initialization is done with interrupt disabled. - * - * Assuming that there will be no reference to those newly initialized - * pages before they are ever allocated, this should have no effect on - * KASAN memory tracking as the poison will be properly inserted at page - * allocation time. The only corner case is when pages are allocated by - * on-demand allocation and then freed again before the deferred pages - * initialization is done, but this is not likely to happen. - */ -static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) +static inline bool deferred_pages_enabled(void) { - return static_branch_unlikely(&deferred_pages) || - (!IS_ENABLED(CONFIG_KASAN_GENERIC) && - (fpi_flags & FPI_SKIP_KASAN_POISON)) || - PageSkipKASanPoison(page); + return static_branch_unlikely(&deferred_pages); } /* Returns true if the struct page for the pfn is uninitialised */ @@ -447,11 +431,9 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn) return false; } #else -static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) +static inline bool deferred_pages_enabled(void) { - return (!IS_ENABLED(CONFIG_KASAN_GENERIC) && - (fpi_flags & FPI_SKIP_KASAN_POISON)) || - PageSkipKASanPoison(page); + return false; } static inline bool early_page_uninitialised(unsigned long pfn) @@ -1271,6 +1253,35 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) return ret; } +/* + * Skip KASAN memory poisoning when either: + * + * 1. Deferred memory initialization has not yet completed, + * see the explanation below. + * 2. Skipping poisoning is requested via FPI_SKIP_KASAN_POISON, + * see the comment next to it. + * 3. Skipping poisoning is requested via __GFP_SKIP_KASAN_POISON, + * see the comment next to it. + * + * Poisoning pages during deferred memory init will greatly lengthen the + * process and cause problem in large memory systems as the deferred pages + * initialization is done with interrupt disabled. + * + * Assuming that there will be no reference to those newly initialized + * pages before they are ever allocated, this should have no effect on + * KASAN memory tracking as the poison will be properly inserted at page + * allocation time. The only corner case is when pages are allocated by + * on-demand allocation and then freed again before the deferred pages + * initialization is done, but this is not likely to happen. + */ +static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) +{ + return deferred_pages_enabled() || + (!IS_ENABLED(CONFIG_KASAN_GENERIC) && + (fpi_flags & FPI_SKIP_KASAN_POISON)) || + PageSkipKASanPoison(page); +} + static void kernel_init_free_pages(struct page *page, int numpages, bool zero_tags) { int i; From 9e21137d76ac5292868f31c50f843fd923e92212 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 27 Jan 2022 16:25:46 +1100 Subject: [PATCH 033/154] kasan, page_alloc: move tag_clear_highpage out of kernel_init_free_pages Currently, kernel_init_free_pages() serves two purposes: it either only zeroes memory or zeroes both memory and memory tags via a different code path. As this function has only two callers, each using only one code path, this behaviour is confusing. Pull the code that zeroes both memory and tags out of kernel_init_free_pages(). 
As a result of this change, the code in free_pages_prepare() starts to look complicated, but this is improved in the few following patches. Those improvements are not integrated into this patch to make diffs easier to read. This patch does no functional changes. Link: https://lkml.kernel.org/r/7719874e68b23902629c7cf19f966c4fd5f57979.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Acked-by: Marco Elver Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_alloc.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 25d4f9ad35258..012170b1c47aa 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1282,16 +1282,10 @@ static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) PageSkipKASanPoison(page); } -static void kernel_init_free_pages(struct page *page, int numpages, bool zero_tags) +static void kernel_init_free_pages(struct page *page, int numpages) { int i; - if (zero_tags) { - for (i = 0; i < numpages; i++) - tag_clear_highpage(page + i); - return; - } - /* s390's use of memset() could override KASAN redzones. */ kasan_disable_current(); for (i = 0; i < numpages; i++) { @@ -1387,7 +1381,7 @@ static __always_inline bool free_pages_prepare(struct page *page, bool init = want_init_on_free(); if (init) - kernel_init_free_pages(page, 1 << order, false); + kernel_init_free_pages(page, 1 << order); if (!skip_kasan_poison) kasan_poison_pages(page, order, init); } @@ -2430,9 +2424,17 @@ inline void post_alloc_hook(struct page *page, unsigned int order, bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags); kasan_unpoison_pages(page, order, init); - if (init) - kernel_init_free_pages(page, 1 << order, - gfp_flags & __GFP_ZEROTAGS); + + if (init) { + if (gfp_flags & __GFP_ZEROTAGS) { + int i; + + for (i = 0; i < 1 << order; i++) + tag_clear_highpage(page + i); + } else { + kernel_init_free_pages(page, 1 << order); + } + } } set_page_owner(page, order, gfp_flags); From c0a6431afea7568de0a427aca260388e6fe5a833 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 27 Jan 2022 16:25:46 +1100 Subject: [PATCH 034/154] kasan, page_alloc: merge kasan_free_pages into free_pages_prepare Currently, the code responsible for initializing and poisoning memory in free_pages_prepare() is scattered across two locations: kasan_free_pages() for HW_TAGS KASAN and free_pages_prepare() itself. This is confusing. This and a few following patches combine the code from these two locations. Along the way, these patches also simplify the performed checks to make them easier to follow. Replaces the only caller of kasan_free_pages() with its implementation. As kasan_has_integrated_init() is only true when CONFIG_KASAN_HW_TAGS is enabled, moving the code does no functional changes. This patch is not useful by itself but makes the simplifications in the following patches easier to follow. 
Link: https://lkml.kernel.org/r/303498d15840bb71905852955c6e2390ecc87139.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Acked-by: Marco Elver Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/kasan.h | 8 -------- mm/kasan/common.c | 2 +- mm/kasan/hw_tags.c | 11 ----------- mm/page_alloc.c | 6 ++++-- 4 files changed, 5 insertions(+), 22 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 4a45562d88937..a8bfe9f157c9c 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -96,7 +96,6 @@ static inline bool kasan_hw_tags_enabled(void) } void kasan_alloc_pages(struct page *page, unsigned int order, gfp_t flags); -void kasan_free_pages(struct page *page, unsigned int order); #else /* CONFIG_KASAN_HW_TAGS */ @@ -117,13 +116,6 @@ static __always_inline void kasan_alloc_pages(struct page *page, BUILD_BUG(); } -static __always_inline void kasan_free_pages(struct page *page, - unsigned int order) -{ - /* Only available for integrated init. */ - BUILD_BUG(); -} - #endif /* CONFIG_KASAN_HW_TAGS */ static inline bool kasan_has_integrated_init(void) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 92196562687b6..a0082fad48b12 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -387,7 +387,7 @@ static inline bool ____kasan_kfree_large(void *ptr, unsigned long ip) } /* - * The object will be poisoned by kasan_free_pages() or + * The object will be poisoned by kasan_poison_pages() or * kasan_slab_free_mempool(). */ diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 7355cb534e4f8..0b8225add2e48 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -213,17 +213,6 @@ void kasan_alloc_pages(struct page *page, unsigned int order, gfp_t flags) } } -void kasan_free_pages(struct page *page, unsigned int order) -{ - /* - * This condition should match the one in free_pages_prepare() in - * page_alloc.c. - */ - bool init = want_init_on_free(); - - kasan_poison_pages(page, order, init); -} - #if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) void kasan_enable_tagging_sync(void) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 012170b1c47aa..e5f95c6ab0ac6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1368,15 +1368,17 @@ static __always_inline bool free_pages_prepare(struct page *page, /* * As memory initialization might be integrated into KASAN, - * kasan_free_pages and kernel_init_free_pages must be + * KASAN poisoning and memory initialization code must be * kept together to avoid discrepancies in behavior. * * With hardware tag-based KASAN, memory tags must be set before the * page becomes unavailable via debug_pagealloc or arch_free_page. */ if (kasan_has_integrated_init()) { + bool init = want_init_on_free(); + if (!skip_kasan_poison) - kasan_free_pages(page, order); + kasan_poison_pages(page, order, init); } else { bool init = want_init_on_free(); From 542a82f0513d6a81b166cf435eb0a3cfa13f5b4b Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 27 Jan 2022 16:25:46 +1100 Subject: [PATCH 035/154] kasan, page_alloc: simplify kasan_poison_pages call site Simplify the code around calling kasan_poison_pages() in free_pages_prepare(). This patch does no functional changes. 
Link: https://lkml.kernel.org/r/ae4f9bcf071577258e786bcec4798c145d718c46.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Acked-by: Marco Elver Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_alloc.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e5f95c6ab0ac6..60bc838a4d853 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1302,6 +1302,7 @@ static __always_inline bool free_pages_prepare(struct page *page, { int bad = 0; bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags); + bool init = want_init_on_free(); VM_BUG_ON_PAGE(PageTail(page), page); @@ -1374,19 +1375,10 @@ static __always_inline bool free_pages_prepare(struct page *page, * With hardware tag-based KASAN, memory tags must be set before the * page becomes unavailable via debug_pagealloc or arch_free_page. */ - if (kasan_has_integrated_init()) { - bool init = want_init_on_free(); - - if (!skip_kasan_poison) - kasan_poison_pages(page, order, init); - } else { - bool init = want_init_on_free(); - - if (init) - kernel_init_free_pages(page, 1 << order); - if (!skip_kasan_poison) - kasan_poison_pages(page, order, init); - } + if (init && !kasan_has_integrated_init()) + kernel_init_free_pages(page, 1 << order); + if (!skip_kasan_poison) + kasan_poison_pages(page, order, init); /* * arch_free_page() can make the page's contents inaccessible. s390 From 198c90d52d550749028048dc6a8315d4ea21237e Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 27 Jan 2022 16:25:46 +1100 Subject: [PATCH 036/154] kasan, page_alloc: init memory of skipped pages on free Since commit 7a3b83537188 ("kasan: use separate (un)poison implementation for integrated init"), when all init, kasan_has_integrated_init(), and skip_kasan_poison are true, free_pages_prepare() doesn't initialize the page. This is wrong. Fix it by remembering whether kasan_poison_pages() performed initialization, and call kernel_init_free_pages() if it didn't. Reordering kasan_poison_pages() and kernel_init_free_pages() is OK, since kernel_init_free_pages() can handle poisoned memory. Link: https://lkml.kernel.org/r/1d97df75955e52727a3dc1c4e33b3b50506fc3fd.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_alloc.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 60bc838a4d853..f994fd68e3b11 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1375,11 +1375,16 @@ static __always_inline bool free_pages_prepare(struct page *page, * With hardware tag-based KASAN, memory tags must be set before the * page becomes unavailable via debug_pagealloc or arch_free_page. */ - if (init && !kasan_has_integrated_init()) - kernel_init_free_pages(page, 1 << order); - if (!skip_kasan_poison) + if (!skip_kasan_poison) { kasan_poison_pages(page, order, init); + /* Memory is already initialized if KASAN did it internally. 
*/ + if (kasan_has_integrated_init()) + init = false; + } + if (init) + kernel_init_free_pages(page, 1 << order); + /* * arch_free_page() can make the page's contents inaccessible. s390 * does this. So nothing which can access the page's contents should From 630c01634a88213c9c0ff552d99e7a31cc49e0e8 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 27 Jan 2022 16:25:47 +1100 Subject: [PATCH 037/154] kasan: drop skip_kasan_poison variable in free_pages_prepare skip_kasan_poison is only used in a single place. Call should_skip_kasan_poison() directly for simplicity. Link: https://lkml.kernel.org/r/1d33212e79bc9ef0b4d3863f903875823e89046f.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Suggested-by: Marco Elver Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_alloc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f994fd68e3b11..8481420d25021 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1301,7 +1301,6 @@ static __always_inline bool free_pages_prepare(struct page *page, unsigned int order, bool check_free, fpi_t fpi_flags) { int bad = 0; - bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags); bool init = want_init_on_free(); VM_BUG_ON_PAGE(PageTail(page), page); @@ -1375,7 +1374,7 @@ static __always_inline bool free_pages_prepare(struct page *page, * With hardware tag-based KASAN, memory tags must be set before the * page becomes unavailable via debug_pagealloc or arch_free_page. */ - if (!skip_kasan_poison) { + if (!should_skip_kasan_poison(page, fpi_flags)) { kasan_poison_pages(page, order, init); /* Memory is already initialized if KASAN did it internally. */ From 7023db5ba49e559dac67152bf95f8264941928d0 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 27 Jan 2022 16:25:47 +1100 Subject: [PATCH 038/154] mm: clarify __GFP_ZEROTAGS comment __GFP_ZEROTAGS is intended as an optimization: if memory is zeroed during allocation, it's possible to set memory tags at the same time with little performance impact. Clarify this intention of __GFP_ZEROTAGS in the comment. Link: https://lkml.kernel.org/r/cdffde013973c5634a447513e10ec0d21e8eee29.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/gfp.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 80f63c862be57..581a1f47b8a2c 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -232,8 +232,10 @@ struct vm_area_struct; * * %__GFP_ZERO returns a zeroed page on success. * - * %__GFP_ZEROTAGS returns a page with zeroed memory tags on success, if - * __GFP_ZERO is set. + * %__GFP_ZEROTAGS zeroes memory tags at allocation time if the memory itself + * is being zeroed (either via __GFP_ZERO or via init_on_alloc). This flag is + * intended for optimization: setting memory tags at the same time as zeroing + * memory has minimal additional performace impact. 
* * %__GFP_SKIP_KASAN_POISON returns a page which does not need to be poisoned * on deallocation. Typically used for userspace pages. Currently only has an From 51aac65a6ef286cb38e25283da2b18dcea2aa991 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 27 Jan 2022 16:25:47 +1100 Subject: [PATCH 039/154] kasan: only apply __GFP_ZEROTAGS when memory is zeroed __GFP_ZEROTAGS should only be effective if memory is being zeroed. Currently, hardware tag-based KASAN violates this requirement. Fix by including an initialization check along with checking for __GFP_ZEROTAGS. Link: https://lkml.kernel.org/r/f4f4593f7f675262d29d07c1938db5bd0cd5e285.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Acked-by: Marco Elver Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/kasan/hw_tags.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 0b8225add2e48..c643740b85996 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -199,11 +199,12 @@ void kasan_alloc_pages(struct page *page, unsigned int order, gfp_t flags) * page_alloc.c. */ bool init = !want_init_on_free() && want_init_on_alloc(flags); + bool init_tags = init && (flags & __GFP_ZEROTAGS); if (flags & __GFP_SKIP_KASAN_POISON) SetPageSkipKASanPoison(page); - if (flags & __GFP_ZEROTAGS) { + if (init_tags) { int i; for (i = 0; i != 1 << order; ++i) From 1bea1e02b2a4beacf658b2b6bcdec73a5d62bcc7 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 27 Jan 2022 16:25:47 +1100 Subject: [PATCH 040/154] kasan, page_alloc: refactor init checks in post_alloc_hook Separate code for zeroing memory from the code clearing tags in post_alloc_hook(). This patch is not useful by itself but makes the simplifications in the following patches easier to follow. This patch does no functional changes. 
Link: https://lkml.kernel.org/r/2283fde963adfd8a2b29a92066f106cc16661a3c.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Acked-by: Marco Elver Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_alloc.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8481420d25021..868480d463c78 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2420,19 +2420,21 @@ inline void post_alloc_hook(struct page *page, unsigned int order, kasan_alloc_pages(page, order, gfp_flags); } else { bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags); + bool init_tags = init && (gfp_flags & __GFP_ZEROTAGS); kasan_unpoison_pages(page, order, init); - if (init) { - if (gfp_flags & __GFP_ZEROTAGS) { - int i; + if (init_tags) { + int i; - for (i = 0; i < 1 << order; i++) - tag_clear_highpage(page + i); - } else { - kernel_init_free_pages(page, 1 << order); - } + for (i = 0; i < 1 << order; i++) + tag_clear_highpage(page + i); + + init = false; } + + if (init) + kernel_init_free_pages(page, 1 << order); } set_page_owner(page, order, gfp_flags); From 2a10554bf7e83f0613511f6781afe03cebab5a6c Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 27 Jan 2022 16:25:48 +1100 Subject: [PATCH 041/154] kasan, page_alloc: merge kasan_alloc_pages into post_alloc_hook Currently, the code responsible for initializing and poisoning memory in post_alloc_hook() is scattered across two locations: kasan_alloc_pages() hook for HW_TAGS KASAN and post_alloc_hook() itself. This is confusing. This and a few following patches combine the code from these two locations. Along the way, these patches do a step-by-step restructure the many performed checks to make them easier to follow. Replace the only caller of kasan_alloc_pages() with its implementation. As kasan_has_integrated_init() is only true when CONFIG_KASAN_HW_TAGS is enabled, moving the code does no functional changes. Also move init and init_tags variables definitions out of kasan_has_integrated_init() clause in post_alloc_hook(), as they have the same values regardless of what the if condition evaluates to. This patch is not useful by itself but makes the simplifications in the following patches easier to follow. 
Link: https://lkml.kernel.org/r/5ac7e0b30f5cbb177ec363ddd7878a3141289592.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/kasan.h | 9 --------- mm/kasan/common.c | 2 +- mm/kasan/hw_tags.c | 22 ---------------------- mm/page_alloc.c | 20 +++++++++++++++----- 4 files changed, 16 insertions(+), 37 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index a8bfe9f157c9c..b88ca6b97ba32 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -95,8 +95,6 @@ static inline bool kasan_hw_tags_enabled(void) return kasan_enabled(); } -void kasan_alloc_pages(struct page *page, unsigned int order, gfp_t flags); - #else /* CONFIG_KASAN_HW_TAGS */ static inline bool kasan_enabled(void) @@ -109,13 +107,6 @@ static inline bool kasan_hw_tags_enabled(void) return false; } -static __always_inline void kasan_alloc_pages(struct page *page, - unsigned int order, gfp_t flags) -{ - /* Only available for integrated init. */ - BUILD_BUG(); -} - #endif /* CONFIG_KASAN_HW_TAGS */ static inline bool kasan_has_integrated_init(void) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index a0082fad48b12..d9079ec11f313 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -538,7 +538,7 @@ void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size, return NULL; /* - * The object has already been unpoisoned by kasan_alloc_pages() for + * The object has already been unpoisoned by kasan_unpoison_pages() for * alloc_pages() or by kasan_krealloc() for krealloc(). */ diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index c643740b85996..76cf2b6229c79 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -192,28 +192,6 @@ void __init kasan_init_hw_tags(void) kasan_stack_collection_enabled() ? "on" : "off"); } -void kasan_alloc_pages(struct page *page, unsigned int order, gfp_t flags) -{ - /* - * This condition should match the one in post_alloc_hook() in - * page_alloc.c. - */ - bool init = !want_init_on_free() && want_init_on_alloc(flags); - bool init_tags = init && (flags & __GFP_ZEROTAGS); - - if (flags & __GFP_SKIP_KASAN_POISON) - SetPageSkipKASanPoison(page); - - if (init_tags) { - int i; - - for (i = 0; i != 1 << order; ++i) - tag_clear_highpage(page + i); - } else { - kasan_unpoison_pages(page, order, init); - } -} - #if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) void kasan_enable_tagging_sync(void) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 868480d463c78..abed862d889d1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2398,6 +2398,9 @@ static bool check_new_pages(struct page *page, unsigned int order) inline void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags) { + bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags); + bool init_tags = init && (gfp_flags & __GFP_ZEROTAGS); + set_page_private(page, 0); set_page_refcounted(page); @@ -2413,15 +2416,22 @@ inline void post_alloc_hook(struct page *page, unsigned int order, /* * As memory initialization might be integrated into KASAN, - * kasan_alloc_pages and kernel_init_free_pages must be + * KASAN unpoisoning and memory initialization code must be * kept together to avoid discrepancies in behavior.
*/ if (kasan_has_integrated_init()) { - kasan_alloc_pages(page, order, gfp_flags); - } else { - bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags); - bool init_tags = init && (gfp_flags & __GFP_ZEROTAGS); + if (gfp_flags & __GFP_SKIP_KASAN_POISON) + SetPageSkipKASanPoison(page); + + if (init_tags) { + int i; + for (i = 0; i != 1 << order; ++i) + tag_clear_highpage(page + i); + } else { + kasan_unpoison_pages(page, order, init); + } + } else { kasan_unpoison_pages(page, order, init); if (init_tags) { From c7f82eae8866516a18dea090f28dc8b34897f4ff Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 27 Jan 2022 16:25:48 +1100 Subject: [PATCH 042/154] kasan, page_alloc: combine tag_clear_highpage calls in post_alloc_hook Move the tag_clear_highpage() loops out of the kasan_has_integrated_init() clause as a code simplification. This patch makes no functional changes. Link: https://lkml.kernel.org/r/587e3fc36358b88049320a89cc8dc6deaecb0cda.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Acked-by: Marco Elver Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_alloc.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index abed862d889d1..b3959327e06ce 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2419,30 +2419,30 @@ inline void post_alloc_hook(struct page *page, unsigned int order, * KASAN unpoisoning and memory initialization code must be * kept together to avoid discrepancies in behavior. */ + + /* + * If memory tags should be zeroed (which happens only when memory + * should be initialized as well). + */ + if (init_tags) { + int i; + + /* Initialize both memory and tags. */ + for (i = 0; i != 1 << order; ++i) + tag_clear_highpage(page + i); + + /* Note that memory is already initialized by the loop above. */ + init = false; + } if (kasan_has_integrated_init()) { if (gfp_flags & __GFP_SKIP_KASAN_POISON) SetPageSkipKASanPoison(page); - if (init_tags) { - int i; - - for (i = 0; i != 1 << order; ++i) - tag_clear_highpage(page + i); - } else { + if (!init_tags) kasan_unpoison_pages(page, order, init); - } } else { kasan_unpoison_pages(page, order, init); - if (init_tags) { - int i; - - for (i = 0; i < 1 << order; i++) - tag_clear_highpage(page + i); - - init = false; - } - if (init) kernel_init_free_pages(page, 1 << order); } From 6d7b9788ae1c95c255eb816c28b679827c9c3a2c Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 27 Jan 2022 16:25:48 +1100 Subject: [PATCH 043/154] kasan, page_alloc: move SetPageSkipKASanPoison in post_alloc_hook Pull the SetPageSkipKASanPoison() call in post_alloc_hook() out of the big if clause for better code readability. This also allows for more simplifications in the following patches. Also turn the kasan_has_integrated_init() check into the proper kasan_hw_tags_enabled() one. These checks evaluate to the same value, but logically, skipping kasan poisoning has nothing to do with integrated init.
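As a usage illustration, consider a hypothetical call site (not part of this patch) that allocates userspace-bound pages:

	/*
	 * Hypothetical example: pages destined for userspace can skip KASAN
	 * poisoning on free. After this patch, the page flag is set whenever
	 * HW_TAGS KASAN is enabled, independent of integrated init.
	 */
	struct page *page = alloc_pages(GFP_HIGHUSER | __GFP_SKIP_KASAN_POISON, 0);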
Link: https://lkml.kernel.org/r/7214c1698b754ccfaa44a792113c95cc1f807c48.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b3959327e06ce..c51d637cdab39 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2435,9 +2435,6 @@ inline void post_alloc_hook(struct page *page, unsigned int order, init = false; } if (kasan_has_integrated_init()) { - if (gfp_flags & __GFP_SKIP_KASAN_POISON) - SetPageSkipKASanPoison(page); - if (!init_tags) kasan_unpoison_pages(page, order, init); } else { @@ -2446,6 +2443,9 @@ inline void post_alloc_hook(struct page *page, unsigned int order, if (init) kernel_init_free_pages(page, 1 << order); } + /* Propagate __GFP_SKIP_KASAN_POISON to page flags. */ + if (kasan_hw_tags_enabled() && (gfp_flags & __GFP_SKIP_KASAN_POISON)) + SetPageSkipKASanPoison(page); set_page_owner(page, order, gfp_flags); page_table_check_alloc(page, order); From 2d10ffc24ab7eced993e40f832cf04f29513eb1a Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 27 Jan 2022 16:25:49 +1100 Subject: [PATCH 044/154] kasan, page_alloc: move kernel_init_free_pages in post_alloc_hook Pull the kernel_init_free_pages() call in post_alloc_hook() out of the big if clause for better code readability. This also allows for more simplifications in the following patch. This patch makes no functional changes. Link: https://lkml.kernel.org/r/a7a76456501eb37ddf9fca6529cee9555e59cdb1.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Acked-by: Marco Elver Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_alloc.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c51d637cdab39..2784bd4789423 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2435,14 +2435,18 @@ inline void post_alloc_hook(struct page *page, unsigned int order, init = false; } if (kasan_has_integrated_init()) { - if (!init_tags) + if (!init_tags) { kasan_unpoison_pages(page, order, init); + + /* Note that memory is already initialized by KASAN. */ + init = false; + } } else { kasan_unpoison_pages(page, order, init); - - if (init) - kernel_init_free_pages(page, 1 << order); } + /* If memory is still not initialized, do it now. */ + if (init) + kernel_init_free_pages(page, 1 << order); /* Propagate __GFP_SKIP_KASAN_POISON to page flags. */ if (kasan_hw_tags_enabled() && (gfp_flags & __GFP_SKIP_KASAN_POISON)) SetPageSkipKASanPoison(page); From 671b9d0898763a961facd587543927edc263d512 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 27 Jan 2022 16:25:49 +1100 Subject: [PATCH 045/154] kasan, page_alloc: rework kasan_unpoison_pages call site Rework the checks around the kasan_unpoison_pages() call in post_alloc_hook(). The logical condition for calling this function is: - If a software KASAN mode is enabled, we need to mark shadow memory.
- Otherwise, HW_TAGS KASAN is enabled, and it only makes sense to set tags if they haven't already been cleared by tag_clear_highpage(), which is indicated by init_tags. This patch concludes the changes for post_alloc_hook(). Link: https://lkml.kernel.org/r/0ecebd0d7ccd79150e3620ea4185a32d3dfe912f.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_alloc.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2784bd4789423..3af38e3233914 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2434,15 +2434,20 @@ inline void post_alloc_hook(struct page *page, unsigned int order, /* Note that memory is already initialized by the loop above. */ init = false; } - if (kasan_has_integrated_init()) { - if (!init_tags) { - kasan_unpoison_pages(page, order, init); + /* + * If either a software KASAN mode is enabled, or, + * in the case of hardware tag-based KASAN, + * if memory tags have not been cleared via tag_clear_highpage(). + */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC) || + IS_ENABLED(CONFIG_KASAN_SW_TAGS) || + kasan_hw_tags_enabled() && !init_tags) { + /* Mark shadow memory or set memory tags. */ + kasan_unpoison_pages(page, order, init); - /* Note that memory is already initialized by KASAN. */ + /* Note that memory is already initialized by KASAN. */ + if (kasan_has_integrated_init()) init = false; - } - } else { - kasan_unpoison_pages(page, order, init); } /* If memory is still not initialized, do it now. */ if (init) From 09e576f67ae74def1a6726b2573160c59cd31535 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 27 Jan 2022 16:25:49 +1100 Subject: [PATCH 046/154] kasan: clean up metadata byte definitions Most of the metadata byte values are only used for Generic KASAN. Remove KASAN_KMALLOC_FREETRACK definition for !CONFIG_KASAN_GENERIC case, and put it along with other metadata values for the Generic mode under a corresponding ifdef. 
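For background on how these byte values are consumed: generic KASAN keeps one shadow byte per 8-byte granule, where zero means fully accessible, 1..7 means partially accessible, and the negative marker values above flag invalid memory. A simplified sketch of the check (addr_is_poisoned() is a made-up name for illustration; the real helpers live in mm/kasan/generic.c):

	static bool addr_is_poisoned(const void *addr)
	{
		s8 shadow = *(s8 *)kasan_mem_to_shadow(addr);

		if (shadow == 0)	/* whole granule accessible */
			return false;
		if (shadow > 0)		/* first 'shadow' bytes accessible */
			return ((unsigned long)addr & KASAN_GRANULE_MASK) >= shadow;
		return true;		/* marker such as KASAN_KMALLOC_FREE */
	}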
Link: https://lkml.kernel.org/r/ac11d6e9e007c95e472e8fdd22efb6074ef3c6d8.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Acked-by: Marco Elver Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/kasan/kasan.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index c17fa8d26ffe5..952cd6f9ca464 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -71,15 +71,16 @@ static inline bool kasan_sync_fault_possible(void) #define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */ #define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */ #define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */ -#define KASAN_KMALLOC_FREETRACK 0xFA /* object was freed and has free track set */ #else #define KASAN_FREE_PAGE KASAN_TAG_INVALID #define KASAN_PAGE_REDZONE KASAN_TAG_INVALID #define KASAN_KMALLOC_REDZONE KASAN_TAG_INVALID #define KASAN_KMALLOC_FREE KASAN_TAG_INVALID -#define KASAN_KMALLOC_FREETRACK KASAN_TAG_INVALID #endif +#ifdef CONFIG_KASAN_GENERIC + +#define KASAN_KMALLOC_FREETRACK 0xFA /* object was freed and has free track set */ #define KASAN_GLOBAL_REDZONE 0xF9 /* redzone for global variable */ #define KASAN_VMALLOC_INVALID 0xF8 /* unallocated space in vmapped page */ @@ -110,6 +111,8 @@ static inline bool kasan_sync_fault_possible(void) #define KASAN_ABI_VERSION 1 #endif +#endif /* CONFIG_KASAN_GENERIC */ + /* Metadata layout customization. */ #define META_BYTES_PER_BLOCK 1 #define META_BLOCKS_PER_ROW 16 From 7b79c26fd76d8c7ee02fd55b51e2f21c3e073147 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 27 Jan 2022 16:25:49 +1100 Subject: [PATCH 047/154] kasan: define KASAN_VMALLOC_INVALID for SW_TAGS In preparation for adding vmalloc support to SW_TAGS KASAN, provide a KASAN_VMALLOC_INVALID definition for it. HW_TAGS KASAN won't be using this value, as it falls back onto page_alloc for poisoning freed vmalloc() memory. 
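For reference, the marker is consumed when freed vmalloc memory is poisoned; a condensed excerpt matching kasan_poison_vmalloc() as it reads later in this series:

	/* Poison a freed vmalloc region with the "invalid" marker. */
	size = round_up(size, KASAN_GRANULE_SIZE);
	kasan_poison(start, size, KASAN_VMALLOC_INVALID, false);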
Link: https://lkml.kernel.org/r/1daaaafeb148a7ae8285265edc97d7ca07b6a07d.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Acked-by: Marco Elver Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/kasan/kasan.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 952cd6f9ca464..020f3e57a03f5 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -71,18 +71,19 @@ static inline bool kasan_sync_fault_possible(void) #define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */ #define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */ #define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */ +#define KASAN_VMALLOC_INVALID 0xF8 /* unallocated space in vmapped page */ #else #define KASAN_FREE_PAGE KASAN_TAG_INVALID #define KASAN_PAGE_REDZONE KASAN_TAG_INVALID #define KASAN_KMALLOC_REDZONE KASAN_TAG_INVALID #define KASAN_KMALLOC_FREE KASAN_TAG_INVALID +#define KASAN_VMALLOC_INVALID KASAN_TAG_INVALID /* only for SW_TAGS */ #endif #ifdef CONFIG_KASAN_GENERIC #define KASAN_KMALLOC_FREETRACK 0xFA /* object was freed and has free track set */ #define KASAN_GLOBAL_REDZONE 0xF9 /* redzone for global variable */ -#define KASAN_VMALLOC_INVALID 0xF8 /* unallocated space in vmapped page */ /* * Stack redzone shadow values From c7073f59d25ed0f9eca0a15e7a232b92d720bfbf Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 27 Jan 2022 16:25:50 +1100 Subject: [PATCH 048/154] kasan, x86, arm64, s390: rename functions for modules shadow Rename kasan_free_shadow to kasan_free_module_shadow and kasan_module_alloc to kasan_alloc_module_shadow. These functions are used to allocate/free shadow memory for kernel modules when KASAN_VMALLOC is not enabled. The new names better reflect their purpose. Also reword the comment next to their declaration to improve clarity. 
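The call sites change mechanically; for reference, the arm64 module_alloc() pattern after this patch (quoted from the diff below):

	if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) {
		vfree(p);
		return NULL;
	}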
Link: https://lkml.kernel.org/r/36db32bde765d5d0b856f77d2d806e838513fe84.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Catalin Marinas Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/arm64/kernel/module.c | 2 +- arch/s390/kernel/module.c | 2 +- arch/x86/kernel/module.c | 2 +- include/linux/kasan.h | 14 +++++++------- mm/kasan/shadow.c | 4 ++-- mm/vmalloc.c | 2 +- 6 files changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index 309a27553c875..d3a1fa8183487 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -58,7 +58,7 @@ void *module_alloc(unsigned long size) PAGE_KERNEL, 0, NUMA_NO_NODE, __builtin_return_address(0)); - if (p && (kasan_module_alloc(p, size, gfp_mask) < 0)) { + if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { vfree(p); return NULL; } diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c index d52d85367bf73..b16bebd9a8b9f 100644 --- a/arch/s390/kernel/module.c +++ b/arch/s390/kernel/module.c @@ -45,7 +45,7 @@ void *module_alloc(unsigned long size) p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END, gfp_mask, PAGE_KERNEL_EXEC, VM_DEFER_KMEMLEAK, NUMA_NO_NODE, __builtin_return_address(0)); - if (p && (kasan_module_alloc(p, size, gfp_mask) < 0)) { + if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { vfree(p); return NULL; } diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 95fa745e310a5..c9eb8aa3b7b8a 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -78,7 +78,7 @@ void *module_alloc(unsigned long size) MODULES_END, gfp_mask, PAGE_KERNEL, VM_DEFER_KMEMLEAK, NUMA_NO_NODE, __builtin_return_address(0)); - if (p && (kasan_module_alloc(p, size, gfp_mask) < 0)) { + if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { vfree(p); return NULL; } diff --git a/include/linux/kasan.h b/include/linux/kasan.h index b88ca6b97ba32..55f1d4edf6b55 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -454,17 +454,17 @@ static inline void kasan_populate_early_vm_area_shadow(void *start, !defined(CONFIG_KASAN_VMALLOC) /* - * These functions provide a special case to support backing module - * allocations with real shadow memory. With KASAN vmalloc, the special - * case is unnecessary, as the work is handled in the generic case. + * These functions allocate and free shadow memory for kernel modules. + * They are only required when KASAN_VMALLOC is not supported, as otherwise + * shadow memory is allocated by the generic vmalloc handlers. 
*/ -int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask); -void kasan_free_shadow(const struct vm_struct *vm); +int kasan_alloc_module_shadow(void *addr, size_t size, gfp_t gfp_mask); +void kasan_free_module_shadow(const struct vm_struct *vm); #else /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */ -static inline int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask) { return 0; } -static inline void kasan_free_shadow(const struct vm_struct *vm) {} +static inline int kasan_alloc_module_shadow(void *addr, size_t size, gfp_t gfp_mask) { return 0; } +static inline void kasan_free_module_shadow(const struct vm_struct *vm) {} #endif /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */ diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 94136f84b4497..e5c4393eb861e 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -498,7 +498,7 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, #else /* CONFIG_KASAN_VMALLOC */ -int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask) +int kasan_alloc_module_shadow(void *addr, size_t size, gfp_t gfp_mask) { void *ret; size_t scaled_size; @@ -534,7 +534,7 @@ int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask) return -ENOMEM; } -void kasan_free_shadow(const struct vm_struct *vm) +void kasan_free_module_shadow(const struct vm_struct *vm) { if (vm->flags & VM_KASAN) vfree(kasan_mem_to_shadow(vm->addr)); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 4165304d35471..b6712a25c996e 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2526,7 +2526,7 @@ struct vm_struct *remove_vm_area(const void *addr) va->vm = NULL; spin_unlock(&vmap_area_lock); - kasan_free_shadow(vm); + kasan_free_module_shadow(vm); free_unmap_vmap_area(va); return vm; From 9315a646b156aa70d2453f14c56f9901c18c4142 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 27 Jan 2022 16:25:50 +1100 Subject: [PATCH 049/154] kasan, vmalloc: drop outdated VM_KASAN comment The comment about VM_KASAN in include/linux/vmalloc.h is outdated. VM_KASAN is currently only used to mark vm_areas allocated for kernel modules when CONFIG_KASAN_VMALLOC is disabled. Drop the comment. Link: https://lkml.kernel.org/r/780395afea83a147b3b5acc36cf2e38f7f8479f9.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Acked-by: Marco Elver Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/vmalloc.h | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 880227b9f0440..87f8cfec50a03 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -35,17 +35,6 @@ struct notifier_block; /* in notifier.h */ #define VM_DEFER_KMEMLEAK 0 #endif -/* - * VM_KASAN is used slightly differently depending on CONFIG_KASAN_VMALLOC. - * - * If IS_ENABLED(CONFIG_KASAN_VMALLOC), VM_KASAN is set on a vm_struct after - * shadow memory has been mapped. It's used to handle allocation errors so that - * we don't try to poison shadow on free if it was never allocated. - * - * Otherwise, VM_KASAN is set for kasan_module_alloc() allocations and used to - * determine which allocations need the module shadow freed.
- */ - /* bits [20..32] reserved for arch specific ioremap internals */ /* From 6f933f941311711352b2925d61cd7d2c714cb716 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 27 Jan 2022 16:25:50 +1100 Subject: [PATCH 050/154] kasan: reorder vmalloc hooks Group functions that [de]populate shadow memory for vmalloc. Group functions that [un]poison memory for vmalloc. This patch makes no functional changes but prepares KASAN code for adding vmalloc support to HW_TAGS KASAN. Link: https://lkml.kernel.org/r/aeef49eb249c206c4c9acce2437728068da74c28.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Acked-by: Marco Elver Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/kasan.h | 20 +++++++++----------- mm/kasan/shadow.c | 43 ++++++++++++++++++++++--------------------- 2 files changed, 31 insertions(+), 32 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 55f1d4edf6b55..46a63374c86fb 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -418,34 +418,32 @@ static inline void kasan_init_hw_tags(void) { } #ifdef CONFIG_KASAN_VMALLOC +void kasan_populate_early_vm_area_shadow(void *start, unsigned long size); int kasan_populate_vmalloc(unsigned long addr, unsigned long size); -void kasan_poison_vmalloc(const void *start, unsigned long size); -void kasan_unpoison_vmalloc(const void *start, unsigned long size); void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end); -void kasan_populate_early_vm_area_shadow(void *start, unsigned long size); +void kasan_unpoison_vmalloc(const void *start, unsigned long size); +void kasan_poison_vmalloc(const void *start, unsigned long size); #else /* CONFIG_KASAN_VMALLOC */ +static inline void kasan_populate_early_vm_area_shadow(void *start, + unsigned long size) { } static inline int kasan_populate_vmalloc(unsigned long start, unsigned long size) { return 0; } - -static inline void kasan_poison_vmalloc(const void *start, unsigned long size) -{ } -static inline void kasan_unpoison_vmalloc(const void *start, unsigned long size) -{ } static inline void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, - unsigned long free_region_end) {} + unsigned long free_region_end) { } -static inline void kasan_populate_early_vm_area_shadow(void *start, - unsigned long size) +static inline void kasan_unpoison_vmalloc(const void *start, unsigned long size) +{ } +static inline void kasan_poison_vmalloc(const void *start, unsigned long size) { } #endif /* CONFIG_KASAN_VMALLOC */ diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index e5c4393eb861e..bf7ab62fbfb94 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -345,27 +345,6 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size) return 0; } -/* - * Poison the shadow for a vmalloc region. Called as part of the - * freeing process at the time the region is freed.
- */ -void kasan_poison_vmalloc(const void *start, unsigned long size) -{ - if (!is_vmalloc_or_module_addr(start)) - return; - - size = round_up(size, KASAN_GRANULE_SIZE); - kasan_poison(start, size, KASAN_VMALLOC_INVALID, false); -} - -void kasan_unpoison_vmalloc(const void *start, unsigned long size) -{ - if (!is_vmalloc_or_module_addr(start)) - return; - - kasan_unpoison(start, size, false); -} - static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, void *unused) { @@ -496,6 +475,28 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, } } + +void kasan_unpoison_vmalloc(const void *start, unsigned long size) +{ + if (!is_vmalloc_or_module_addr(start)) + return; + + kasan_unpoison(start, size, false); +} + +/* + * Poison the shadow for a vmalloc region. Called as part of the + * freeing process at the time the region is freed. + */ +void kasan_poison_vmalloc(const void *start, unsigned long size) +{ + if (!is_vmalloc_or_module_addr(start)) + return; + + size = round_up(size, KASAN_GRANULE_SIZE); + kasan_poison(start, size, KASAN_VMALLOC_INVALID, false); +} + #else /* CONFIG_KASAN_VMALLOC */ int kasan_alloc_module_shadow(void *addr, size_t size, gfp_t gfp_mask) From 229f54a9a70a62e5e1cc4c52fae578113519a547 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 27 Jan 2022 16:25:50 +1100 Subject: [PATCH 051/154] kasan: add wrappers for vmalloc hooks Add wrappers around the functions that [un]poison memory for vmalloc allocations. These functions will be used by HW_TAGS KASAN and therefore need to be disabled when the kasan=off command-line argument is provided. This patch makes no functional changes for software KASAN modes. Link: https://lkml.kernel.org/r/3b8728eac438c55389fb0f9a8a2145d71dd77487.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Acked-by: Marco Elver Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/kasan.h | 17 +++++++++++++++-- mm/kasan/shadow.c | 5 ++--- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 46a63374c86fb..da320069e7cf8 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -424,8 +424,21 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end); -void kasan_unpoison_vmalloc(const void *start, unsigned long size); -void kasan_poison_vmalloc(const void *start, unsigned long size); +void __kasan_unpoison_vmalloc(const void *start, unsigned long size); +static __always_inline void kasan_unpoison_vmalloc(const void *start, + unsigned long size) +{ + if (kasan_enabled()) + __kasan_unpoison_vmalloc(start, size); +} + +void __kasan_poison_vmalloc(const void *start, unsigned long size); +static __always_inline void kasan_poison_vmalloc(const void *start, + unsigned long size) +{ + if (kasan_enabled()) + __kasan_poison_vmalloc(start, size); +} #else /* CONFIG_KASAN_VMALLOC */ diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index bf7ab62fbfb94..39d0b32ebf708 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -475,8 +475,7 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, } } - -void kasan_unpoison_vmalloc(const void *start, unsigned long size) +void __kasan_unpoison_vmalloc(const void *start, unsigned long size) { if
(!is_vmalloc_or_module_addr(start)) return; @@ -488,7 +487,7 @@ void kasan_unpoison_vmalloc(const void *start, unsigned long size) * Poison the shadow for a vmalloc region. Called as part of the * freeing process at the time the region is freed. */ -void kasan_poison_vmalloc(const void *start, unsigned long size) +void __kasan_poison_vmalloc(const void *start, unsigned long size) { if (!is_vmalloc_or_module_addr(start)) return; From 2ff0ed08854b60edaff2ee5ddaf036c85eb18de7 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 27 Jan 2022 16:25:51 +1100 Subject: [PATCH 052/154] kasan, vmalloc: reset tags in vmalloc functions In preparation for adding vmalloc support to SW/HW_TAGS KASAN, reset pointer tags in functions that use pointer values in range checks. vread() is a special case here. Despite the untagging of the addr pointer in its prologue, the accesses performed by vread() are checked. Instead of accessing the virtual mappings through addr directly, vread() recovers the physical address via page_address(vmalloc_to_page()) and accesses that. And as page_address() recovers the pointer tag, the accesses get checked. Link: https://lkml.kernel.org/r/046003c5f683cacb0ba18e1079e9688bb3dca943.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/vmalloc.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index b6712a25c996e..38bf3b418b816 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -74,7 +74,7 @@ static const bool vmap_allow_huge = false; bool is_vmalloc_addr(const void *x) { - unsigned long addr = (unsigned long)x; + unsigned long addr = (unsigned long)kasan_reset_tag(x); return addr >= VMALLOC_START && addr < VMALLOC_END; } @@ -632,7 +632,7 @@ int is_vmalloc_or_module_addr(const void *x) * just put it in the vmalloc space.
*/ #if defined(CONFIG_MODULES) && defined(MODULES_VADDR) - unsigned long addr = (unsigned long)x; + unsigned long addr = (unsigned long)kasan_reset_tag(x); if (addr >= MODULES_VADDR && addr < MODULES_END) return 1; #endif @@ -806,6 +806,8 @@ static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr) struct vmap_area *va = NULL; struct rb_node *n = vmap_area_root.rb_node; + addr = (unsigned long)kasan_reset_tag((void *)addr); + while (n) { struct vmap_area *tmp; @@ -827,6 +829,8 @@ static struct vmap_area *__find_vmap_area(unsigned long addr) { struct rb_node *n = vmap_area_root.rb_node; + addr = (unsigned long)kasan_reset_tag((void *)addr); + while (n) { struct vmap_area *va; @@ -2145,7 +2149,7 @@ EXPORT_SYMBOL_GPL(vm_unmap_aliases); void vm_unmap_ram(const void *mem, unsigned int count) { unsigned long size = (unsigned long)count << PAGE_SHIFT; - unsigned long addr = (unsigned long)mem; + unsigned long addr = (unsigned long)kasan_reset_tag(mem); struct vmap_area *va; might_sleep(); @@ -3404,6 +3408,8 @@ long vread(char *buf, char *addr, unsigned long count) unsigned long buflen = count; unsigned long n; + addr = kasan_reset_tag(addr); + /* Don't allow overflow */ if ((unsigned long) addr + count < count) count = -(unsigned long) addr; From 9d2dae85d689202c56068ce62e20821ad91c3606 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 27 Jan 2022 16:25:51 +1100 Subject: [PATCH 053/154] kasan, fork: reset pointer tags of vmapped stacks Once tag-based KASAN modes start tagging vmalloc() allocations, kernel stacks start getting tagged if CONFIG_VMAP_STACK is enabled. Reset the tag of kernel stack pointers after allocation in alloc_thread_stack_node(). For SW_TAGS KASAN, when CONFIG_KASAN_STACK is enabled, the instrumentation can't handle the SP register being tagged. For HW_TAGS KASAN, there are no instrumentation-related issues. However, the impact of having a tagged SP register needs to be properly evaluated, so keep it non-tagged for now. Note that the memory for the stack allocation still gets tagged to catch vmalloc-into-stack out-of-bounds accesses. Link: https://lkml.kernel.org/r/c6c96f012371ecd80e1936509ebcd3b07a5956f7.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Acked-by: Marco Elver Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/fork.c b/kernel/fork.c index d75a528f7b219..57d624f05182e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -254,6 +254,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) * so cache the vm_struct. */ if (stack) { + stack = kasan_reset_tag(stack); tsk->stack_vm_area = find_vm_area(stack); tsk->stack = stack; } From 54f7bbc3786b67625b02fe067a91478fc3f39caa Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 27 Jan 2022 16:25:51 +1100 Subject: [PATCH 054/154] kasan, arm64: reset pointer tags of vmapped stacks Once tag-based KASAN modes start tagging vmalloc() allocations, kernel stacks start getting tagged if CONFIG_VMAP_STACK is enabled. Reset the tag of kernel stack pointers after allocation in arch_alloc_vmap_stack(). For SW_TAGS KASAN, when CONFIG_KASAN_STACK is enabled, the instrumentation can't handle the SP register being tagged. For HW_TAGS KASAN, there are no instrumentation-related issues.
However, the impact of having a tagged SP register needs to be properly evaluated, so keep it non-tagged for now. Note that the memory for the stack allocation still gets tagged to catch vmalloc-into-stack out-of-bounds accesses. Link: https://lkml.kernel.org/r/698c5ab21743c796d46c15d075b9481825973e34.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Catalin Marinas Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/arm64/include/asm/vmap_stack.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/vmap_stack.h b/arch/arm64/include/asm/vmap_stack.h index 894e031b28d28..20873099c035c 100644 --- a/arch/arm64/include/asm/vmap_stack.h +++ b/arch/arm64/include/asm/vmap_stack.h @@ -17,10 +17,13 @@ */ static inline unsigned long *arch_alloc_vmap_stack(size_t stack_size, int node) { + void *p; + BUILD_BUG_ON(!IS_ENABLED(CONFIG_VMAP_STACK)); - return __vmalloc_node(stack_size, THREAD_ALIGN, THREADINFO_GFP, node, + p = __vmalloc_node(stack_size, THREAD_ALIGN, THREADINFO_GFP, node, __builtin_return_address(0)); + return kasan_reset_tag(p); } #endif /* __ASM_VMAP_STACK_H */ From 93555972485ebcac55b3855205bf154f1ba8478f Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 27 Jan 2022 16:25:51 +1100 Subject: [PATCH 055/154] kasan, vmalloc: add vmalloc tagging for SW_TAGS Add vmalloc tagging support to SW_TAGS KASAN. - __kasan_unpoison_vmalloc() now assigns a random pointer tag, poisons the virtual mapping accordingly, and embeds the tag into the returned pointer. - __get_vm_area_node() (used by vmalloc() and vmap()) and pcpu_get_vm_areas() save the tagged pointer into vm_struct->addr (note: not into vmap_area->addr). This requires putting kasan_unpoison_vmalloc() after setup_vmalloc_vm[_locked](); otherwise the latter will overwrite the tagged pointer. The tagged pointer is then naturally propagated to vmalloc() and vmap(). - vm_map_ram() returns the tagged pointer directly. As a result of this change, vm_struct->addr is now tagged. Enabling KASAN_VMALLOC with SW_TAGS is not yet allowed.
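The caller-visible effect can be sketched as follows (hypothetical snippet; the actual tag value is random at runtime):

	void *p = vmalloc(PAGE_SIZE);
	/*
	 * With SW_TAGS KASAN, p now carries a random tag in its top byte,
	 * so raw range checks must strip the tag first, as done by
	 * is_vmalloc_addr() earlier in this series:
	 */
	unsigned long addr = (unsigned long)kasan_reset_tag(p);
	bool is_vmalloc = addr >= VMALLOC_START && addr < VMALLOC_END;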
Link: https://lkml.kernel.org/r/4a78f3c064ce905e9070c29733aca1dd254a74f1.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/kasan.h | 16 ++++++++++------ mm/kasan/shadow.c | 6 ++++-- mm/vmalloc.c | 14 ++++++++------ 3 files changed, 22 insertions(+), 14 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index da320069e7cf8..92c5dfa29a352 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -424,12 +424,13 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end); -void __kasan_unpoison_vmalloc(const void *start, unsigned long size); -static __always_inline void kasan_unpoison_vmalloc(const void *start, - unsigned long size) +void *__kasan_unpoison_vmalloc(const void *start, unsigned long size); +static __always_inline void *kasan_unpoison_vmalloc(const void *start, + unsigned long size) { if (kasan_enabled()) - __kasan_unpoison_vmalloc(start, size); + return __kasan_unpoison_vmalloc(start, size); + return (void *)start; } void __kasan_poison_vmalloc(const void *start, unsigned long size); @@ -454,8 +455,11 @@ static inline void kasan_release_vmalloc(unsigned long start, unsigned long free_region_start, unsigned long free_region_end) { } -static inline void kasan_unpoison_vmalloc(const void *start, unsigned long size) -{ } +static inline void *kasan_unpoison_vmalloc(const void *start, + unsigned long size) +{ + return (void *)start; +} static inline void kasan_poison_vmalloc(const void *start, unsigned long size) { } diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 39d0b32ebf708..5a866f6663fc0 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -475,12 +475,14 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, } } -void __kasan_unpoison_vmalloc(const void *start, unsigned long size) +void *__kasan_unpoison_vmalloc(const void *start, unsigned long size) { if (!is_vmalloc_or_module_addr(start)) - return; + return (void *)start; + start = set_tag(start, kasan_random_tag()); kasan_unpoison(start, size, false); + return (void *)start; } /* diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 38bf3b418b816..15e1a4fdfe0b6 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2210,7 +2210,7 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node) mem = (void *)addr; } - kasan_unpoison_vmalloc(mem, size); + mem = kasan_unpoison_vmalloc(mem, size); if (vmap_pages_range(addr, addr + size, PAGE_KERNEL, pages, PAGE_SHIFT) < 0) { @@ -2443,10 +2443,10 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, return NULL; } - kasan_unpoison_vmalloc((void *)va->va_start, requested_size); - setup_vmalloc_vm(area, va, flags, caller); + area->addr = kasan_unpoison_vmalloc(area->addr, requested_size); + return area; } @@ -3795,9 +3795,6 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, for (area = 0; area < nr_vms; area++) { if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area])) goto err_free_shadow; - - kasan_unpoison_vmalloc((void *)vas[area]->va_start, - sizes[area]); } /* insert all vm's */ @@ -3810,6 +3807,11 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, } spin_unlock(&vmap_area_lock); + /* mark allocated 
areas as accessible */ + for (area = 0; area < nr_vms; area++) + vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr, + vms[area]->size); + kfree(vas); return vms; From df70c9aebdba46948cf5af35a82c23a8e8376797 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 27 Jan 2022 16:25:52 +1100 Subject: [PATCH 056/154] kasan, vmalloc, arm64: mark vmalloc mappings as pgprot_tagged HW_TAGS KASAN relies on ARM Memory Tagging Extension (MTE). With MTE, a memory region must be mapped as MT_NORMAL_TAGGED to allow setting memory tags via MTE-specific instructions. Add proper protection bits to vmalloc() allocations. These allocations are always backed by page_alloc pages, so the tags will actually be set on the corresponding physical memory. Link: https://lkml.kernel.org/r/983fc33542db2f6b1e77b34ca23448d4640bbb9e.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Co-developed-by: Vincenzo Frascino Signed-off-by: Vincenzo Frascino Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/arm64/include/asm/vmalloc.h | 6 ++++++ include/linux/vmalloc.h | 7 +++++++ mm/vmalloc.c | 9 +++++++++ 3 files changed, 22 insertions(+) diff --git a/arch/arm64/include/asm/vmalloc.h b/arch/arm64/include/asm/vmalloc.h index b9185503feae2..38fafffe699f7 100644 --- a/arch/arm64/include/asm/vmalloc.h +++ b/arch/arm64/include/asm/vmalloc.h @@ -25,4 +25,10 @@ static inline bool arch_vmap_pmd_supported(pgprot_t prot) #endif +#define arch_vmap_pgprot_tagged arch_vmap_pgprot_tagged +static inline pgprot_t arch_vmap_pgprot_tagged(pgprot_t prot) +{ + return pgprot_tagged(prot); +} + #endif /* _ASM_ARM64_VMALLOC_H */ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 87f8cfec50a03..7b879c77bec5f 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -115,6 +115,13 @@ static inline int arch_vmap_pte_supported_shift(unsigned long size) } #endif +#ifndef arch_vmap_pgprot_tagged +static inline pgprot_t arch_vmap_pgprot_tagged(pgprot_t prot) +{ + return prot; +} +#endif + /* * Highlevel APIs for driver use */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 15e1a4fdfe0b6..92e635b7490cb 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3108,6 +3108,15 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, goto fail; } + /* + * Modify protection bits to allow tagging. + * This must be done before mapping by __vmalloc_area_node(). + */ + if (kasan_hw_tags_enabled() && + pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) + prot = arch_vmap_pgprot_tagged(prot); + + /* Allocate physical pages and map them into vmalloc space. */ addr = __vmalloc_area_node(area, gfp_mask, prot, shift, node); if (!addr) goto fail; From c86dc782f534bf1cf696d63e23993f0d1b0f04f7 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 27 Jan 2022 16:25:52 +1100 Subject: [PATCH 057/154] kasan, vmalloc: unpoison VM_ALLOC pages after mapping Make KASAN unpoison vmalloc mappings after they have been mapped in, when possible: for vmalloc() (identified via VM_ALLOC) and vm_map_ram(). The reasons for this are: - For vmalloc() and vm_map_ram(): pages don't get unpoisoned in case mapping them fails. - For vmalloc(): HW_TAGS KASAN needs pages to be mapped to set tags via kasan_unpoison_vmalloc(). As a part of these changes, the return value of __vmalloc_node_range() is changed to area->addr.
This is a non-functional change, as __vmalloc_area_node() returns area->addr anyway. Link: https://lkml.kernel.org/r/fcb98980e6fcd3c4be6acdcb5d6110898ef28548.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Acked-by: Marco Elver Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/vmalloc.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 92e635b7490cb..b65adac1cd802 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2210,14 +2210,15 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node) mem = (void *)addr; } - mem = kasan_unpoison_vmalloc(mem, size); - if (vmap_pages_range(addr, addr + size, PAGE_KERNEL, pages, PAGE_SHIFT) < 0) { vm_unmap_ram(mem, count); return NULL; } + /* Mark the pages as accessible, now that they are mapped. */ + mem = kasan_unpoison_vmalloc(mem, size); + return mem; } EXPORT_SYMBOL(vm_map_ram); @@ -2445,7 +2446,14 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, setup_vmalloc_vm(area, va, flags, caller); - area->addr = kasan_unpoison_vmalloc(area->addr, requested_size); + /* + * Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a + * best-effort approach, as they can be mapped outside of vmalloc code. + * For VM_ALLOC mappings, the pages are marked as accessible after + * getting mapped in __vmalloc_node_range(). + */ + if (!(flags & VM_ALLOC)) + area->addr = kasan_unpoison_vmalloc(area->addr, requested_size); return area; } @@ -3055,7 +3063,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, const void *caller) { struct vm_struct *area; - void *addr; + void *ret; unsigned long real_size = size; unsigned long real_align = align; unsigned int shift = PAGE_SHIFT; @@ -3117,10 +3125,13 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, prot = arch_vmap_pgprot_tagged(prot); /* Allocate physical pages and map them into vmalloc space. */ - addr = __vmalloc_area_node(area, gfp_mask, prot, shift, node); - if (!addr) + ret = __vmalloc_area_node(area, gfp_mask, prot, shift, node); + if (!ret) goto fail; + /* Mark the pages as accessible, now that they are mapped. */ + area->addr = kasan_unpoison_vmalloc(area->addr, real_size); + /* * In this function, newly allocated vm_struct has VM_UNINITIALIZED * flag. It means that vm_struct is not fully initialized. @@ -3132,7 +3143,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, if (!(vm_flags & VM_DEFER_KMEMLEAK)) kmemleak_vmalloc(area, size, gfp_mask); - return addr; + return area->addr; fail: if (shift > PAGE_SHIFT) { @@ -3816,7 +3827,10 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, } spin_unlock(&vmap_area_lock); - /* mark allocated areas as accessible */ + /* + * Mark allocated areas as accessible. Do it now as a best-effort + * approach, as they can be mapped outside of vmalloc code. 
+ */ for (area = 0; area < nr_vms; area++) vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr, vms[area]->size); From b3963a3ecc9b53c788f7f5d542110d0fd350967d Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 27 Jan 2022 16:25:52 +1100 Subject: [PATCH 058/154] kasan, mm: only define ___GFP_SKIP_KASAN_POISON with HW_TAGS Only define the ___GFP_SKIP_KASAN_POISON flag when CONFIG_KASAN_HW_TAGS is enabled. This patch is not useful by itself, but it prepares the code for the addition of new KASAN-specific GFP flags. Link: https://lkml.kernel.org/r/44e5738a584c11801b2b8f1231898918efc8634a.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/gfp.h | 8 +++++++- include/trace/events/mmflags.h | 12 +++++++++--- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 581a1f47b8a2c..96f707931770c 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -54,7 +54,11 @@ struct vm_area_struct; #define ___GFP_THISNODE 0x200000u #define ___GFP_ACCOUNT 0x400000u #define ___GFP_ZEROTAGS 0x800000u +#ifdef CONFIG_KASAN_HW_TAGS #define ___GFP_SKIP_KASAN_POISON 0x1000000u +#else +#define ___GFP_SKIP_KASAN_POISON 0 +#endif #ifdef CONFIG_LOCKDEP #define ___GFP_NOLOCKDEP 0x2000000u #else @@ -251,7 +255,9 @@ struct vm_area_struct; #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP) /* Room for N __GFP_FOO bits */ -#define __GFP_BITS_SHIFT (25 + IS_ENABLED(CONFIG_LOCKDEP)) +#define __GFP_BITS_SHIFT (24 + \ + IS_ENABLED(CONFIG_KASAN_HW_TAGS) + \ + IS_ENABLED(CONFIG_LOCKDEP)) #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /** diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 116ed4d5d0f88..cb4520374e2c8 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -49,12 +49,18 @@ {(unsigned long)__GFP_RECLAIM, "__GFP_RECLAIM"}, \ {(unsigned long)__GFP_DIRECT_RECLAIM, "__GFP_DIRECT_RECLAIM"},\ {(unsigned long)__GFP_KSWAPD_RECLAIM, "__GFP_KSWAPD_RECLAIM"},\ - {(unsigned long)__GFP_ZEROTAGS, "__GFP_ZEROTAGS"}, \ - {(unsigned long)__GFP_SKIP_KASAN_POISON,"__GFP_SKIP_KASAN_POISON"}\ + {(unsigned long)__GFP_ZEROTAGS, "__GFP_ZEROTAGS"} \ + +#ifdef CONFIG_KASAN_HW_TAGS +#define __def_gfpflag_names_kasan \ + , {(unsigned long)__GFP_SKIP_KASAN_POISON, "__GFP_SKIP_KASAN_POISON"} +#else +#define __def_gfpflag_names_kasan +#endif #define show_gfp_flags(flags) \ (flags) ? __print_flags(flags, "|", \ - __def_gfpflag_names \ + __def_gfpflag_names __def_gfpflag_names_kasan \ ) : "none" #ifdef CONFIG_MMU From 9a47b06547a9c28d2899b27888bc006422d29554 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 27 Jan 2022 16:25:52 +1100 Subject: [PATCH 059/154] kasan, page_alloc: allow skipping unpoisoning for HW_TAGS Add a new GFP flag __GFP_SKIP_KASAN_UNPOISON that allows skipping KASAN unpoisoning for page_alloc allocations. The flag is only effective with HW_TAGS KASAN. This flag will be used by vmalloc code for page_alloc allocations backing vmalloc() mappings in a following patch. The reason to skip KASAN unpoisoning for these pages in page_alloc is that vmalloc code will be unpoisoning them instead. Also reword the comment for __GFP_SKIP_KASAN_POISON.
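A sketch of the intended vmalloc-side use (hypothetical here; the actual call site arrives in a later patch of this series):

	/*
	 * Pages backing a vmalloc() mapping need no page_alloc-level
	 * unpoisoning: kasan_unpoison_vmalloc() will tag them instead.
	 */
	if (kasan_hw_tags_enabled())
		gfp_mask |= __GFP_SKIP_KASAN_UNPOISON;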
Link: https://lkml.kernel.org/r/35c97d77a704f6ff971dd3bfe4be95855744108e.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/gfp.h | 21 +++++++++++++-------- include/trace/events/mmflags.h | 5 +++-- mm/page_alloc.c | 31 ++++++++++++++++++++++--------- 3 files changed, 38 insertions(+), 19 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 96f707931770c..7303d1064460b 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -55,12 +55,14 @@ struct vm_area_struct; #define ___GFP_ACCOUNT 0x400000u #define ___GFP_ZEROTAGS 0x800000u #ifdef CONFIG_KASAN_HW_TAGS -#define ___GFP_SKIP_KASAN_POISON 0x1000000u +#define ___GFP_SKIP_KASAN_UNPOISON 0x1000000u +#define ___GFP_SKIP_KASAN_POISON 0x2000000u #else +#define ___GFP_SKIP_KASAN_UNPOISON 0 #define ___GFP_SKIP_KASAN_POISON 0 #endif #ifdef CONFIG_LOCKDEP -#define ___GFP_NOLOCKDEP 0x2000000u +#define ___GFP_NOLOCKDEP 0x4000000u #else #define ___GFP_NOLOCKDEP 0 #endif @@ -241,22 +243,25 @@ struct vm_area_struct; * intended for optimization: setting memory tags at the same time as zeroing * memory has minimal additional performance impact. * - * %__GFP_SKIP_KASAN_POISON returns a page which does not need to be poisoned - * on deallocation. Typically used for userspace pages. Currently only has an - * effect in HW tags mode. + * %__GFP_SKIP_KASAN_UNPOISON makes KASAN skip unpoisoning on page allocation. + * Only effective in HW_TAGS mode. + * + * %__GFP_SKIP_KASAN_POISON makes KASAN skip poisoning on page deallocation. + * Typically used for userspace pages. Only effective in HW_TAGS mode.
*/ #define __GFP_NOWARN ((__force gfp_t)___GFP_NOWARN) #define __GFP_COMP ((__force gfp_t)___GFP_COMP) #define __GFP_ZERO ((__force gfp_t)___GFP_ZERO) #define __GFP_ZEROTAGS ((__force gfp_t)___GFP_ZEROTAGS) -#define __GFP_SKIP_KASAN_POISON ((__force gfp_t)___GFP_SKIP_KASAN_POISON) +#define __GFP_SKIP_KASAN_UNPOISON ((__force gfp_t)___GFP_SKIP_KASAN_UNPOISON) +#define __GFP_SKIP_KASAN_POISON ((__force gfp_t)___GFP_SKIP_KASAN_POISON) /* Disable lockdep for GFP context tracking */ #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP) /* Room for N __GFP_FOO bits */ -#define __GFP_BITS_SHIFT (24 + \ - IS_ENABLED(CONFIG_KASAN_HW_TAGS) + \ +#define __GFP_BITS_SHIFT (24 + \ + 2 * IS_ENABLED(CONFIG_KASAN_HW_TAGS) + \ IS_ENABLED(CONFIG_LOCKDEP)) #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index cb4520374e2c8..134c45e62d918 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -52,8 +52,9 @@ {(unsigned long)__GFP_ZEROTAGS, "__GFP_ZEROTAGS"} \ #ifdef CONFIG_KASAN_HW_TAGS -#define __def_gfpflag_names_kasan \ - , {(unsigned long)__GFP_SKIP_KASAN_POISON, "__GFP_SKIP_KASAN_POISON"} +#define __def_gfpflag_names_kasan , \ + {(unsigned long)__GFP_SKIP_KASAN_POISON, "__GFP_SKIP_KASAN_POISON"}, \ + {(unsigned long)__GFP_SKIP_KASAN_UNPOISON, "__GFP_SKIP_KASAN_UNPOISON"} #else #define __def_gfpflag_names_kasan #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3af38e3233914..94bfbc216ae9e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2395,6 +2395,26 @@ static bool check_new_pages(struct page *page, unsigned int order) return false; } +static inline bool should_skip_kasan_unpoison(gfp_t flags, bool init_tags) +{ + /* Don't skip if a software KASAN mode is enabled. */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC) || + IS_ENABLED(CONFIG_KASAN_SW_TAGS)) + return false; + + /* Skip, if hardware tag-based KASAN is not enabled. */ + if (!kasan_hw_tags_enabled()) + return true; + + /* + * With hardware tag-based KASAN enabled, skip if either: + * + * 1. Memory tags have already been cleared via tag_clear_highpage(). + * 2. Skipping has been requested via __GFP_SKIP_KASAN_UNPOISON. + */ + return init_tags || (flags & __GFP_SKIP_KASAN_UNPOISON); +} + inline void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags) { @@ -2434,15 +2454,8 @@ inline void post_alloc_hook(struct page *page, unsigned int order, /* Note that memory is already initialized by the loop above. */ init = false; } - /* - * If either a software KASAN mode is enabled, or, - * in the case of hardware tag-based KASAN, - * if memory tags have not been cleared via tag_clear_highpage(). - */ - if (IS_ENABLED(CONFIG_KASAN_GENERIC) || - IS_ENABLED(CONFIG_KASAN_SW_TAGS) || - kasan_hw_tags_enabled() && !init_tags) { - /* Mark shadow memory or set memory tags. */ + if (!should_skip_kasan_unpoison(gfp_flags, init_tags)) { + /* Unpoison shadow memory or set memory tags. */ kasan_unpoison_pages(page, order, init); /* Note that memory is already initialized by KASAN. */ From db88e21f5cce8c45f7973a272c3bd60440f0e1b2 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 27 Jan 2022 16:25:53 +1100 Subject: [PATCH 060/154] kasan, page_alloc: allow skipping memory init for HW_TAGS Add a new GFP flag __GFP_SKIP_ZERO that allows skipping memory initialization. The flag is only effective with HW_TAGS KASAN.
This flag will be used by vmalloc code for page_alloc allocations backing vmalloc() mappings in a following patch. The reason to skip memory initialization for these pages in page_alloc is that vmalloc code will be initializing them instead. With the current implementation, when __GFP_SKIP_ZERO is provided, __GFP_ZEROTAGS is ignored. This doesn't matter, as these two flags are never provided at the same time. However, if this is changed in the future, this particular implementation detail can be changed as well. Link: https://lkml.kernel.org/r/0d53efeff345de7d708e0baa0d8829167772521e.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/gfp.h | 18 +++++++++++------- include/trace/events/mmflags.h | 1 + mm/page_alloc.c | 13 ++++++++++++- 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 7303d1064460b..7797c915ce54c 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -55,14 +55,16 @@ struct vm_area_struct; #define ___GFP_ACCOUNT 0x400000u #define ___GFP_ZEROTAGS 0x800000u #ifdef CONFIG_KASAN_HW_TAGS -#define ___GFP_SKIP_KASAN_UNPOISON 0x1000000u -#define ___GFP_SKIP_KASAN_POISON 0x2000000u +#define ___GFP_SKIP_ZERO 0x1000000u +#define ___GFP_SKIP_KASAN_UNPOISON 0x2000000u +#define ___GFP_SKIP_KASAN_POISON 0x4000000u #else +#define ___GFP_SKIP_ZERO 0 #define ___GFP_SKIP_KASAN_UNPOISON 0 #define ___GFP_SKIP_KASAN_POISON 0 #endif #ifdef CONFIG_LOCKDEP -#define ___GFP_NOLOCKDEP 0x4000000u +#define ___GFP_NOLOCKDEP 0x8000000u #else #define ___GFP_NOLOCKDEP 0 #endif @@ -239,9 +241,10 @@ struct vm_area_struct; * %__GFP_ZERO returns a zeroed page on success. * * %__GFP_ZEROTAGS zeroes memory tags at allocation time if the memory itself - * is being zeroed (either via __GFP_ZERO or via init_on_alloc). This flag is - * intended for optimization: setting memory tags at the same time as zeroing - * memory has minimal additional performance impact. + * is being zeroed (either via __GFP_ZERO or via init_on_alloc, provided that + * __GFP_SKIP_ZERO is not set). This flag is intended for optimization: setting + * memory tags at the same time as zeroing memory has minimal additional + * performance impact. * * %__GFP_SKIP_KASAN_UNPOISON makes KASAN skip unpoisoning on page allocation. * Only effective in HW_TAGS mode.
@@ -253,6 +256,7 @@ struct vm_area_struct; #define __GFP_COMP ((__force gfp_t)___GFP_COMP) #define __GFP_ZERO ((__force gfp_t)___GFP_ZERO) #define __GFP_ZEROTAGS ((__force gfp_t)___GFP_ZEROTAGS) +#define __GFP_SKIP_ZERO ((__force gfp_t)___GFP_SKIP_ZERO) #define __GFP_SKIP_KASAN_UNPOISON ((__force gfp_t)___GFP_SKIP_KASAN_UNPOISON) #define __GFP_SKIP_KASAN_POISON ((__force gfp_t)___GFP_SKIP_KASAN_POISON)
@@ -261,7 +265,7 @@ struct vm_area_struct; /* Room for N __GFP_FOO bits */ #define __GFP_BITS_SHIFT (24 + \ - 2 * IS_ENABLED(CONFIG_KASAN_HW_TAGS) + \ + 3 * IS_ENABLED(CONFIG_KASAN_HW_TAGS) + \ IS_ENABLED(CONFIG_LOCKDEP)) #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 134c45e62d918..6532119a6bf1a 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h
@@ -53,6 +53,7 @@ #ifdef CONFIG_KASAN_HW_TAGS #define __def_gfpflag_names_kasan , \ + {(unsigned long)__GFP_SKIP_ZERO, "__GFP_SKIP_ZERO"}, \ {(unsigned long)__GFP_SKIP_KASAN_POISON, "__GFP_SKIP_KASAN_POISON"}, \ {(unsigned long)__GFP_SKIP_KASAN_UNPOISON, "__GFP_SKIP_KASAN_UNPOISON"} #else
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 94bfbc216ae9e..368c6c5bf42a9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c
@@ -2415,10 +2415,21 @@ static inline bool should_skip_kasan_unpoison(gfp_t flags, bool init_tags) return init_tags || (flags & __GFP_SKIP_KASAN_UNPOISON); } +static inline bool should_skip_init(gfp_t flags) +{ + /* Don't skip, if hardware tag-based KASAN is not enabled. */ + if (!kasan_hw_tags_enabled()) + return false; + + /* For hardware tag-based KASAN, skip if requested. */ + return (flags & __GFP_SKIP_ZERO); +} + inline void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags) { - bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags); + bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) && + !should_skip_init(gfp_flags); bool init_tags = init && (gfp_flags & __GFP_ZEROTAGS); set_page_private(page, 0);
From c9a950bcf1d67298187050bc3179096e4ef248c1 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 27 Jan 2022 16:25:53 +1100 Subject: [PATCH 061/154] kasan, vmalloc: add vmalloc tagging for HW_TAGS
Add vmalloc tagging support to HW_TAGS KASAN. The key difference between HW_TAGS and the other two KASAN modes when it comes to vmalloc: HW_TAGS KASAN can only assign tags to physical memory. The other two modes have shadow memory covering every mapped virtual memory region.
Make __kasan_unpoison_vmalloc() for HW_TAGS KASAN: - Skip non-VM_ALLOC mappings as HW_TAGS KASAN can only tag a single mapping of normal physical memory; see the comment in the function. - Generate a random tag, tag the returned pointer and the allocation, and initialize the allocation at the same time. - Propagate the tag into the page structs to allow accesses through page_address(vmalloc_to_page()).
The rest of vmalloc-related KASAN hooks are not needed: - The shadow-related ones are fully skipped. - __kasan_poison_vmalloc() is kept as a no-op with a comment.
Poisoning and zeroing of physical pages that are backing vmalloc() allocations are skipped via __GFP_SKIP_KASAN_UNPOISON and __GFP_SKIP_ZERO: __kasan_unpoison_vmalloc() does that instead.
Enabling CONFIG_KASAN_VMALLOC with HW_TAGS is not yet allowed.
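To illustrate what the page struct propagation buys (a sketch, not part of the patch; it assumes HW_TAGS KASAN with vmalloc tagging active, which later patches in this series enable), both of the following accesses must observe the same tag:

	void *p = vmalloc(PAGE_SIZE);	/* returned pointer is tagged */

	((char *)p)[0] = 0;		/* access through the vmalloc mapping */
	/*
	 * Access through the physical page: page_kasan_tag_set() makes
	 * page_address() return a pointer carrying the matching tag, so
	 * this access passes the MTE check as well.
	 */
	((char *)page_address(vmalloc_to_page(p)))[0] = 0;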
Link: https://lkml.kernel.org/r/d19b2e9e59a9abc59d05b72dea8429dcaea739c6.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Co-developed-by: Vincenzo Frascino Signed-off-by: Vincenzo Frascino Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/kasan.h | 36 +++++++++++++++-- kernel/scs.c | 4 +- mm/kasan/hw_tags.c | 92 +++++++++++++++++++++++++++++++++++++++++++ mm/kasan/shadow.c | 10 ++++- mm/vmalloc.c | 51 ++++++++++++++++++------ 5 files changed, 175 insertions(+), 18 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 92c5dfa29a352..499f1573dba4c 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -25,6 +25,12 @@ struct kunit_kasan_expectation { #endif +typedef unsigned int __bitwise kasan_vmalloc_flags_t; + +#define KASAN_VMALLOC_NONE 0x00u +#define KASAN_VMALLOC_INIT 0x01u +#define KASAN_VMALLOC_VM_ALLOC 0x02u + #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) #include @@ -418,18 +424,39 @@ static inline void kasan_init_hw_tags(void) { } #ifdef CONFIG_KASAN_VMALLOC +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) + void kasan_populate_early_vm_area_shadow(void *start, unsigned long size); int kasan_populate_vmalloc(unsigned long addr, unsigned long size); void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end); -void *__kasan_unpoison_vmalloc(const void *start, unsigned long size); +#else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ + +static inline void kasan_populate_early_vm_area_shadow(void *start, + unsigned long size) +{ } +static inline int kasan_populate_vmalloc(unsigned long start, + unsigned long size) +{ + return 0; +} +static inline void kasan_release_vmalloc(unsigned long start, + unsigned long end, + unsigned long free_region_start, + unsigned long free_region_end) { } + +#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ + +void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, + kasan_vmalloc_flags_t flags); static __always_inline void *kasan_unpoison_vmalloc(const void *start, - unsigned long size) + unsigned long size, + kasan_vmalloc_flags_t flags) { if (kasan_enabled()) - return __kasan_unpoison_vmalloc(start, size); + return __kasan_unpoison_vmalloc(start, size, flags); return (void *)start; } @@ -456,7 +483,8 @@ static inline void kasan_release_vmalloc(unsigned long start, unsigned long free_region_end) { } static inline void *kasan_unpoison_vmalloc(const void *start, - unsigned long size) + unsigned long size, + kasan_vmalloc_flags_t flags) { return (void *)start; } diff --git a/kernel/scs.c b/kernel/scs.c index 579841be88646..b83bc9251f996 100644 --- a/kernel/scs.c +++ b/kernel/scs.c @@ -32,7 +32,7 @@ static void *__scs_alloc(int node) for (i = 0; i < NR_CACHED_SCS; i++) { s = this_cpu_xchg(scs_cache[i], NULL); if (s) { - kasan_unpoison_vmalloc(s, SCS_SIZE); + kasan_unpoison_vmalloc(s, SCS_SIZE, KASAN_VMALLOC_NONE); memset(s, 0, SCS_SIZE); return s; } @@ -78,7 +78,7 @@ void scs_free(void *s) if (this_cpu_cmpxchg(scs_cache[i], 0, s) == NULL) return; - kasan_unpoison_vmalloc(s, SCS_SIZE); + kasan_unpoison_vmalloc(s, SCS_SIZE, KASAN_VMALLOC_NONE); vfree_atomic(s); } diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 76cf2b6229c79..21104fd518727 100644 --- 
a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -192,6 +192,98 @@ void __init kasan_init_hw_tags(void) kasan_stack_collection_enabled() ? "on" : "off"); } +#ifdef CONFIG_KASAN_VMALLOC + +static void unpoison_vmalloc_pages(const void *addr, u8 tag) +{ + struct vm_struct *area; + int i; + + /* + * As hardware tag-based KASAN only tags VM_ALLOC vmalloc allocations + * (see the comment in __kasan_unpoison_vmalloc), all of the pages + * should belong to a single area. + */ + area = find_vm_area((void *)addr); + if (WARN_ON(!area)) + return; + + for (i = 0; i < area->nr_pages; i++) { + struct page *page = area->pages[i]; + + page_kasan_tag_set(page, tag); + } +} + +void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, + kasan_vmalloc_flags_t flags) +{ + u8 tag; + unsigned long redzone_start, redzone_size; + + if (!is_vmalloc_or_module_addr(start)) + return (void *)start; + + /* + * Skip unpoisoning and assigning a pointer tag for non-VM_ALLOC + * mappings as: + * + * 1. Unlike the software KASAN modes, hardware tag-based KASAN only + * supports tagging physical memory. Therefore, it can only tag a + * single mapping of normal physical pages. + * 2. Hardware tag-based KASAN can only tag memory mapped with special + * mapping protection bits, see arch_vmalloc_pgprot_modify(). + * As non-VM_ALLOC mappings can be mapped outside of vmalloc code, + * providing these bits would require tracking all non-VM_ALLOC + * mappers. + * + * Thus, for VM_ALLOC mappings, hardware tag-based KASAN only tags + * the first virtual mapping, which is created by vmalloc(). + * Tagging the page_alloc memory backing that vmalloc() allocation is + * skipped, see ___GFP_SKIP_KASAN_UNPOISON. + * + * For non-VM_ALLOC allocations, page_alloc memory is tagged as usual. + */ + if (!(flags & KASAN_VMALLOC_VM_ALLOC)) + return (void *)start; + + tag = kasan_random_tag(); + start = set_tag(start, tag); + + /* Unpoison and initialize memory up to size. */ + kasan_unpoison(start, size, flags & KASAN_VMALLOC_INIT); + + /* + * Explicitly poison and initialize the in-page vmalloc() redzone. + * Unlike software KASAN modes, hardware tag-based KASAN doesn't + * unpoison memory when populating shadow for vmalloc() space. + */ + redzone_start = round_up((unsigned long)start + size, + KASAN_GRANULE_SIZE); + redzone_size = round_up(redzone_start, PAGE_SIZE) - redzone_start; + kasan_poison((void *)redzone_start, redzone_size, KASAN_TAG_INVALID, + flags & KASAN_VMALLOC_INIT); + + /* + * Set per-page tag flags to allow accessing physical memory for the + * vmalloc() mapping through page_address(vmalloc_to_page()). + */ + unpoison_vmalloc_pages(start, tag); + + return (void *)start; +} + +void __kasan_poison_vmalloc(const void *start, unsigned long size) +{ + /* + * No tagging here. + * The physical pages backing the vmalloc() allocation are poisoned + * through the usual page_alloc paths. + */ +} + +#endif + #if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) void kasan_enable_tagging_sync(void) diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 5a866f6663fc0..b958babc8feda 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -475,8 +475,16 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, } } -void *__kasan_unpoison_vmalloc(const void *start, unsigned long size) +void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, + kasan_vmalloc_flags_t flags) { + /* + * Software KASAN modes unpoison both VM_ALLOC and non-VM_ALLOC + * mappings, so the KASAN_VMALLOC_VM_ALLOC flag is ignored. 
+ * Software KASAN modes can't optimize zeroing memory by combining it + * with setting memory tags, so the KASAN_VMALLOC_INIT flag is ignored. + */ + if (!is_vmalloc_or_module_addr(start)) return (void *)start; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index b65adac1cd802..6dcdf815576b3 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2216,8 +2216,12 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node) return NULL; } - /* Mark the pages as accessible, now that they are mapped. */ - mem = kasan_unpoison_vmalloc(mem, size); + /* + * Mark the pages as accessible, now that they are mapped. + * With hardware tag-based KASAN, marking is skipped for + * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). + */ + mem = kasan_unpoison_vmalloc(mem, size, KASAN_VMALLOC_NONE); return mem; } @@ -2451,9 +2455,12 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, * best-effort approach, as they can be mapped outside of vmalloc code. * For VM_ALLOC mappings, the pages are marked as accessible after * getting mapped in __vmalloc_node_range(). + * With hardware tag-based KASAN, marking is skipped for + * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). */ if (!(flags & VM_ALLOC)) - area->addr = kasan_unpoison_vmalloc(area->addr, requested_size); + area->addr = kasan_unpoison_vmalloc(area->addr, requested_size, + KASAN_VMALLOC_NONE); return area; } @@ -3064,6 +3071,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, { struct vm_struct *area; void *ret; + kasan_vmalloc_flags_t kasan_flags; unsigned long real_size = size; unsigned long real_align = align; unsigned int shift = PAGE_SHIFT; @@ -3116,21 +3124,39 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, goto fail; } - /* - * Modify protection bits to allow tagging. - * This must be done before mapping by __vmalloc_area_node(). - */ + /* Prepare arguments for __vmalloc_area_node(). */ if (kasan_hw_tags_enabled() && - pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) + pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) { + /* + * Modify protection bits to allow tagging. + * This must be done before mapping in __vmalloc_area_node(). + */ prot = arch_vmap_pgprot_tagged(prot); + /* + * Skip page_alloc poisoning and zeroing for physical pages + * backing VM_ALLOC mapping. Memory is instead poisoned and + * zeroed by kasan_unpoison_vmalloc(). + */ + gfp_mask |= __GFP_SKIP_KASAN_UNPOISON | __GFP_SKIP_ZERO; + } + /* Allocate physical pages and map them into vmalloc space. */ ret = __vmalloc_area_node(area, gfp_mask, prot, shift, node); if (!ret) goto fail; - /* Mark the pages as accessible, now that they are mapped. */ - area->addr = kasan_unpoison_vmalloc(area->addr, real_size); + /* + * Mark the pages as accessible, now that they are mapped. + * The init condition should match the one in post_alloc_hook() + * (except for the should_skip_init() check) to make sure that memory + * is initialized under the same conditions regardless of the enabled + * KASAN mode. + */ + kasan_flags = KASAN_VMALLOC_VM_ALLOC; + if (!want_init_on_free() && want_init_on_alloc(gfp_mask)) + kasan_flags |= KASAN_VMALLOC_INIT; + area->addr = kasan_unpoison_vmalloc(area->addr, real_size, kasan_flags); /* * In this function, newly allocated vm_struct has VM_UNINITIALIZED @@ -3830,10 +3856,13 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, /* * Mark allocated areas as accessible. Do it now as a best-effort * approach, as they can be mapped outside of vmalloc code. 
+ * With hardware tag-based KASAN, marking is skipped for + * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). */ for (area = 0; area < nr_vms; area++) vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr, - vms[area]->size); + vms[area]->size, + KASAN_VMALLOC_NONE); kfree(vas); return vms;
From 831af5e7f050e2c4cc0aa1989753d14e6361cae7 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 27 Jan 2022 16:25:53 +1100 Subject: [PATCH 062/154] kasan, vmalloc: only tag normal vmalloc allocations
The kernel can use vmalloc() to allocate executable memory. The only supported way to do that is via __vmalloc_node_range() with the executable bit set in the prot argument. (vmap() resets the bit via pgprot_nx()).
Once tag-based KASAN modes start tagging vmalloc allocations, executing code from such allocations will lead to the PC register getting a tag, which is not tolerated by the kernel.
Only tag the allocations for normal kernel pages.
Link: https://lkml.kernel.org/r/fbfd9939a4dc375923c9a5c6b9e7ab05c26b8c6b.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell
--- include/linux/kasan.h | 7 ++++--- mm/kasan/hw_tags.c | 7 +++++++ mm/kasan/shadow.c | 7 +++++++ mm/vmalloc.c | 49 +++++++++++++++++++++++++------------------ 4 files changed, 47 insertions(+), 23 deletions(-)
diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 499f1573dba4c..3593c95d1fa54 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h
@@ -27,9 +27,10 @@ struct kunit_kasan_expectation { typedef unsigned int __bitwise kasan_vmalloc_flags_t; -#define KASAN_VMALLOC_NONE 0x00u -#define KASAN_VMALLOC_INIT 0x01u -#define KASAN_VMALLOC_VM_ALLOC 0x02u +#define KASAN_VMALLOC_NONE 0x00u +#define KASAN_VMALLOC_INIT 0x01u +#define KASAN_VMALLOC_VM_ALLOC 0x02u +#define KASAN_VMALLOC_PROT_NORMAL 0x04u #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 21104fd518727..2e9378a4f07f1 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c
@@ -247,6 +247,13 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, if (!(flags & KASAN_VMALLOC_VM_ALLOC)) return (void *)start; + /* + * Don't tag executable memory. + * The kernel doesn't tolerate having the PC register tagged. + */ + if (!(flags & KASAN_VMALLOC_PROT_NORMAL)) + return (void *)start; + tag = kasan_random_tag(); start = set_tag(start, tag);
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index b958babc8feda..7272e248db87d 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c
@@ -488,6 +488,13 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, if (!is_vmalloc_or_module_addr(start)) return (void *)start; + /* + * Don't tag executable memory. + * The kernel doesn't tolerate having the PC register tagged. 
+ */ + if (!(flags & KASAN_VMALLOC_PROT_NORMAL)) + return (void *)start; + start = set_tag(start, kasan_random_tag()); kasan_unpoison(start, size, false); return (void *)start; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 6dcdf815576b3..375b53fd939f1 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2221,7 +2221,7 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node) * With hardware tag-based KASAN, marking is skipped for * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). */ - mem = kasan_unpoison_vmalloc(mem, size, KASAN_VMALLOC_NONE); + mem = kasan_unpoison_vmalloc(mem, size, KASAN_VMALLOC_PROT_NORMAL); return mem; } @@ -2460,7 +2460,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, */ if (!(flags & VM_ALLOC)) area->addr = kasan_unpoison_vmalloc(area->addr, requested_size, - KASAN_VMALLOC_NONE); + KASAN_VMALLOC_PROT_NORMAL); return area; } @@ -3071,7 +3071,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, { struct vm_struct *area; void *ret; - kasan_vmalloc_flags_t kasan_flags; + kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE; unsigned long real_size = size; unsigned long real_align = align; unsigned int shift = PAGE_SHIFT; @@ -3124,21 +3124,28 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, goto fail; } - /* Prepare arguments for __vmalloc_area_node(). */ - if (kasan_hw_tags_enabled() && - pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) { - /* - * Modify protection bits to allow tagging. - * This must be done before mapping in __vmalloc_area_node(). - */ - prot = arch_vmap_pgprot_tagged(prot); + /* + * Prepare arguments for __vmalloc_area_node() and + * kasan_unpoison_vmalloc(). + */ + if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) { + if (kasan_hw_tags_enabled()) { + /* + * Modify protection bits to allow tagging. + * This must be done before mapping. + */ + prot = arch_vmap_pgprot_tagged(prot); - /* - * Skip page_alloc poisoning and zeroing for physical pages - * backing VM_ALLOC mapping. Memory is instead poisoned and - * zeroed by kasan_unpoison_vmalloc(). - */ - gfp_mask |= __GFP_SKIP_KASAN_UNPOISON | __GFP_SKIP_ZERO; + /* + * Skip page_alloc poisoning and zeroing for physical + * pages backing VM_ALLOC mapping. Memory is instead + * poisoned and zeroed by kasan_unpoison_vmalloc(). + */ + gfp_mask |= __GFP_SKIP_KASAN_UNPOISON | __GFP_SKIP_ZERO; + } + + /* Take note that the mapping is PAGE_KERNEL. */ + kasan_flags |= KASAN_VMALLOC_PROT_NORMAL; } /* Allocate physical pages and map them into vmalloc space. */ @@ -3152,10 +3159,13 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, * (except for the should_skip_init() check) to make sure that memory * is initialized under the same conditions regardless of the enabled * KASAN mode. + * Tag-based KASAN modes only assign tags to normal non-executable + * allocations, see __kasan_unpoison_vmalloc(). */ - kasan_flags = KASAN_VMALLOC_VM_ALLOC; + kasan_flags |= KASAN_VMALLOC_VM_ALLOC; if (!want_init_on_free() && want_init_on_alloc(gfp_mask)) kasan_flags |= KASAN_VMALLOC_INIT; + /* KASAN_VMALLOC_PROT_NORMAL already set if required. 
*/ area->addr = kasan_unpoison_vmalloc(area->addr, real_size, kasan_flags); /* @@ -3861,8 +3871,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, */ for (area = 0; area < nr_vms; area++) vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr, - vms[area]->size, - KASAN_VMALLOC_NONE); + vms[area]->size, KASAN_VMALLOC_PROT_NORMAL); kfree(vas); return vms; From c9944678742d7377382423d84b7883cc163e663f Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 27 Jan 2022 16:25:53 +1100 Subject: [PATCH 063/154] kasan, arm64: don't tag executable vmalloc allocations Besides asking vmalloc memory to be executable via the prot argument of __vmalloc_node_range() (see the previous patch), the kernel can skip that bit and instead mark memory as executable via set_memory_x(). Once tag-based KASAN modes start tagging vmalloc allocations, executing code from such allocations will lead to the PC register getting a tag, which is not tolerated by the kernel. Generic kernel code typically allocates memory via module_alloc() if it intends to mark memory as executable. (On arm64 module_alloc() uses __vmalloc_node_range() without setting the executable bit). Thus, reset pointer tags of pointers returned from module_alloc(). However, on arm64 there's an exception: the eBPF subsystem. Instead of using module_alloc(), it uses vmalloc() (via bpf_jit_alloc_exec()) to allocate its JIT region. Thus, reset pointer tags of pointers returned from bpf_jit_alloc_exec(). Resetting tags for these pointers results in untagged pointers being passed to set_memory_x(). This causes conflicts in arithmetic checks in change_memory_common(), as vm_struct->addr pointer returned by find_vm_area() is tagged. Reset pointer tag of find_vm_area(addr)->addr in change_memory_common(). Link: https://lkml.kernel.org/r/b7b2595423340cd7d76b770e5d519acf3b72f0ab.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Catalin Marinas Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/arm64/kernel/module.c | 3 ++- arch/arm64/mm/pageattr.c | 2 +- arch/arm64/net/bpf_jit_comp.c | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index d3a1fa8183487..f2d4bb14bfabe 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -63,7 +63,8 @@ void *module_alloc(unsigned long size) return NULL; } - return p; + /* Memory is intended to be executable, reset the pointer tag. 
*/ + return kasan_reset_tag(p); } enum aarch64_reloc_op {
diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index a3bacd79507a4..64e985eaa52d8 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c
@@ -85,7 +85,7 @@ static int change_memory_common(unsigned long addr, int numpages, */ area = find_vm_area((void *)addr); if (!area || - end > (unsigned long)area->addr + area->size || + end > (unsigned long)kasan_reset_tag(area->addr) + area->size || !(area->flags & VM_ALLOC)) return -EINVAL;
diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index e96d4d87291f3..2198af06ae6a7 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c
@@ -1150,7 +1150,8 @@ u64 bpf_jit_alloc_exec_limit(void) void *bpf_jit_alloc_exec(unsigned long size) { - return vmalloc(size); + /* Memory is intended to be executable, reset the pointer tag. */ + return kasan_reset_tag(vmalloc(size)); } void bpf_jit_free_exec(void *addr)
From 56c115f0f69d34788175167e48d0a3b397430569 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 27 Jan 2022 16:25:54 +1100 Subject: [PATCH 064/154] kasan: mark kasan_arg_stacktrace as __initdata
As kasan_arg_stacktrace is only used in __init functions, mark it as __initdata instead of __ro_after_init to allow it to be freed after boot.
The other enums for KASAN args are used in kasan_init_hw_tags_cpu(), which is not marked as __init as a CPU can be hot-plugged after boot. Clarify this in a comment.
Link: https://lkml.kernel.org/r/7fa090865614f8e0c6c1265508efb1d429afaa50.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Suggested-by: Marco Elver Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell
--- mm/kasan/hw_tags.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 2e9378a4f07f1..6509809dd5d8c 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c
@@ -40,7 +40,7 @@ enum kasan_arg_stacktrace { static enum kasan_arg kasan_arg __ro_after_init; static enum kasan_arg_mode kasan_arg_mode __ro_after_init; -static enum kasan_arg_stacktrace kasan_arg_stacktrace __ro_after_init; +static enum kasan_arg_stacktrace kasan_arg_stacktrace __initdata; /* Whether KASAN is enabled at all. */ DEFINE_STATIC_KEY_FALSE(kasan_flag_enabled);
@@ -116,7 +116,10 @@ static inline const char *kasan_mode_info(void) return "sync"; } -/* kasan_init_hw_tags_cpu() is called for each CPU. */ +/* + * kasan_init_hw_tags_cpu() is called for each CPU. + * Not marked as __init as a CPU can be hot-plugged after boot. + */ void kasan_init_hw_tags_cpu(void) { /*
From 040841c1692935d19f55129281611f164f9ca11f Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 27 Jan 2022 16:25:54 +1100 Subject: [PATCH 065/154] kasan: clean up feature flags for HW_TAGS mode
- Untie kasan_init_hw_tags() code from the default values of kasan_arg_mode and kasan_arg_stacktrace. - Move static_branch_enable(&kasan_flag_enabled) to the end of kasan_init_hw_tags(). - Remove excessive comments in kasan_arg_mode switch. - Add new comments.
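For reference, an illustrative kernel command line exercising the boot-time switches these flags back (one possible combination, not a recommendation):

	kasan=on kasan.mode=async kasan.stacktrace=off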
Link: https://lkml.kernel.org/r/76ebb340265be57a218564a497e1f52ff36a3879.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/kasan/hw_tags.c | 38 +++++++++++++++++++++----------------- mm/kasan/kasan.h | 2 +- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 6509809dd5d8c..6a3146d1ccc55 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -42,16 +42,22 @@ static enum kasan_arg kasan_arg __ro_after_init; static enum kasan_arg_mode kasan_arg_mode __ro_after_init; static enum kasan_arg_stacktrace kasan_arg_stacktrace __initdata; -/* Whether KASAN is enabled at all. */ +/* + * Whether KASAN is enabled at all. + * The value remains false until KASAN is initialized by kasan_init_hw_tags(). + */ DEFINE_STATIC_KEY_FALSE(kasan_flag_enabled); EXPORT_SYMBOL(kasan_flag_enabled); -/* Whether the selected mode is synchronous/asynchronous/asymmetric.*/ +/* + * Whether the selected mode is synchronous, asynchronous, or asymmetric. + * Defaults to KASAN_MODE_SYNC. + */ enum kasan_mode kasan_mode __ro_after_init; EXPORT_SYMBOL_GPL(kasan_mode); /* Whether to collect alloc/free stack traces. */ -DEFINE_STATIC_KEY_FALSE(kasan_flag_stacktrace); +DEFINE_STATIC_KEY_TRUE(kasan_flag_stacktrace); /* kasan=off/on */ static int __init early_kasan_flag(char *arg) @@ -127,7 +133,11 @@ void kasan_init_hw_tags_cpu(void) * as this function is only called for MTE-capable hardware. */ - /* If KASAN is disabled via command line, don't initialize it. */ + /* + * If KASAN is disabled via command line, don't initialize it. + * When this function is called, kasan_flag_enabled is not yet + * set by kasan_init_hw_tags(). Thus, check kasan_arg instead. + */ if (kasan_arg == KASAN_ARG_OFF) return; @@ -154,42 +164,36 @@ void __init kasan_init_hw_tags(void) if (kasan_arg == KASAN_ARG_OFF) return; - /* Enable KASAN. */ - static_branch_enable(&kasan_flag_enabled); - switch (kasan_arg_mode) { case KASAN_ARG_MODE_DEFAULT: - /* - * Default to sync mode. - */ - fallthrough; + /* Default is specified by kasan_mode definition. */ + break; case KASAN_ARG_MODE_SYNC: - /* Sync mode enabled. */ kasan_mode = KASAN_MODE_SYNC; break; case KASAN_ARG_MODE_ASYNC: - /* Async mode enabled. */ kasan_mode = KASAN_MODE_ASYNC; break; case KASAN_ARG_MODE_ASYMM: - /* Asymm mode enabled. */ kasan_mode = KASAN_MODE_ASYMM; break; } switch (kasan_arg_stacktrace) { case KASAN_ARG_STACKTRACE_DEFAULT: - /* Default to enabling stack trace collection. */ - static_branch_enable(&kasan_flag_stacktrace); + /* Default is specified by kasan_flag_stacktrace definition. */ break; case KASAN_ARG_STACKTRACE_OFF: - /* Do nothing, kasan_flag_stacktrace keeps its default value. */ + static_branch_disable(&kasan_flag_stacktrace); break; case KASAN_ARG_STACKTRACE_ON: static_branch_enable(&kasan_flag_stacktrace); break; } + /* KASAN is now initialized, enable it. */ + static_branch_enable(&kasan_flag_enabled); + pr_info("KernelAddressSanitizer initialized (hw-tags, mode=%s, stacktrace=%s)\n", kasan_mode_info(), kasan_stack_collection_enabled() ? 
"on" : "off"); diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 020f3e57a03f5..efda13a9ce6ad 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -12,7 +12,7 @@ #include #include "../slab.h" -DECLARE_STATIC_KEY_FALSE(kasan_flag_stacktrace); +DECLARE_STATIC_KEY_TRUE(kasan_flag_stacktrace); enum kasan_mode { KASAN_MODE_SYNC, From b15ec419b7036eb365823f4d44b97bf5180a789b Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 27 Jan 2022 16:25:54 +1100 Subject: [PATCH 066/154] kasan: add kasan.vmalloc command line flag Allow disabling vmalloc() tagging for HW_TAGS KASAN via a kasan.vmalloc command line switch. This is a fail-safe switch intended for production systems that enable HW_TAGS KASAN. In case vmalloc() tagging ends up having an issue not detected during testing but that manifests in production, kasan.vmalloc allows to turn vmalloc() tagging off while leaving page_alloc/slab tagging on. Link: https://lkml.kernel.org/r/904f6d4dfa94870cc5fc2660809e093fd0d27c3b.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/kasan/hw_tags.c | 45 ++++++++++++++++++++++++++++++++++++++++++++- mm/kasan/kasan.h | 6 ++++++ 2 files changed, 50 insertions(+), 1 deletion(-) diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 6a3146d1ccc55..fad1887e54c05 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -32,6 +32,12 @@ enum kasan_arg_mode { KASAN_ARG_MODE_ASYMM, }; +enum kasan_arg_vmalloc { + KASAN_ARG_VMALLOC_DEFAULT, + KASAN_ARG_VMALLOC_OFF, + KASAN_ARG_VMALLOC_ON, +}; + enum kasan_arg_stacktrace { KASAN_ARG_STACKTRACE_DEFAULT, KASAN_ARG_STACKTRACE_OFF, @@ -40,6 +46,7 @@ enum kasan_arg_stacktrace { static enum kasan_arg kasan_arg __ro_after_init; static enum kasan_arg_mode kasan_arg_mode __ro_after_init; +static enum kasan_arg_vmalloc kasan_arg_vmalloc __initdata; static enum kasan_arg_stacktrace kasan_arg_stacktrace __initdata; /* @@ -56,6 +63,9 @@ EXPORT_SYMBOL(kasan_flag_enabled); enum kasan_mode kasan_mode __ro_after_init; EXPORT_SYMBOL_GPL(kasan_mode); +/* Whether to enable vmalloc tagging. */ +DEFINE_STATIC_KEY_TRUE(kasan_flag_vmalloc); + /* Whether to collect alloc/free stack traces. */ DEFINE_STATIC_KEY_TRUE(kasan_flag_stacktrace); @@ -95,6 +105,23 @@ static int __init early_kasan_mode(char *arg) } early_param("kasan.mode", early_kasan_mode); +/* kasan.vmalloc=off/on */ +static int __init early_kasan_flag_vmalloc(char *arg) +{ + if (!arg) + return -EINVAL; + + if (!strcmp(arg, "off")) + kasan_arg_vmalloc = KASAN_ARG_VMALLOC_OFF; + else if (!strcmp(arg, "on")) + kasan_arg_vmalloc = KASAN_ARG_VMALLOC_ON; + else + return -EINVAL; + + return 0; +} +early_param("kasan.vmalloc", early_kasan_flag_vmalloc); + /* kasan.stacktrace=off/on */ static int __init early_kasan_flag_stacktrace(char *arg) { @@ -179,6 +206,18 @@ void __init kasan_init_hw_tags(void) break; } + switch (kasan_arg_vmalloc) { + case KASAN_ARG_VMALLOC_DEFAULT: + /* Default is specified by kasan_flag_vmalloc definition. */ + break; + case KASAN_ARG_VMALLOC_OFF: + static_branch_disable(&kasan_flag_vmalloc); + break; + case KASAN_ARG_VMALLOC_ON: + static_branch_enable(&kasan_flag_vmalloc); + break; + } + switch (kasan_arg_stacktrace) { case KASAN_ARG_STACKTRACE_DEFAULT: /* Default is specified by kasan_flag_stacktrace definition. 
*/ @@ -194,8 +233,9 @@ void __init kasan_init_hw_tags(void) /* KASAN is now initialized, enable it. */ static_branch_enable(&kasan_flag_enabled); - pr_info("KernelAddressSanitizer initialized (hw-tags, mode=%s, stacktrace=%s)\n", + pr_info("KernelAddressSanitizer initialized (hw-tags, mode=%s, vmalloc=%s, stacktrace=%s)\n", kasan_mode_info(), + kasan_vmalloc_enabled() ? "on" : "off", kasan_stack_collection_enabled() ? "on" : "off"); } @@ -228,6 +268,9 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, u8 tag; unsigned long redzone_start, redzone_size; + if (!kasan_vmalloc_enabled()) + return (void *)start; + if (!is_vmalloc_or_module_addr(start)) return (void *)start; diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index efda13a9ce6ad..4d67408e84076 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -12,6 +12,7 @@ #include #include "../slab.h" +DECLARE_STATIC_KEY_TRUE(kasan_flag_vmalloc); DECLARE_STATIC_KEY_TRUE(kasan_flag_stacktrace); enum kasan_mode { @@ -22,6 +23,11 @@ enum kasan_mode { extern enum kasan_mode kasan_mode __ro_after_init; +static inline bool kasan_vmalloc_enabled(void) +{ + return static_branch_likely(&kasan_flag_vmalloc); +} + static inline bool kasan_stack_collection_enabled(void) { return static_branch_unlikely(&kasan_flag_stacktrace); From 7991da4522cd6858415dc127081fb70133db874e Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 27 Jan 2022 16:25:54 +1100 Subject: [PATCH 067/154] kasan: allow enabling KASAN_VMALLOC and SW/HW_TAGS Allow enabling CONFIG_KASAN_VMALLOC with SW_TAGS and HW_TAGS KASAN modes. Also adjust CONFIG_KASAN_VMALLOC description: - Mention HW_TAGS support. - Remove unneeded internal details: they have no place in Kconfig description and are already explained in the documentation. Link: https://lkml.kernel.org/r/bfa0fdedfe25f65e5caa4e410f074ddbac7a0b59.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- lib/Kconfig.kasan | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index 879757b6dd149..1f3e620188a24 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -178,17 +178,17 @@ config KASAN_TAGS_IDENTIFY memory consumption. config KASAN_VMALLOC - bool "Back mappings in vmalloc space with real shadow memory" - depends on KASAN_GENERIC && HAVE_ARCH_KASAN_VMALLOC + bool "Check accesses to vmalloc allocations" + depends on HAVE_ARCH_KASAN_VMALLOC help - By default, the shadow region for vmalloc space is the read-only - zero page. This means that KASAN cannot detect errors involving - vmalloc space. - - Enabling this option will hook in to vmap/vmalloc and back those - mappings with real shadow memory allocated on demand. This allows - for KASAN to detect more sorts of errors (and to support vmapped - stacks), but at the cost of higher memory usage. + This mode makes KASAN check accesses to vmalloc allocations for + validity. + + With software KASAN modes, checking is done for all types of vmalloc + allocations. Enabling this option leads to higher memory usage. + + With hardware tag-based KASAN, only VM_ALLOC mappings are checked. + There is no additional memory usage. 
config KASAN_KUNIT_TEST tristate "KUnit-compatible tests of KASAN bug detection capabilities" if !KUNIT_ALL_TESTS
From 406e112a59dd26bb50df335d03275f893cbcb5ee Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 27 Jan 2022 16:25:55 +1100 Subject: [PATCH 068/154] arm64: select KASAN_VMALLOC for SW/HW_TAGS modes
Generic KASAN already selects KASAN_VMALLOC to allow VMAP_STACK to be selected unconditionally, see commit acc3042d62cb9 ("arm64: Kconfig: select KASAN_VMALLOC if KANSAN_GENERIC is enabled"). The same change is needed for SW_TAGS KASAN.
HW_TAGS KASAN does not require enabling KASAN_VMALLOC for VMAP_STACK; they already work together as is. Still, selecting KASAN_VMALLOC makes sense to keep vmalloc() always protected. In case any bugs in KASAN's vmalloc() support are discovered, the command line kasan.vmalloc flag can be used to disable vmalloc() checking.
Select KASAN_VMALLOC for all KASAN modes for arm64.
Link: https://lkml.kernel.org/r/99d6b3ebf57fc1930ff71f9a4a71eea19881b270.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Catalin Marinas Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell
--- arch/arm64/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 6978140edfa44..beefec5c7b900 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig
@@ -205,7 +205,7 @@ config ARM64 select IOMMU_DMA if IOMMU_SUPPORT select IRQ_DOMAIN select IRQ_FORCED_THREADING - select KASAN_VMALLOC if KASAN_GENERIC + select KASAN_VMALLOC if KASAN select MODULES_USE_ELF_RELA select NEED_DMA_MAP_STATE select NEED_SG_DMA_LENGTH
From 8b6babb4dc085454267a3fd6a0208af57085890d Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 27 Jan 2022 16:25:55 +1100 Subject: [PATCH 069/154] kasan: documentation updates
Update KASAN documentation: - Bump Clang version requirement for HW_TAGS as ARM64_MTE depends on AS_HAS_LSE_ATOMICS as of commit 2decad92f4731 ("arm64: mte: Ensure TIF_MTE_ASYNC_FAULT is set atomically"), which requires Clang 12. - Add description of the new kasan.vmalloc command line flag. - Mention that SW_TAGS and HW_TAGS modes now support vmalloc tagging. - Explicitly say that the "Shadow memory" section is only applicable to software KASAN modes. - Mention that shadow-based KASAN_VMALLOC is supported on arm64.
Link: https://lkml.kernel.org/r/a61189128fa3f9fbcfd9884ff653d401864b8e74.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell
--- Documentation/dev-tools/kasan.rst | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-)
diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index 8089c559d339c..7614a1fc30fac 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst
@@ -30,7 +30,7 @@ Software tag-based KASAN mode is only supported in Clang. The hardware KASAN mode (#3) relies on hardware to perform the checks but still requires a compiler version that supports memory tagging instructions. -This mode is supported in GCC 10+ and Clang 11+. 
+This mode is supported in GCC 10+ and Clang 12+. Both software KASAN modes work with SLUB and SLAB memory allocators, while the hardware tag-based KASAN currently only supports SLUB. @@ -206,6 +206,9 @@ additional boot parameters that allow disabling KASAN or controlling features: Asymmetric mode: a bad access is detected synchronously on reads and asynchronously on writes. +- ``kasan.vmalloc=off`` or ``=on`` disables or enables tagging of vmalloc + allocations (default: ``on``). + - ``kasan.stacktrace=off`` or ``=on`` disables or enables alloc and free stack traces collection (default: ``on``). @@ -279,8 +282,8 @@ Software tag-based KASAN uses 0xFF as a match-all pointer tag (accesses through pointers with the 0xFF pointer tag are not checked). The value 0xFE is currently reserved to tag freed memory regions. -Software tag-based KASAN currently only supports tagging of slab and page_alloc -memory. +Software tag-based KASAN currently only supports tagging of slab, page_alloc, +and vmalloc memory. Hardware tag-based KASAN ~~~~~~~~~~~~~~~~~~~~~~~~ @@ -303,8 +306,8 @@ Hardware tag-based KASAN uses 0xFF as a match-all pointer tag (accesses through pointers with the 0xFF pointer tag are not checked). The value 0xFE is currently reserved to tag freed memory regions. -Hardware tag-based KASAN currently only supports tagging of slab and page_alloc -memory. +Hardware tag-based KASAN currently only supports tagging of slab, page_alloc, +and VM_ALLOC-based vmalloc memory. If the hardware does not support MTE (pre ARMv8.5), hardware tag-based KASAN will not be enabled. In this case, all KASAN boot parameters are ignored. @@ -319,6 +322,8 @@ checking gets disabled. Shadow memory ------------- +The contents of this section are only applicable to software KASAN modes. + The kernel maps memory in several different parts of the address space. The range of kernel virtual addresses is large: there is not enough real memory to support a real shadow region for every address that could be @@ -349,7 +354,7 @@ CONFIG_KASAN_VMALLOC With ``CONFIG_KASAN_VMALLOC``, KASAN can cover vmalloc space at the cost of greater memory usage. Currently, this is supported on x86, -riscv, s390, and powerpc. +arm64, riscv, s390, and powerpc. This works by hooking into vmalloc and vmap and dynamically allocating real shadow memory to back the mappings. From 96304a5b9bff6287fe7da9c20f253b3023553782 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 27 Jan 2022 16:25:55 +1100 Subject: [PATCH 070/154] kasan: improve vmalloc tests Update the existing vmalloc_oob() test to account for the specifics of the tag-based modes. Also add a few new checks and comments. Add new vmalloc-related tests: - vmalloc_helpers_tags() to check that exported vmalloc helpers can handle tagged pointers. - vmap_tags() to check that SW_TAGS mode properly tags vmap() mappings. - vm_map_ram_tags() to check that SW_TAGS mode properly tags vm_map_ram() mappings. - vmalloc_percpu() to check that SW_TAGS mode tags regions allocated for __alloc_percpu(). The tagging of per-cpu mappings is best-effort; proper tagging is tracked in [1]. 
[1] https://bugzilla.kernel.org/show_bug.cgi?id=215019 Link: https://lkml.kernel.org/r/bbdc1c0501c5275e7f26fdb8e2a7b14a40a9f36b.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- lib/test_kasan.c | 189 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 183 insertions(+), 6 deletions(-) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 26a5c9007653a..9dd767d052354 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -19,6 +19,7 @@ #include #include #include +#include #include @@ -1054,21 +1055,181 @@ static void kmalloc_double_kzfree(struct kunit *test) KUNIT_EXPECT_KASAN_FAIL(test, kfree_sensitive(ptr)); } +static void vmalloc_helpers_tags(struct kunit *test) +{ + void *ptr; + int rv; + + /* This test is intended for tag-based modes. */ + KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC); + + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_VMALLOC); + + ptr = vmalloc(PAGE_SIZE); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + /* Check that the returned pointer is tagged. */ + KUNIT_EXPECT_GE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_MIN); + KUNIT_EXPECT_LT(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL); + + /* Make sure exported vmalloc helpers handle tagged pointers. */ + KUNIT_ASSERT_TRUE(test, is_vmalloc_addr(ptr)); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, vmalloc_to_page(ptr)); + + /* Make sure vmalloc'ed memory permissions can be changed. */ + rv = set_memory_ro((unsigned long)ptr, 1); + KUNIT_ASSERT_GE(test, rv, 0); + rv = set_memory_rw((unsigned long)ptr, 1); + KUNIT_ASSERT_GE(test, rv, 0); + + vfree(ptr); +} + static void vmalloc_oob(struct kunit *test) { - void *area; + char *v_ptr, *p_ptr; + struct page *page; + size_t size = PAGE_SIZE / 2 - KASAN_GRANULE_SIZE - 5; KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_VMALLOC); + v_ptr = vmalloc(size); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_ptr); + /* - * We have to be careful not to hit the guard page. + * We have to be careful not to hit the guard page in vmalloc tests. * The MMU will catch that and crash us. */ - area = vmalloc(3000); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, area); - KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)area)[3100]); - vfree(area); + /* Make sure in-bounds accesses are valid. */ + v_ptr[0] = 0; + v_ptr[size - 1] = 0; + + /* + * An unaligned access past the requested vmalloc size. + * Only generic KASAN can precisely detect these. + */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC)) + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)v_ptr)[size]); + + /* An aligned access into the first out-of-bounds granule. */ + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)v_ptr)[size + 5]); + + /* Check that in-bounds accesses to the physical page are valid. */ + page = vmalloc_to_page(v_ptr); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, page); + p_ptr = page_address(page); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_ptr); + p_ptr[0] = 0; + + vfree(v_ptr); + + /* + * We can't check for use-after-unmap bugs in this nor in the following + * vmalloc tests, as the page might be fully unmapped and accessing it + * will crash the kernel. 
+ */ +} + +static void vmap_tags(struct kunit *test) +{ + char *p_ptr, *v_ptr; + struct page *p_page, *v_page; + size_t order = 1; + + /* + * This test is specifically crafted for the software tag-based mode, + * the only tag-based mode that poisons vmap mappings. + */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_SW_TAGS); + + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_VMALLOC); + + p_page = alloc_pages(GFP_KERNEL, order); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_page); + p_ptr = page_address(p_page); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_ptr); + + v_ptr = vmap(&p_page, 1 << order, VM_MAP, PAGE_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_ptr); + + /* + * We can't check for out-of-bounds bugs in this nor in the following + * vmalloc tests, as allocations have page granularity and accessing + * the guard page will crash the kernel. + */ + + KUNIT_EXPECT_GE(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_MIN); + KUNIT_EXPECT_LT(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_KERNEL); + + /* Make sure that in-bounds accesses through both pointers work. */ + *p_ptr = 0; + *v_ptr = 0; + + /* Make sure vmalloc_to_page() correctly recovers the page pointer. */ + v_page = vmalloc_to_page(v_ptr); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_page); + KUNIT_EXPECT_PTR_EQ(test, p_page, v_page); + + vunmap(v_ptr); + free_pages((unsigned long)p_ptr, order); +} + +static void vm_map_ram_tags(struct kunit *test) +{ + char *p_ptr, *v_ptr; + struct page *page; + size_t order = 1; + + /* + * This test is specifically crafted for the software tag-based mode, + * the only tag-based mode that poisons vm_map_ram mappings. + */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_SW_TAGS); + + page = alloc_pages(GFP_KERNEL, order); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, page); + p_ptr = page_address(page); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_ptr); + + v_ptr = vm_map_ram(&page, 1 << order, -1); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_ptr); + + KUNIT_EXPECT_GE(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_MIN); + KUNIT_EXPECT_LT(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_KERNEL); + + /* Make sure that in-bounds accesses through both pointers work. */ + *p_ptr = 0; + *v_ptr = 0; + + vm_unmap_ram(v_ptr, 1 << order); + free_pages((unsigned long)p_ptr, order); +} + +static void vmalloc_percpu(struct kunit *test) +{ + char __percpu *ptr; + int cpu; + + /* + * This test is specifically crafted for the software tag-based mode, + * the only tag-based mode that poisons percpu mappings. + */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_SW_TAGS); + + ptr = __alloc_percpu(PAGE_SIZE, PAGE_SIZE); + + for_each_possible_cpu(cpu) { + char *c_ptr = per_cpu_ptr(ptr, cpu); + + KUNIT_EXPECT_GE(test, (u8)get_tag(c_ptr), (u8)KASAN_TAG_MIN); + KUNIT_EXPECT_LT(test, (u8)get_tag(c_ptr), (u8)KASAN_TAG_KERNEL); + + /* Make sure that in-bounds accesses don't crash the kernel. */ + *c_ptr = 0; + } + + free_percpu(ptr); } /* @@ -1102,6 +1263,18 @@ static void match_all_not_assigned(struct kunit *test) KUNIT_EXPECT_LT(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL); free_pages((unsigned long)ptr, order); } + + if (!IS_ENABLED(CONFIG_KASAN_VMALLOC)) + return; + + for (i = 0; i < 256; i++) { + size = (get_random_int() % 1024) + 1; + ptr = vmalloc(size); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + KUNIT_EXPECT_GE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_MIN); + KUNIT_EXPECT_LT(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL); + vfree(ptr); + } } /* Check that 0xff works as a match-all pointer tag for tag-based modes. 
*/ @@ -1207,7 +1380,11 @@ static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(kasan_bitops_generic), KUNIT_CASE(kasan_bitops_tags), KUNIT_CASE(kmalloc_double_kzfree), + KUNIT_CASE(vmalloc_helpers_tags), KUNIT_CASE(vmalloc_oob), + KUNIT_CASE(vmap_tags), + KUNIT_CASE(vm_map_ram_tags), + KUNIT_CASE(vmalloc_percpu), KUNIT_CASE(match_all_not_assigned), KUNIT_CASE(match_all_ptr_tag), KUNIT_CASE(match_all_mem_tag), From b5ebaf559a5119bd833e6bf12c94d78d7fa397ee Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 27 Jan 2022 16:25:55 +1100 Subject: [PATCH 071/154] mm/memremap: avoid calling kasan_remove_zero_shadow() for device private memory For device private memory, we do not create a linear mapping for the memory because the device memory is un-accessible. Thus we do not add kasan zero shadow for it. So it's unnecessary to do kasan_remove_zero_shadow() for it. Link: https://lkml.kernel.org/r/20220126092602.1425-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memremap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/memremap.c b/mm/memremap.c index d2a72cf2ff831..d9e05952fff66 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -302,7 +302,8 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params, return 0; err_add_memory: - kasan_remove_zero_shadow(__va(range->start), range_len(range)); + if (!is_private) + kasan_remove_zero_shadow(__va(range->start), range_len(range)); err_kasan: untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range)); err_pfn_remap: From 0fd7b606158fb0a9f52c4dfc41f8c3ef434545c5 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Thu, 27 Jan 2022 16:25:56 +1100 Subject: [PATCH 072/154] tools/vm/page_owner_sort.c: sort by stacktrace before culling The contents of page_owner have changed to include more information than the stack trace. On a modern kernel, the blocks look like Page allocated via order 0, mask 0x0(), pid 1, ts 165564237 ns, free_ts 0 ns register_early_stack+0x4b/0x90 init_page_owner+0x39/0x250 kernel_init_freeable+0x11e/0x242 kernel_init+0x16/0x130 Sorting by the contents of .txt will result in almost no repeated pages, as the pid, ts, and free_ts will almost never be the same. Instead, sort by the contents of the stack trace, which we assume to be whatever is after the first line. Link: https://lkml.kernel.org/r/20211124193709.1805776-1-seanga2@gmail.com Signed-off-by: Sean Anderson Cc: Changhee Han Cc: Tang Bin Cc: Zhang Shengju Cc: Zhenliang Wei Cc: Stephen Rothwell Cc: Yinan Zhang Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- tools/vm/page_owner_sort.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index 9ebb84a9c7310..9ad3772a294dc 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -23,6 +23,7 @@ struct block_list { char *txt; + char *stacktrace; int len; int num; int page_num; @@ -51,11 +52,11 @@ int read_block(char *buf, int buf_size, FILE *fin) return -1; /* EOF or no space left in buf. 
} -static int compare_txt(const void *p1, const void *p2) +static int compare_stacktrace(const void *p1, const void *p2) { const struct block_list *l1 = p1, *l2 = p2; - return strcmp(l1->txt, l2->txt); + return strcmp(l1->stacktrace ?: "", l2->stacktrace ?: ""); } static int compare_num(const void *p1, const void *p2)
@@ -121,6 +122,7 @@ static void add_list(char *buf, int len) list[list_size].page_num = get_page_num(buf); memcpy(list[list_size].txt, buf, len); list[list_size].txt[len] = 0; + list[list_size].stacktrace = strchr(list[list_size].txt, '\n'); list_size++; if (list_size % 1000 == 0) { printf("loaded %d\r", list_size);
@@ -199,7 +201,7 @@ int main(int argc, char **argv) printf("sorting ....\n"); - qsort(list, list_size, sizeof(list[0]), compare_txt); + qsort(list, list_size, sizeof(list[0]), compare_stacktrace); list2 = malloc(sizeof(*list) * list_size); if (!list2) {
@@ -211,7 +213,7 @@ int main(int argc, char **argv) for (i = count = 0; i < list_size; i++) { if (count == 0 || - strcmp(list2[count-1].txt, list[i].txt) != 0) { + strcmp(list2[count-1].stacktrace, list[i].stacktrace) != 0) { list2[count++] = list[i]; } else { list2[count-1].num += list[i].num;
From 2eaf8cf582e27ba4913b778f3f7db85fe999b214 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Thu, 27 Jan 2022 16:25:56 +1100 Subject: [PATCH 073/154] tools/vm/page_owner_sort.c: fix NULL-pointer dereference when comparing stack traces
If there is no newline in a block, then strchr returns NULL. We check for this in compare_stacktrace(), but not when culling. Fix this (and any future bugs like it) by replacing NULL stack traces with "" in add_list.
Link: https://lkml.kernel.org/r/20211125162653.1855958-1-seanga2@gmail.com Fixes: d0abbab9e9e9 ("tools/vm/page_owner_sort.c: sort by stacktrace before culling") Signed-off-by: Sean Anderson Cc: Changhee Han Cc: Zhenliang Wei Cc: Zhang Shengju Cc: Tang Bin Cc: Sean Anderson Cc: Stephen Rothwell Cc: Yinan Zhang Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell
--- tools/vm/page_owner_sort.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index 9ad3772a294dc..5582d8454d3bd 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c
@@ -56,7 +56,7 @@ static int compare_stacktrace(const void *p1, const void *p2) { const struct block_list *l1 = p1, *l2 = p2; - return strcmp(l1->stacktrace ?: "", l2->stacktrace ?: ""); + return strcmp(l1->stacktrace, l2->stacktrace); } static int compare_num(const void *p1, const void *p2)
@@ -122,7 +122,7 @@ static void add_list(char *buf, int len) list[list_size].page_num = get_page_num(buf); memcpy(list[list_size].txt, buf, len); list[list_size].txt[len] = 0; - list[list_size].stacktrace = strchr(list[list_size].txt, '\n'); + list[list_size].stacktrace = strchr(list[list_size].txt, '\n') ?: ""; list_size++; if (list_size % 1000 == 0) { printf("loaded %d\r", list_size);
From af4a279ec0894c0c234af3d4bbb5e838bc5350ca Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Thu, 27 Jan 2022 16:25:56 +1100 Subject: [PATCH 074/154] tools/vm/page_owner_sort.c: support sorting by stack trace
This adds the ability to sort by stacktraces. This is helpful when comparing multiple dumps of page_owner taken at different times, since blocks will not be reordered if they were allocated/free'd.
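An illustrative invocation (file names are examples; the debugfs path assumes page_owner is enabled on the running kernel):

	cat /sys/kernel/debug/page_owner > page_owner_full.txt
	./page_owner_sort -s page_owner_full.txt sorted_by_stack.txt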
Link: https://lkml.kernel.org/r/20211124193709.1805776-2-seanga2@gmail.com Signed-off-by: Sean Anderson Cc: Zhenliang Wei Cc: Changhee Han Cc: Tang Bin Cc: Zhang Shengju Cc: Stephen Rothwell Cc: Yinan Zhang Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell
--- tools/vm/page_owner_sort.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-)
diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index 5582d8454d3bd..1b2acf02d3cd6 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c
@@ -29,7 +29,6 @@ struct block_list { int page_num; }; -static int sort_by_memory; static regex_t order_pattern; static struct block_list *list; static int list_size;
@@ -134,13 +133,16 @@ static void add_list(char *buf, int len) static void usage(void) { - printf("Usage: ./page_owner_sort [-m] \n" - "-m Sort by total memory. If this option is unset, sort by times\n" + printf("Usage: ./page_owner_sort [OPTIONS] \n" + "-m Sort by total memory.\n" + "-s Sort by the stack trace.\n" + "-t Sort by times (default).\n" ); }
int main(int argc, char **argv) { + int (*cmp)(const void *, const void *) = compare_num; FILE *fin, *fout; char *buf; int ret, i, count;
@@ -149,10 +151,16 @@ int main(int argc, char **argv) int err; int opt; - while ((opt = getopt(argc, argv, "m")) != -1) + while ((opt = getopt(argc, argv, "mst")) != -1) switch (opt) { case 'm': - sort_by_memory = 1; + cmp = compare_page_num; + break; + case 's': + cmp = compare_stacktrace; + break; + case 't': + cmp = compare_num; break; default: usage();
@@ -221,10 +229,7 @@ int main(int argc, char **argv) } } - if (sort_by_memory) - qsort(list2, count, sizeof(list[0]), compare_page_num); - else - qsort(list2, count, sizeof(list[0]), compare_num); + qsort(list2, count, sizeof(list[0]), cmp); for (i = 0; i < count; i++) fprintf(fout, "%d times, %d pages:\n%s\n",
From 7243475462026fb1d9b5bd047d9aff0eb9c1789e Mon Sep 17 00:00:00 2001 From: Yinan Zhang Date: Thu, 27 Jan 2022 16:25:56 +1100 Subject: [PATCH 075/154] tools/vm/page_owner_sort.c: add switch between culling by stacktrace and txt
Culling by comparing stacktrace would cause loss of some information. For example, if there exist 2 blocks which have the same stacktrace but different head info: Page allocated via order 0, mask 0x108c48(...), pid 73696, ts 1578829190639010 ns, free_ts 1576583851324450 ns prep_new_page+0x80/0xb8 get_page_from_freelist+0x924/0xee8 __alloc_pages+0x138/0xc18 alloc_pages+0x80/0xf0 __page_cache_alloc+0x90/0xc8 Page allocated via order 0, mask 0x108c48(...), pid 61806, ts 1354113726046100 ns, free_ts 1354104926841400 ns prep_new_page+0x80/0xb8 get_page_from_freelist+0x924/0xee8 __alloc_pages+0x138/0xc18 alloc_pages+0x80/0xf0 __page_cache_alloc+0x90/0xc8
After culling, it would be like this: 2 times, 2 pages: Page allocated via order 0, mask 0x108c48(...), pid 73696, ts 1578829190639010 ns, free_ts 1576583851324450 ns prep_new_page+0x80/0xb8 get_page_from_freelist+0x924/0xee8 __alloc_pages+0x138/0xc18 alloc_pages+0x80/0xf0 __page_cache_alloc+0x90/0xc8
The info of the second block is lost. So, add -c to turn on culling by stacktrace. By default, it will cull by txt.
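An illustrative invocation of the new switch, reusing the dump from the example above (file names are examples):

	./page_owner_sort -c page_owner_full.txt culled_by_stack.txt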
Link: https://lkml.kernel.org/r/20211129145658.2491-1-zhangyinan2019@email.szu.edu.cn
Signed-off-by: Yinan Zhang
Cc: Changhee Han
Cc: Sean Anderson
Cc: Stephen Rothwell
Cc: Tang Bin
Cc: Zhang Shengju
Cc: Zhenliang Wei
Signed-off-by: Andrew Morton
Signed-off-by: Stephen Rothwell
---
 tools/vm/page_owner_sort.c | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c
index 1b2acf02d3cd6..492be7f752c04 100644
--- a/tools/vm/page_owner_sort.c
+++ b/tools/vm/page_owner_sort.c
@@ -51,6 +51,13 @@ int read_block(char *buf, int buf_size, FILE *fin)
 	return -1; /* EOF or no space left in buf. */
 }
 
+static int compare_txt(const void *p1, const void *p2)
+{
+	const struct block_list *l1 = p1, *l2 = p2;
+
+	return strcmp(l1->txt, l2->txt);
+}
+
 static int compare_stacktrace(const void *p1, const void *p2)
 {
 	const struct block_list *l1 = p1, *l2 = p2;
@@ -137,12 +144,14 @@ static void usage(void)
 		"-m	Sort by total memory.\n"
 		"-s	Sort by the stack trace.\n"
 		"-t	Sort by times (default).\n"
+		"-c	cull by comparing stacktrace instead of total block.\n"
	);
 }
 
 int main(int argc, char **argv)
 {
 	int (*cmp)(const void *, const void *) = compare_num;
+	int cull_st = 0;
 	FILE *fin, *fout;
 	char *buf;
 	int ret, i, count;
@@ -151,7 +160,7 @@ int main(int argc, char **argv)
 	int err;
 	int opt;
 
-	while ((opt = getopt(argc, argv, "mst")) != -1)
+	while ((opt = getopt(argc, argv, "mstc")) != -1)
 		switch (opt) {
 		case 'm':
 			cmp = compare_page_num;
@@ -162,6 +171,9 @@ int main(int argc, char **argv)
 		case 't':
 			cmp = compare_num;
 			break;
+		case 'c':
+			cull_st = 1;
+			break;
 		default:
 			usage();
 			exit(1);
@@ -209,7 +221,10 @@ int main(int argc, char **argv)
 
 	printf("sorting ....\n");
 
-	qsort(list, list_size, sizeof(list[0]), compare_stacktrace);
+	if (cull_st == 1)
+		qsort(list, list_size, sizeof(list[0]), compare_stacktrace);
+	else
+		qsort(list, list_size, sizeof(list[0]), compare_txt);
 
 	list2 = malloc(sizeof(*list) * list_size);
 	if (!list2) {
@@ -219,9 +234,11 @@ int main(int argc, char **argv)
 
 	printf("culling\n");
 
+	long offset = cull_st ? &list[0].stacktrace - &list[0].txt : 0;
+
 	for (i = count = 0; i < list_size; i++) {
 		if (count == 0 ||
-		    strcmp(list2[count-1].stacktrace, list[i].stacktrace) != 0) {
+		    strcmp(*(&list2[count-1].txt+offset), *(&list[i].txt+offset)) != 0) {
 			list2[count++] = list[i];
 		} else {
 			list2[count-1].num += list[i].num;

From 0828aa0a38635817dcdd95f34f2c3fcffc73a8b4 Mon Sep 17 00:00:00 2001
From: Chongxi Zhao
Date: Thu, 27 Jan 2022 16:25:57 +1100
Subject: [PATCH 076/154] tools/vm/page_owner_sort.c: support sorting pid and time

When viewing the page owner information, we expect that the information
can be sorted by PID, so that we can quickly combine PID with the program
to check the information together.

We also expect that the information can be sorted by time. Time sorting
helps to view the running status of the program according to the time
interval when the program hangs up.

Finally, we hope that page_owner_sort.c can reduce part of the output and
only output the information of blocks whose memory has not been released,
which can help us locate the problems of the program faster.

Therefore, the following adjustments have been made:

1. Add the static functions search_pattern and check_regcomp to improve
the cleanliness.

2. Add member attributes and their corresponding sorting methods. When
comparing the timestamps, an int return value would overflow because the
u64 values are too large, so the ternary operator is used instead.

3.
Add the -f parameter to filter out the information of blocks whose memory has not been released Link: https://lkml.kernel.org/r/20211206165653.5093-1-zhaochongxi2019@email.szu.edu.cn Signed-off-by: Chongxi Zhao Reviewed-by: Sean Anderson Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- tools/vm/page_owner_sort.c | 177 +++++++++++++++++++++++++++++++------ 1 file changed, 148 insertions(+), 29 deletions(-) diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index 492be7f752c04..c9fedc1806d50 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -20,6 +20,7 @@ #include #include #include +#include struct block_list { char *txt; @@ -27,9 +28,15 @@ struct block_list { int len; int num; int page_num; + pid_t pid; + __u64 ts_nsec; + __u64 free_ts_nsec; }; static regex_t order_pattern; +static regex_t pid_pattern; +static regex_t ts_nsec_pattern; +static regex_t free_ts_nsec_pattern; static struct block_list *list; static int list_size; static int max_size; @@ -79,34 +86,124 @@ static int compare_page_num(const void *p1, const void *p2) return l2->page_num - l1->page_num; } -static int get_page_num(char *buf) +static int compare_pid(const void *p1, const void *p2) { - int err, val_len, order_val; - char order_str[4] = {0}; - char *endptr; + const struct block_list *l1 = p1, *l2 = p2; + + return l1->pid - l2->pid; +} + +static int compare_ts(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return l1->ts_nsec < l2->ts_nsec ? -1 : 1; +} + +static int compare_free_ts(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return l1->free_ts_nsec < l2->free_ts_nsec ? -1 : 1; +} + +static int search_pattern(regex_t *pattern, char *pattern_str, char *buf) +{ + int err, val_len; regmatch_t pmatch[2]; - err = regexec(&order_pattern, buf, 2, pmatch, REG_NOTBOL); + err = regexec(pattern, buf, 2, pmatch, REG_NOTBOL); if (err != 0 || pmatch[1].rm_so == -1) { - printf("no order pattern in %s\n", buf); - return 0; + printf("no matching pattern in %s\n", buf); + return -1; } val_len = pmatch[1].rm_eo - pmatch[1].rm_so; - if (val_len > 2) /* max_order should not exceed 2 digits */ - goto wrong_order; - memcpy(order_str, buf + pmatch[1].rm_so, val_len); + memcpy(pattern_str, buf + pmatch[1].rm_so, val_len); + + return 0; +} + +static void check_regcomp(regex_t *pattern, const char *regex) +{ + int err; + + err = regcomp(pattern, regex, REG_EXTENDED | REG_NEWLINE); + if (err != 0 || pattern->re_nsub != 1) { + printf("Invalid pattern %s code %d\n", regex, err); + exit(1); + } +} + +# define FIELD_BUFF 25 + +static int get_page_num(char *buf) +{ + int order_val; + char order_str[FIELD_BUFF] = {0}; + char *endptr; + search_pattern(&order_pattern, order_str, buf); errno = 0; order_val = strtol(order_str, &endptr, 10); - if (errno != 0 || endptr == order_str || *endptr != '\0') - goto wrong_order; + if (order_val > 64 || errno != 0 || endptr == order_str || *endptr != '\0') { + printf("wrong order in follow buf:\n%s\n", buf); + return 0; + } return 1 << order_val; +} -wrong_order: - printf("wrong order in follow buf:\n%s\n", buf); - return 0; +static pid_t get_pid(char *buf) +{ + pid_t pid; + char pid_str[FIELD_BUFF] = {0}; + char *endptr; + + search_pattern(&pid_pattern, pid_str, buf); + errno = 0; + pid = strtol(pid_str, &endptr, 10); + if (errno != 0 || endptr == pid_str || *endptr != '\0') { + printf("wrong/invalid pid in follow buf:\n%s\n", buf); + return -1; + } + + return pid; + +} + +static 
__u64 get_ts_nsec(char *buf) +{ + __u64 ts_nsec; + char ts_nsec_str[FIELD_BUFF] = {0}; + char *endptr; + + search_pattern(&ts_nsec_pattern, ts_nsec_str, buf); + errno = 0; + ts_nsec = strtoull(ts_nsec_str, &endptr, 10); + if (errno != 0 || endptr == ts_nsec_str || *endptr != '\0') { + printf("wrong ts_nsec in follow buf:\n%s\n", buf); + return -1; + } + + return ts_nsec; +} + +static __u64 get_free_ts_nsec(char *buf) +{ + __u64 free_ts_nsec; + char free_ts_nsec_str[FIELD_BUFF] = {0}; + char *endptr; + + search_pattern(&free_ts_nsec_pattern, free_ts_nsec_str, buf); + errno = 0; + free_ts_nsec = strtoull(free_ts_nsec_str, &endptr, 10); + if (errno != 0 || endptr == free_ts_nsec_str || *endptr != '\0') { + printf("wrong free_ts_nsec in follow buf:\n%s\n", buf); + return -1; + } + + return free_ts_nsec; } static void add_list(char *buf, int len) @@ -129,6 +226,11 @@ static void add_list(char *buf, int len) memcpy(list[list_size].txt, buf, len); list[list_size].txt[len] = 0; list[list_size].stacktrace = strchr(list[list_size].txt, '\n') ?: ""; + list[list_size].pid = get_pid(buf); + list[list_size].ts_nsec = get_ts_nsec(buf); + list[list_size].free_ts_nsec = get_free_ts_nsec(buf); + memcpy(list[list_size].txt, buf, len); + list[list_size].txt[len] = 0; list_size++; if (list_size % 1000 == 0) { printf("loaded %d\r", list_size); @@ -144,6 +246,9 @@ static void usage(void) "-m Sort by total memory.\n" "-s Sort by the stack trace.\n" "-t Sort by times (default).\n" + "-p Sort by pid.\n" + "-a Sort by memory allocate time.\n" + "-r Sort by memory release time.\n" "-c cull by comparing stacktrace instead of total block.\n" ); } @@ -152,28 +257,40 @@ int main(int argc, char **argv) { int (*cmp)(const void *, const void *) = compare_num; int cull_st = 0; + int filter = 0; FILE *fin, *fout; char *buf; int ret, i, count; struct block_list *list2; struct stat st; - int err; int opt; - while ((opt = getopt(argc, argv, "mstc")) != -1) + while ((opt = getopt(argc, argv, "acfmprst")) != -1) switch (opt) { + case 'a': + cmp = compare_ts; + break; + case 'c': + cull_st = 1; + break; + case 'f': + filter = 1; + break; case 'm': cmp = compare_page_num; break; + case 'p': + cmp = compare_pid; + break; + case 'r': + cmp = compare_free_ts; + break; case 's': cmp = compare_stacktrace; break; case 't': cmp = compare_num; break; - case 'c': - cull_st = 1; - break; default: usage(); exit(1); @@ -192,13 +309,10 @@ int main(int argc, char **argv) exit(1); } - err = regcomp(&order_pattern, "order\\s*([0-9]*),", REG_EXTENDED|REG_NEWLINE); - if (err != 0 || order_pattern.re_nsub != 1) { - printf("%s: Invalid pattern 'order\\s*([0-9]*),' code %d\n", - argv[0], err); - exit(1); - } - + check_regcomp(&order_pattern, "order\\s*([0-9]*),"); + check_regcomp(&pid_pattern, "pid\\s*([0-9]*),"); + check_regcomp(&ts_nsec_pattern, "ts\\s*([0-9]*)\\s*ns,"); + check_regcomp(&free_ts_nsec_pattern, "free_ts\\s*([0-9]*)\\s*ns"); fstat(fileno(fin), &st); max_size = st.st_size / 100; /* hack ... 
 */
@@ -248,10 +362,15 @@
 
 	qsort(list2, count, sizeof(list[0]), cmp);
 
-	for (i = 0; i < count; i++)
+	for (i = 0; i < count; i++) {
+		if (filter == 1 && list2[i].free_ts_nsec != 0)
+			continue;
 		fprintf(fout, "%d times, %d pages:\n%s\n",
 				list2[i].num, list2[i].page_num, list2[i].txt);
-
+	}
 	regfree(&order_pattern);
+	regfree(&pid_pattern);
+	regfree(&ts_nsec_pattern);
+	regfree(&free_ts_nsec_pattern);
 	return 0;
 }

From 17ae1917ab3afa5ee2e7ad5afd7c88a9a0cfce2d Mon Sep 17 00:00:00 2001
From: Shenghong Han
Date: Thu, 27 Jan 2022 16:25:57 +1100
Subject: [PATCH 077/154] tools/vm/page_owner_sort.c: two trivial fixes

1) There is an unused variable. It's better to delete it.
2) One case is missing in the usage().

Link: https://lkml.kernel.org/r/20211213164518.2461-1-hanshenghong2019@email.szu.edu.cn
Signed-off-by: Shenghong Han
Signed-off-by: Andrew Morton
Signed-off-by: Stephen Rothwell
---
 tools/vm/page_owner_sort.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c
index c9fedc1806d50..284a5070402c3 100644
--- a/tools/vm/page_owner_sort.c
+++ b/tools/vm/page_owner_sort.c
@@ -41,8 +41,6 @@ static struct block_list *list;
 static int list_size;
 static int max_size;
 
-struct block_list *block_head;
-
 int read_block(char *buf, int buf_size, FILE *fin)
 {
 	char *curr = buf, *const buf_end = buf + buf_size;
@@ -249,7 +247,8 @@ static void usage(void)
 		"-p	Sort by pid.\n"
 		"-a	Sort by memory allocate time.\n"
 		"-r	Sort by memory release time.\n"
-		"-c	cull by comparing stacktrace instead of total block.\n"
+		"-c	Cull by comparing stacktrace instead of total block.\n"
+		"-f	Filter out the information of blocks whose memory has not been released.\n"
	);
 }

From 87f0d8ca64f15aebf55f98e379032330128a860c Mon Sep 17 00:00:00 2001
From: Yixuan Cao
Date: Thu, 27 Jan 2022 16:25:57 +1100
Subject: [PATCH 078/154] tools/vm/page_owner_sort.c: delete invalid duplicate code

I noticed that there are two invalid duplicate lines of code. It's better
to delete them.

Link: https://lkml.kernel.org/r/20211213095743.3630-1-caoyixuan2019@email.szu.edu.cn
Signed-off-by: Yixuan Cao
Cc: Mark Brown
Cc: Sean Anderson
Cc: Zhenliang Wei
Cc: Tang Bin
Cc: Yinan Zhang
Signed-off-by: Andrew Morton
Signed-off-by: Stephen Rothwell
---
 tools/vm/page_owner_sort.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c
index 284a5070402c3..c8ec2d6b314de 100644
--- a/tools/vm/page_owner_sort.c
+++ b/tools/vm/page_owner_sort.c
@@ -227,8 +227,6 @@ static void add_list(char *buf, int len)
 	list[list_size].pid = get_pid(buf);
 	list[list_size].ts_nsec = get_ts_nsec(buf);
 	list[list_size].free_ts_nsec = get_free_ts_nsec(buf);
-	memcpy(list[list_size].txt, buf, len);
-	list[list_size].txt[len] = 0;
 	list_size++;
 	if (list_size % 1000 == 0) {
 		printf("loaded %d\r", list_size);

From a789c34f683222f29fba2b19ee7d9b30730fd655 Mon Sep 17 00:00:00 2001
From: Shenghong Han
Date: Thu, 27 Jan 2022 16:25:57 +1100
Subject: [PATCH 079/154] Documentation/vm/page_owner.rst: update the documentation

Update the documentation of ``page_owner``.
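For instance, the documented options can be combined; a hypothetical
invocation that shows only still-allocated blocks, sorted by total
memory:

	./page_owner_sort -m -f page_owner_full.txt sorted_page_owner.txt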
Link: https://lkml.kernel.org/r/20211214134736.2569-1-hanshenghong2019@email.szu.edu.cn Signed-off-by: Shenghong Han Cc: Jonathan Corbet Cc: Vlastimil Babka Cc: Georgi Djakov Cc: Liam Mark Cc: Tang Bin Cc: Zhang Shengju Cc: Zhenliang Wei Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- Documentation/vm/page_owner.rst | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/Documentation/vm/page_owner.rst b/Documentation/vm/page_owner.rst index 9837fc8147dd6..7a28e7b0d9c29 100644 --- a/Documentation/vm/page_owner.rst +++ b/Documentation/vm/page_owner.rst @@ -97,7 +97,7 @@ Usage The ``page_owner_sort`` tool ignores ``PFN`` rows, puts the remaining rows in buf, uses regexp to extract the page order value, counts the times - and pages of buf, and finally sorts them according to the times. + and pages of buf, and finally sorts them according to the parameter(s). See the result about who allocated each page in the ``sorted_page_owner.txt``. General output: @@ -108,3 +108,22 @@ Usage By default, ``page_owner_sort`` is sorted according to the times of buf. If you want to sort by the pages nums of buf, use the ``-m`` parameter. + The detail parameters are shown as follows: + + fundamental function: + + Sort: + -a Sort by memory allocate time. + -m Sort by total memory. + -p Sort by pid. + -r Sort by memory release time. + -s Sort by the stack trace. + -t Sort by times (default). + + additional function: + + Cull: + -c Cull by comparing stacktrace instead of total block. + + Filter: + -f Filter out the information of blocks whose memory has not been released. From 7188314addcc9389b5f26df86ccb706690be9eff Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 27 Jan 2022 16:25:58 +1100 Subject: [PATCH 080/154] documentation-vm-page_ownerrst-update-the-documentation-fix small grammatical tweaks Cc: Georgi Djakov Cc: Jonathan Corbet Cc: Liam Mark Cc: Shenghong Han Cc: Tang Bin Cc: Vlastimil Babka Cc: Xiaoming Ni Cc: Zhang Shengju Cc: Zhenliang Wei Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- Documentation/vm/page_owner.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/vm/page_owner.rst b/Documentation/vm/page_owner.rst index 7a28e7b0d9c29..602cf6eefcb5f 100644 --- a/Documentation/vm/page_owner.rst +++ b/Documentation/vm/page_owner.rst @@ -107,17 +107,17 @@ Usage // Detailed stack By default, ``page_owner_sort`` is sorted according to the times of buf. - If you want to sort by the pages nums of buf, use the ``-m`` parameter. - The detail parameters are shown as follows: + If you want to sort by the page nums of buf, use the ``-m`` parameter. + The detailed parameters are: fundamental function: Sort: - -a Sort by memory allocate time. + -a Sort by memory allocation time. -m Sort by total memory. -p Sort by pid. -r Sort by memory release time. - -s Sort by the stack trace. + -s Sort by stack trace. -t Sort by times (default). additional function: From f9b7a4a8d2c608d296e3efadc8d5947a020aa9d7 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Thu, 27 Jan 2022 16:25:58 +1100 Subject: [PATCH 081/154] Documentation/vm/page_owner.rst: fix unexpected indentation warns Fix Unexpected indentation warns in page_owner: Documentation/vm/page_owner.rst:92: WARNING: Unexpected indentation. Documentation/vm/page_owner.rst:96: WARNING: Unexpected indentation. Documentation/vm/page_owner.rst:107: WARNING: Unexpected indentation. 
Link: https://lkml.kernel.org/r/20211215001929.47866-1-skhan@linuxfoundation.org Signed-off-by: Shuah Khan Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- Documentation/vm/page_owner.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/vm/page_owner.rst b/Documentation/vm/page_owner.rst index 602cf6eefcb5f..2b54e82b9fe15 100644 --- a/Documentation/vm/page_owner.rst +++ b/Documentation/vm/page_owner.rst @@ -89,11 +89,11 @@ Usage Page allocated via order XXX, ... PFN XXX ... - // Detailed stack + // Detailed stack Page allocated via order XXX, ... PFN XXX ... - // Detailed stack + // Detailed stack The ``page_owner_sort`` tool ignores ``PFN`` rows, puts the remaining rows in buf, uses regexp to extract the page order value, counts the times @@ -104,7 +104,7 @@ Usage XXX times, XXX pages: Page allocated via order XXX, ... - // Detailed stack + // Detailed stack By default, ``page_owner_sort`` is sorted according to the times of buf. If you want to sort by the page nums of buf, use the ``-m`` parameter. From 1006c7b648e5c954b29d840249f92934ec6dbb3d Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 27 Jan 2022 16:25:58 +1100 Subject: [PATCH 082/154] mm: generalize ARCH_HAS_FILTER_PGPROT ARCH_HAS_FILTER_PGPROT config has duplicate definitions on platforms that subscribe it. Instead make it a generic config option which can be selected on applicable platforms when required. Link: https://lkml.kernel.org/r/1643004823-16441-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: Catalin Marinas Cc: Will Deacon Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/arm64/Kconfig | 4 +--- arch/x86/Kconfig | 3 --- mm/Kconfig | 3 +++ 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index beefec5c7b900..8306b51e84246 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -23,6 +23,7 @@ config ARM64 select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_FAST_MULTIPLIER + select ARCH_HAS_FILTER_PGPROT select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_GIGANTIC_PAGE @@ -1166,9 +1167,6 @@ config HW_PERF_EVENTS def_bool y depends on ARM_PMU -config ARCH_HAS_FILTER_PGPROT - def_bool y - # Supported by clang >= 7.0 config CC_HAVE_SHADOW_CALL_STACK def_bool $(cc-option, -fsanitize=shadow-call-stack -ffixed-x18) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ebe8fc76949af..5f33a10d3dcca 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -332,9 +332,6 @@ config GENERIC_CALIBRATE_DELAY config ARCH_HAS_CPU_RELAX def_bool y -config ARCH_HAS_FILTER_PGPROT - def_bool y - config ARCH_HIBERNATION_POSSIBLE def_bool y diff --git a/mm/Kconfig b/mm/Kconfig index 3326ee3903f33..257ed9c86de34 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -744,6 +744,9 @@ config IDLE_PAGE_TRACKING config ARCH_HAS_CACHE_LINE_SIZE bool +config ARCH_HAS_FILTER_PGPROT + bool + config ARCH_HAS_PTE_DEVMAP bool From 04c852a0bb33f3cd36d09b2e058ee38461fa2320 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 27 Jan 2022 16:25:58 +1100 Subject: [PATCH 083/154] mm/vmalloc: remove unneeded function forward declaration The forward declaration for lazy_max_pages() is unnecessary. Remove it. 
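A minimal sketch of the general rule (invented names, not the vmalloc
code): a forward declaration only earns its keep when the function is
referenced above its definition in the same file.

	static unsigned long foo_limit(void);	/* useful only if called before the definition */

	static unsigned long foo_limit(void)
	{
		return 32UL;
	}

	static void foo_purge(void)
	{
		/*
		 * The first caller sits below the definition, so the
		 * forward declaration above adds nothing and can go.
		 */
		unsigned long limit = foo_limit();

		(void)limit;
	}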
Link: https://lkml.kernel.org/r/20220124133752.60663-1-linmiaohe@huawei.com
Signed-off-by: Miaohe Lin
Signed-off-by: Andrew Morton
Signed-off-by: Stephen Rothwell
---
 mm/vmalloc.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 375b53fd939f1..0dedb770e2f47 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -792,7 +792,6 @@ RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
 
 static void purge_vmap_area_lazy(void);
 static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
-static unsigned long lazy_max_pages(void);
 
 static atomic_long_t nr_vmalloc_pages;

From 388f7912f1aad061d210679756f6765bd2505e29 Mon Sep 17 00:00:00 2001
From: Yury Norov
Date: Thu, 27 Jan 2022 16:25:59 +1100
Subject: [PATCH 084/154] mm/vmalloc.c: vmap(): don't allow invalid pages

vmap() takes struct page *pages as one of its arguments, and a user may
provide an invalid pointer, which would lead to a data abort at address
translation later.

Currently, the kernel checks the pages against NULL. In my case, however,
the address was not NULL, and was big enough so that the hardware
generated an Address Size Abort on arm64.

Interestingly, this abort happens even if copy_from_kernel_nofault() is
used, which is quite inconvenient for debugging purposes.

This patch adds a pfn_valid() check into the vmap() path, so that an
invalid mapping will not be created.

Link: https://lkml.kernel.org/r/20220119012109.551931-1-yury.norov@gmail.com
Signed-off-by: Yury Norov
Suggested-by: Matthew Wilcox (Oracle)
Cc: Catalin Marinas
Cc: Will Deacon
Cc: Nicholas Piggin
Cc: Ding Tianhong
Cc: Anshuman Khandual
Cc: Matthew Wilcox
Cc: Alexey Klimov
Cc: Uladzislau Rezki
Signed-off-by: Andrew Morton
Signed-off-by: Stephen Rothwell
---
 mm/vmalloc.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 0dedb770e2f47..028ce2b9bbf97 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -479,6 +479,8 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
 			return -EBUSY;
 		if (WARN_ON(!page))
 			return -ENOMEM;
+		if (WARN_ON(!pfn_valid(page_to_pfn(page))))
+			return -EINVAL;
 		set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
 		(*nr)++;
 	} while (pte++, addr += PAGE_SIZE, addr != end);

From e60dc10f0b4dc1e00529118174a8f1b9687f7b39 Mon Sep 17 00:00:00 2001
From: Manfred Spraul
Date: Thu, 27 Jan 2022 16:25:59 +1100
Subject: [PATCH 085/154] mm/util.c: make kvfree() safe for calling while holding spinlocks

One codepath in find_alloc_undo() calls kvfree() while holding a spinlock.
Since vfree() can sleep, this is a bug.

Previously, the code path used kfree(), and kfree() is safe to be called
while holding a spinlock.

Minghao proposed to fix this by updating find_alloc_undo().

Alternate proposal to fix this: instead of changing find_alloc_undo(),
change kvfree() so that the same rules as for kfree() apply. Having
different rules for kfree() and kvfree() just asks for bugs.

Disadvantage: releasing vmalloc'ed memory will be delayed a bit.
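With this change, a caller pattern like the following sketch (simplified
and with an invented lock, not the ipc/sem.c code) becomes legal for both
kmalloc'ed and vmalloc'ed pointers:

	spin_lock(&some_lock);
	...
	kvfree(ptr);	/* vmalloc'ed memory is now deferred via vfree_atomic() */
	spin_unlock(&some_lock);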
Link: https://lkml.kernel.org/r/20211222194828.15320-1-manfred@colorfullife.com
Link: https://lore.kernel.org/all/20211222081026.484058-1-chi.minghao@zte.com.cn/
Fixes: fc37a3b8b438 ("[PATCH] ipc sem: use kvmalloc for sem_undo allocation")
Signed-off-by: Manfred Spraul
Reported-by: Zeal Robot
Reported-by: Minghao Chi
Cc: Vasily Averin
Cc: CGEL ZTE
Cc: Shakeel Butt
Cc: Randy Dunlap
Cc: Davidlohr Bueso
Cc: Bhaskar Chowdhury
Cc: Arnd Bergmann
Cc: Uladzislau Rezki
Cc: Michal Hocko
Cc: <1vier1@web.de>
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Stephen Rothwell
---
 mm/util.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/util.c b/mm/util.c
index 7e43369064c86..75dfc98531a81 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -603,12 +603,12 @@ EXPORT_SYMBOL(kvmalloc_node);
  * It is slightly more efficient to use kfree() or vfree() if you are certain
  * that you know which one to use.
  *
- * Context: Either preemptible task context or not-NMI interrupt.
+ * Context: Any context except NMI interrupt.
  */
 void kvfree(const void *addr)
 {
 	if (is_vmalloc_addr(addr))
-		vfree(addr);
+		vfree_atomic(addr);
 	else
 		kfree(addr);
 }

From 4c3845df7b97fd71a33e06379cba6d5a46fd2196 Mon Sep 17 00:00:00 2001
From: Zi Yan
Date: Thu, 27 Jan 2022 16:25:59 +1100
Subject: [PATCH 086/154] mm: page_alloc: avoid merging non-fallbackable pageblocks with others

This is done in addition to MIGRATE_ISOLATE pageblock merge avoidance.
It prepares for the upcoming removal of the MAX_ORDER-1 alignment
requirement for CMA and alloc_contig_range().

MIGRATE_HIGHATOMIC should not merge with other migratetypes like
MIGRATE_ISOLATE and MIGRATE_CMA[1], so this commit prevents that too.

Remove MIGRATE_CMA and MIGRATE_ISOLATE from fallbacks list, since they
are never used.

[1] https://lore.kernel.org/linux-mm/20211130100853.GP3366@techsingularity.net/

Link: https://lkml.kernel.org/r/20220124175957.1261961-1-zi.yan@sent.com
Signed-off-by: Zi Yan
Acked-by: Mel Gorman
Acked-by: David Hildenbrand
Cc: Vlastimil Babka
Cc: Mike Rapoport
Signed-off-by: Andrew Morton
Signed-off-by: Stephen Rothwell
---
 include/linux/mmzone.h | 11 +++++++++++
 mm/page_alloc.c        | 44 ++++++++++++++++++++----------------------
 2 files changed, 32 insertions(+), 23 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index aed44e9b5d899..71b77aab748da 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -83,6 +83,17 @@ static inline bool is_migrate_movable(int mt)
 	return is_migrate_cma(mt) || mt == MIGRATE_MOVABLE;
 }
 
+/*
+ * Check whether a migratetype can be merged with another migratetype.
+ *
+ * It is only mergeable when it can fall back to other migratetypes for
+ * allocation. See fallbacks[MIGRATE_TYPES][3] in page_alloc.c.
+ */
+static inline bool migratetype_is_mergeable(int mt)
+{
+	return mt < MIGRATE_PCPTYPES;
+}
+
 #define for_each_migratetype_order(order, type) \
 	for (order = 0; order < MAX_ORDER; order++) \
 		for (type = 0; type < MIGRATE_TYPES; type++)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 368c6c5bf42a9..b72898d79e618 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1099,25 +1099,24 @@ static inline void __free_one_page(struct page *page,
 	}
 	if (order < MAX_ORDER - 1) {
 		/* If we are here, it means order is >= pageblock_order.
-		 * We want to prevent merge between freepages on isolate
-		 * pageblock and normal pageblock. Without this, pageblock
-		 * isolation could cause incorrect freepage or CMA accounting.
+		 * We want to prevent merge between freepages on pageblock
+		 * without fallbacks and normal pageblock. Without this,
+		 * pageblock isolation could cause incorrect freepage or CMA
+		 * accounting or HIGHATOMIC accounting.
 		 *
 		 * We don't want to hit this code for the more frequent
 		 * low-order merging.
 		 */
-		if (unlikely(has_isolate_pageblock(zone))) {
-			int buddy_mt;
+		int buddy_mt;
 
-			buddy_pfn = __find_buddy_pfn(pfn, order);
-			buddy = page + (buddy_pfn - pfn);
-			buddy_mt = get_pageblock_migratetype(buddy);
+		buddy_pfn = __find_buddy_pfn(pfn, order);
+		buddy = page + (buddy_pfn - pfn);
+		buddy_mt = get_pageblock_migratetype(buddy);
 
-			if (migratetype != buddy_mt
-					&& (is_migrate_isolate(migratetype) ||
-						is_migrate_isolate(buddy_mt)))
-				goto done_merging;
-		}
+		if (migratetype != buddy_mt
+				&& (!migratetype_is_mergeable(migratetype) ||
+					!migratetype_is_mergeable(buddy_mt)))
+			goto done_merging;
 		max_order = order + 1;
 		goto continue_merging;
 	}
@@ -2535,17 +2534,13 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
 /*
  * This array describes the order lists are fallen back to when
  * the free lists for the desirable migrate type are depleted
+ *
+ * The other migratetypes do not have fallbacks.
  */
 static int fallbacks[MIGRATE_TYPES][3] = {
 	[MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
 	[MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
 	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
-#ifdef CONFIG_CMA
-	[MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */
-#endif
-#ifdef CONFIG_MEMORY_ISOLATION
-	[MIGRATE_ISOLATE] = { MIGRATE_TYPES }, /* Never used */
-#endif
 };
 
 #ifdef CONFIG_CMA
@@ -2851,8 +2846,8 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
 
 	/* Yoink! */
 	mt = get_pageblock_migratetype(page);
-	if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt)
-	    && !is_migrate_cma(mt)) {
+	/* Only reserve normal pageblocks (i.e., they can merge with others) */
+	if (migratetype_is_mergeable(mt)) {
 		zone->nr_reserved_highatomic += pageblock_nr_pages;
 		set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
 		move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL);
@@ -3601,8 +3596,11 @@ int __isolate_free_page(struct page *page, unsigned int order)
 		struct page *endpage = page + (1 << order) - 1;
 		for (; page < endpage; page += pageblock_nr_pages) {
 			int mt = get_pageblock_migratetype(page);
-			if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)
-			    && !is_migrate_highatomic(mt))
+			/*
+			 * Only change normal pageblocks (i.e., they can merge
+			 * with others)
+			 */
+			if (migratetype_is_mergeable(mt))
 				set_pageblock_migratetype(page, MIGRATE_MOVABLE);
 		}

From c89f85d2b21a7248b247c249e85d2b66208e53ea Mon Sep 17 00:00:00 2001
From: Wei Yang
Date: Thu, 27 Jan 2022 16:25:59 +1100
Subject: [PATCH 087/154] mm/page_alloc: adding same penalty is enough to get round-robin order

To make the node order round-robin within the same distance group, we add
a penalty to the first node we get in each round.

To get a round-robin order in the same distance group, we don't need to
decrease the penalty since:

  * find_next_best_node() always iterates nodes in the same order
  * distance matters more than penalty in find_next_best_node()
  * in nodes with the same distance, the first one would be picked up

So it is fine to add the same penalty when we get the first node in the
same distance group.
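A toy reading of those three points (numbers invented): suppose nodes A
and B are equidistant and A is found first, picking up a penalty of
nr_online_nodes (say 4). On the next pass B compares as lighter, so B is
found first and picks up the same penalty, and the group is visited
round-robin. Since find_next_best_node() only asks which candidate
currently has the smaller load, adding 4 each time orders the group
exactly as the old decreasing 4, 3, 2, ... sequence did.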
Link: https://lkml.kernel.org/r/20220123013537.20491-1-richard.weiyang@gmail.com
Signed-off-by: Wei Yang
Cc: David Rientjes
Cc: KAMEZAWA Hiroyuki
Cc: Krupa Ramakrishnan
Cc: Michal Hocko
Signed-off-by: Andrew Morton
Signed-off-by: Stephen Rothwell
---
 mm/page_alloc.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b72898d79e618..ea5202e672774 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6319,13 +6319,12 @@ static void build_thisnode_zonelists(pg_data_t *pgdat)
 static void build_zonelists(pg_data_t *pgdat)
 {
 	static int node_order[MAX_NUMNODES];
-	int node, load, nr_nodes = 0;
+	int node, nr_nodes = 0;
 	nodemask_t used_mask = NODE_MASK_NONE;
 	int local_node, prev_node;
 
 	/* NUMA-aware ordering of nodes */
 	local_node = pgdat->node_id;
-	load = nr_online_nodes;
 	prev_node = local_node;
 
 	memset(node_order, 0, sizeof(node_order));
@@ -6337,11 +6336,10 @@ static void build_zonelists(pg_data_t *pgdat)
 		 */
 		if (node_distance(local_node, node) !=
 		    node_distance(local_node, prev_node))
-			node_load[node] += load;
+			node_load[node] += nr_online_nodes;
 
 		node_order[nr_nodes++] = node;
 		prev_node = node;
-		load--;
 	}
 
 	build_zonelists_in_node_order(pgdat, node_order, nr_nodes);

From 81a1143af90575c4a2a05508a54ea33042b34604 Mon Sep 17 00:00:00 2001
From: Wei Yang
Date: Thu, 27 Jan 2022 16:26:00 +1100
Subject: [PATCH 088/154] mm/page_alloc: add penalty to local_node

Commit 54d032ced983 ("mm/page_alloc: use accumulated load when building
node fallback list") fixed a bug on zonelist order. This made me think
about what would happen if we have a node system with the following
distance matrix.

  Node 0  1  2  3  4  5  6  7
  ----------------------------
  0    10 12 12 12 32 32 32 32
  1    12 10 12 12 32 32 32 32
  2    12 12 10 12 32 32 32 32
  3    12 12 12 10 32 32 32 32
  4    32 32 32 32 10 12 12 12
  5    32 32 32 32 12 10 12 12
  6    32 32 32 32 12 12 10 12
  7    32 32 32 32 12 12 12 10

Unfortunately for this case, the node fallback list gets built like this:

  Node  Fallback list
  ---------------------
  0:    0 1 2 3 4 5 6 7
  1:    1 0 2 3 5 6 7 4
  2:    2 3 0 1 6 7 4 5
  3:    3 2 0 1 7 4 5 6
  4:    4 5 6 7 0 1 2 3
  5:    5 4 6 7 1 2 3 0
  6:    6 7 4 5 2 3 0 1
  7:    7 6 4 5 3 0 1 2

We found the order in the diagonal blocks is not as expected. The reason
is that we don't penalize the local node. After penalizing the local
node, the node fallback list gets built like this:

  Node  Fallback list
  ---------------------
  0:    0 1 2 3 4 5 6 7
  1:    1 2 3 0 5 6 7 4
  2:    2 3 0 1 6 7 4 5
  3:    3 0 1 2 7 4 5 6
  4:    4 5 6 7 0 1 2 3
  5:    5 6 7 4 1 2 3 0
  6:    6 7 4 5 2 3 0 1
  7:    7 4 5 6 3 0 1 2

Now the fallback list is in round-robin order.

I am not very familiar with other node distance patterns, but I tried the
following distance matrices. Both of them work with this change.
Node 0 1 2 3 ---------------- 0 10 10 10 10 1 10 10 10 10 2 10 10 10 10 3 10 10 10 10 Node 0 1 2 3 4 5 6 7 ---------------------------- 0 10 10 10 10 32 32 32 32 1 10 10 10 10 32 32 32 32 2 10 10 10 10 32 32 32 32 3 10 10 10 10 32 32 32 32 4 32 32 32 32 10 10 10 10 5 32 32 32 32 10 10 10 10 6 32 32 32 32 10 10 10 10 7 32 32 32 32 10 10 10 10 Link: https://lkml.kernel.org/r/20220123013537.20491-2-richard.weiyang@gmail.com Signed-off-by: Wei Yang Cc: Krupa Ramakrishnan Cc: KAMEZAWA Hiroyuki Cc: Michal Hocko Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_alloc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ea5202e672774..229ea7f317325 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6334,8 +6334,9 @@ static void build_zonelists(pg_data_t *pgdat) * So adding penalty to the first node in same * distance group to make it round-robin. */ - if (node_distance(local_node, node) != - node_distance(local_node, prev_node)) + if ((node_distance(local_node, node) != + node_distance(local_node, prev_node)) || + node == local_node) node_load[node] += nr_online_nodes; node_order[nr_nodes++] = node; From 8e9ebb088abd942cb00e08ddd7199832af3da243 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Thu, 27 Jan 2022 16:26:00 +1100 Subject: [PATCH 089/154] mm/mmzone.c: use try_cmpxchg() in page_cpupid_xchg_last() This will let us avoid an additional read from page->flags when retrying the compare-exchange on some architectures. Link: https://lkml.kernel.org/r/20220120011200.1322836-1-pcc@google.com Link: https://linux-review.googlesource.com/id/I2e1f5b5b080ac9c4e0eb7f98768dba6fd7821693 Signed-off-by: Peter Collingbourne Suggested-by: Peter Zijlstra Cc: Andrey Konovalov Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/mmzone.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/mmzone.c b/mm/mmzone.c index eb89d6e018e29..d8a9b0e1b5267 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -89,13 +89,14 @@ int page_cpupid_xchg_last(struct page *page, int cpupid) unsigned long old_flags, flags; int last_cpupid; + old_flags = READ_ONCE(page->flags); do { - old_flags = flags = page->flags; - last_cpupid = page_cpupid_last(page); + flags = old_flags; + last_cpupid = (flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK; flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT); flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT; - } while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags)); + } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags))); return last_cpupid; } From 7a490acf65a9bf82185e95162f4e61102ac212cc Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 27 Jan 2022 16:26:00 +1100 Subject: [PATCH 090/154] mm: discard __GFP_ATOMIC __GFP_ATOMIC serves little purpose. Its main effect is to set ALLOC_HARDER which adds a few little boosts to increase the chance of an allocation succeeding, one of which is to lower the water-mark at which it will succeed. It is *always* paired with __GFP_HIGH which sets ALLOC_HIGH which also adjusts this watermark. It is probable that other users of __GFP_HIGH should benefit from the other little bonuses that __GFP_ATOMIC gets. __GFP_ATOMIC also gives a warning if used with __GFP_DIRECT_RECLAIM. There is little point to this. We already get a might_sleep() warning if __GFP_DIRECT_RECLAIM is set. __GFP_ATOMIC allows the "watermark_boost" to be side-stepped. 
It is probable that testing ALLOC_HARDER is a better fit here.

__GFP_ATOMIC is used by tegra-smmu.c to check if the allocation might
sleep. This should test __GFP_DIRECT_RECLAIM instead.

This patch:
 - removes __GFP_ATOMIC
 - causes __GFP_HIGH to set ALLOC_HARDER unless __GFP_NOMEMALLOC is set
   (as well as ALLOC_HIGH).
 - makes other adjustments as suggested by the above.

The net result is no change to GFP_ATOMIC allocations. Other allocations
that use __GFP_HIGH will benefit from a few different extra privileges.
This affects: xen, dm, md, ntfs3, the vermillion frame buffer,
hibernation, ksm, and swap, all of which likely produce more benefit than
cost if these selected allocations are more likely to succeed quickly.

Link: https://lkml.kernel.org/r/163712397076.13692.4727608274002939094@noble.neil.brown.name
Signed-off-by: NeilBrown
Reviewed-by: Matthew Wilcox (Oracle)
Cc: Michal Hocko
Cc: Thierry Reding
Cc: Mel Gorman
Signed-off-by: Andrew Morton
Signed-off-by: Stephen Rothwell
---
 Documentation/vm/balance.rst         |  2 +-
 drivers/iommu/tegra-smmu.c           |  4 ++--
 include/linux/gfp.h                  | 12 ++++--------
 include/trace/events/mmflags.h       |  1 -
 lib/test_printf.c                    |  8 ++++----
 mm/internal.h                        |  2 +-
 mm/memcontrol.c                      |  2 +-
 mm/page_alloc.c                      | 16 ++++------------
 tools/perf/builtin-kmem.c            |  1 -
 tools/testing/radix-tree/linux/gfp.h |  3 +--
 10 files changed, 18 insertions(+), 33 deletions(-)

diff --git a/Documentation/vm/balance.rst b/Documentation/vm/balance.rst
index 6a1fadf3e1735..e38e9d83c1c72 100644
--- a/Documentation/vm/balance.rst
+++ b/Documentation/vm/balance.rst
@@ -6,7 +6,7 @@ Memory Balancing
 
 Started Jan 2000 by Kanoj Sarcar
 
-Memory balancing is needed for !__GFP_ATOMIC and !__GFP_KSWAPD_RECLAIM as
+Memory balancing is needed for !__GFP_HIGH and !__GFP_KSWAPD_RECLAIM as
 well as for non __GFP_IO allocations.
 
 The first reason why a caller may avoid reclaim is that the caller can not
diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c
index e900e3c46903b..c5fa8b8673b6a 100644
--- a/drivers/iommu/tegra-smmu.c
+++ b/drivers/iommu/tegra-smmu.c
@@ -676,12 +676,12 @@ static struct page *as_get_pde_page(struct tegra_smmu_as *as,
 	 * allocate page in a sleeping context if GFP flags permit. Hence
 	 * spinlock needs to be unlocked and re-locked after allocation.
 	 */
-	if (!(gfp & __GFP_ATOMIC))
+	if (gfp & __GFP_DIRECT_RECLAIM)
 		spin_unlock_irqrestore(&as->lock, *flags);
 
 	page = alloc_page(gfp | __GFP_DMA | __GFP_ZERO);
 
-	if (!(gfp & __GFP_ATOMIC))
+	if (gfp & __GFP_DIRECT_RECLAIM)
 		spin_lock_irqsave(&as->lock, *flags);
 
 	/*
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 7797c915ce54c..6eef3e4475401 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -39,7 +39,7 @@ struct vm_area_struct;
 #define ___GFP_IO		0x40u
 #define ___GFP_FS		0x80u
 #define ___GFP_ZERO		0x100u
-#define ___GFP_ATOMIC		0x200u
+/* 0x200u unused */
 #define ___GFP_DIRECT_RECLAIM	0x400u
 #define ___GFP_KSWAPD_RECLAIM	0x800u
 #define ___GFP_WRITE		0x1000u
@@ -124,11 +124,8 @@ struct vm_area_struct;
  *
  * %__GFP_HIGH indicates that the caller is high-priority and that granting
  * the request is necessary before the system can make forward progress.
- * For example, creating an IO context to clean pages.
- *
- * %__GFP_ATOMIC indicates that the caller cannot reclaim or sleep and is
- * high priority. Users are typically interrupt handlers. This may be
- * used in conjunction with %__GFP_HIGH
+ * For example creating an IO context to clean pages and requests
+ * from atomic context.
 *
 * %__GFP_MEMALLOC allows access to all memory.
This should only be used when * the caller guarantees the allocation will allow more memory to be freed @@ -143,7 +140,6 @@ struct vm_area_struct; * %__GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves. * This takes precedence over the %__GFP_MEMALLOC flag if both are set. */ -#define __GFP_ATOMIC ((__force gfp_t)___GFP_ATOMIC) #define __GFP_HIGH ((__force gfp_t)___GFP_HIGH) #define __GFP_MEMALLOC ((__force gfp_t)___GFP_MEMALLOC) #define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) @@ -339,7 +335,7 @@ struct vm_area_struct; * version does not attempt reclaim/compaction at all and is by default used * in page fault path, while the non-light is used by khugepaged. */ -#define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM) +#define GFP_ATOMIC (__GFP_HIGH|__GFP_KSWAPD_RECLAIM) #define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS) #define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT) #define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM) diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 6532119a6bf1a..0698c5d0f1947 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -29,7 +29,6 @@ {(unsigned long)__GFP_HIGHMEM, "__GFP_HIGHMEM"}, \ {(unsigned long)GFP_DMA32, "GFP_DMA32"}, \ {(unsigned long)__GFP_HIGH, "__GFP_HIGH"}, \ - {(unsigned long)__GFP_ATOMIC, "__GFP_ATOMIC"}, \ {(unsigned long)__GFP_IO, "__GFP_IO"}, \ {(unsigned long)__GFP_FS, "__GFP_FS"}, \ {(unsigned long)__GFP_NOWARN, "__GFP_NOWARN"}, \ diff --git a/lib/test_printf.c b/lib/test_printf.c index 07309c45f3279..8010de49b6c5d 100644 --- a/lib/test_printf.c +++ b/lib/test_printf.c @@ -673,17 +673,17 @@ flags(void) gfp = GFP_ATOMIC|__GFP_DMA; test("GFP_ATOMIC|GFP_DMA", "%pGg", &gfp); - gfp = __GFP_ATOMIC; - test("__GFP_ATOMIC", "%pGg", &gfp); + gfp = __GFP_HIGH; + test("__GFP_HIGH", "%pGg", &gfp); /* Any flags not translated by the table should remain numeric */ gfp = ~__GFP_BITS_MASK; snprintf(cmp_buffer, BUF_SIZE, "%#lx", (unsigned long) gfp); test(cmp_buffer, "%pGg", &gfp); - snprintf(cmp_buffer, BUF_SIZE, "__GFP_ATOMIC|%#lx", + snprintf(cmp_buffer, BUF_SIZE, "__GFP_HIGH|%#lx", (unsigned long) gfp); - gfp |= __GFP_ATOMIC; + gfp |= __GFP_HIGH; test(cmp_buffer, "%pGg", &gfp); kfree(cmp_buffer); diff --git a/mm/internal.h b/mm/internal.h index d80300392a194..4c2d06a2f50b6 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -23,7 +23,7 @@ struct folio_batch; #define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ __GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\ __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\ - __GFP_ATOMIC|__GFP_NOLOCKDEP) + __GFP_NOLOCKDEP) /* The GFP flags allowed during early boot */ #define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS)) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 09d342c7cbd0d..d067366002e65 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2575,7 +2575,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, * put the burden of reclaim on regular allocation requests * and let these go through as privileged allocations. 
*/ - if (gfp_mask & __GFP_ATOMIC) + if (gfp_mask & __GFP_HIGH) goto force; /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 229ea7f317325..343724c57a2cf 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4002,12 +4002,12 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order, free_pages)) return true; /* - * Ignore watermark boosting for GFP_ATOMIC order-0 allocations + * Ignore watermark boosting for GFP_HIGH order-0 allocations * when checking the min watermark. The min watermark is the * point where boosting is ignored so that kswapd is woken up * when below the low watermark. */ - if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost + if (unlikely(!order && (alloc_flags & ALLOC_HARDER) && z->watermark_boost && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) { mark = z->_watermark[WMARK_MIN]; return __zone_watermark_ok(z, order, mark, highest_zoneidx, @@ -4735,12 +4735,12 @@ gfp_to_alloc_flags(gfp_t gfp_mask) * The caller may dip into page reserves a bit more if the caller * cannot run direct reclaim, or if the caller has realtime scheduling * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will - * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH). + * set both ALLOC_HARDER (unless __GFP_NOMEMALLOC) and ALLOC_HIGH. */ alloc_flags |= (__force int) (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM)); - if (gfp_mask & __GFP_ATOMIC) { + if (gfp_mask & __GFP_HIGH) { /* * Not worth trying to allocate harder for __GFP_NOMEMALLOC even * if it can't schedule. @@ -4933,14 +4933,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned int cpuset_mems_cookie; int reserve_flags; - /* - * We also sanity check to catch abuse of atomic reserves being used by - * callers that are not in atomic context. 
-	 */
-	if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
-				(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
-		gfp_mask &= ~__GFP_ATOMIC;
-
 retry_cpuset:
 	compaction_retries = 0;
 	no_progress_loops = 0;
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index 99d7ff9a8effe..e5b38d0b08fb5 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -640,7 +640,6 @@ static const struct {
 	{ "__GFP_HIGHMEM",	"HM" },
 	{ "GFP_DMA32",		"D32" },
 	{ "__GFP_HIGH",		"H" },
-	{ "__GFP_ATOMIC",	"_A" },
 	{ "__GFP_IO",		"I" },
 	{ "__GFP_FS",		"F" },
 	{ "__GFP_NOWARN",	"NWR" },
diff --git a/tools/testing/radix-tree/linux/gfp.h b/tools/testing/radix-tree/linux/gfp.h
index 32159c08a52e5..0a0741104dfeb 100644
--- a/tools/testing/radix-tree/linux/gfp.h
+++ b/tools/testing/radix-tree/linux/gfp.h
@@ -12,7 +12,6 @@
 #define __GFP_FS		0x80u
 #define __GFP_NOWARN		0x200u
 #define __GFP_ZERO		0x8000u
-#define __GFP_ATOMIC		0x80000u
 #define __GFP_ACCOUNT		0x100000u
 #define __GFP_DIRECT_RECLAIM	0x400000u
 #define __GFP_KSWAPD_RECLAIM	0x2000000u
@@ -20,7 +19,7 @@
 #define __GFP_RECLAIM (__GFP_DIRECT_RECLAIM|__GFP_KSWAPD_RECLAIM)
 
 #define GFP_ZONEMASK	0x0fu
-#define GFP_ATOMIC	(__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM)
+#define GFP_ATOMIC	(__GFP_HIGH|__GFP_KSWAPD_RECLAIM)
 #define GFP_KERNEL	(__GFP_RECLAIM | __GFP_IO | __GFP_FS)
 #define GFP_NOWAIT	(__GFP_KSWAPD_RECLAIM)

From 56dd086635a59ea88b4bf7624106ebfe9503075b Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi
Date: Thu, 27 Jan 2022 16:26:01 +1100
Subject: [PATCH 091/154] mm/memory-failure.c: remove obsolete comment

With the introduction of mf_mutex, most of the memory error handling
process is mutually exclusive, so the in-line comment about the subtlety
of double-checking PageHWPoison is no longer correct. Remove it.

Link: https://lkml.kernel.org/r/20220125025601.3054511-1-naoya.horiguchi@linux.dev
Signed-off-by: Naoya Horiguchi
Suggested-by: Mike Kravetz
Reviewed-by: Miaohe Lin
Signed-off-by: Andrew Morton
Signed-off-by: Stephen Rothwell
---
 mm/memory-failure.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 97a9ed8f87a96..0f6413a2f3016 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -2150,12 +2150,6 @@ static int __soft_offline_page(struct page *page)
 		.gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
 	};
 
-	/*
-	 * Check PageHWPoison again inside page lock because PageHWPoison
-	 * is set by memory_failure() outside page lock. Note that
-	 * memory_failure() also double-checks PageHWPoison inside page lock,
-	 * so there's no race between soft_offline_page() and memory_failure().
-	 */
 	lock_page(page);
 	if (!PageHuge(page))
 		wait_on_page_writeback(page);

From d6bf5843eaef738e0675c0339f43fcfc4b513965 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi
Date: Thu, 27 Jan 2022 16:26:01 +1100
Subject: [PATCH 092/154] mm/hwpoison: fix error page recovered but reported "not recovered"

When an uncorrected memory error is consumed there is a race between the
CMCI from the memory controller reporting an uncorrected error with a
UCNA signature, and the core reporting an SRAR signature machine check
when the data is about to be consumed.

If the CMCI wins that race, the page is marked poisoned when
uc_decode_notifier() calls memory_failure() and the machine check
processing code finds the page already poisoned. It calls
kill_accessing_process() to make sure a SIGBUS is sent, but it returns
the wrong error code.
Console log looks like this: [34775.674296] mce: Uncorrected hardware memory error in user-access at 3710b3400 [34775.675413] Memory failure: 0x3710b3: recovery action for dirty LRU page: Recovered [34775.690310] Memory failure: 0x3710b3: already hardware poisoned [34775.696247] Memory failure: 0x3710b3: Sending SIGBUS to einj_mem_uc:361438 due to hardware memory corruption [34775.706072] mce: Memory error not recovered kill_accessing_process() is supposed to return -EHWPOISON to notify that SIGBUS is already set to the process and kill_me_maybe() doesn't have to send it again. But current code simply fails to do this, so fix it to make sure to work as intended. This change avoids the noise message "Memory error not recovered" and skips duplicate SIGBUSs. [tony.luck@intel.com: reword some parts of commit message] Link: https://lkml.kernel.org/r/20220113231117.1021405-1-naoya.horiguchi@linux.dev Fixes: a3f5d80ea401 ("mm,hwpoison: send SIGBUS with error virutal address") Signed-off-by: Naoya Horiguchi Reported-by: Youquan Song Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memory-failure.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 0f6413a2f3016..2e2f740c63dc7 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -707,8 +707,10 @@ static int kill_accessing_process(struct task_struct *p, unsigned long pfn, (void *)&priv); if (ret == 1 && priv.tk.addr) kill_proc(&priv.tk, pfn, flags); + else + ret = 0; mmap_read_unlock(p->mm); - return ret ? -EFAULT : -EHWPOISON; + return ret > 0 ? -EHWPOISON : -EFAULT; } static const char *action_name[] = { From 99dac6b0a661f985c86dbb514036a84844ebff94 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 27 Jan 2022 16:26:01 +1100 Subject: [PATCH 093/154] mm: hugetlb: free the 2nd vmemmap page associated with each HugeTLB page Patch series "Free the 2nd vmemmap page associated with each HugeTLB page", v7. This series can minimize the overhead of struct page for 2MB HugeTLB pages significantly. It further reduces the overhead of struct page by 12.5% for a 2MB HugeTLB compared to the previous approach, which means 2GB per 1TB HugeTLB. It is a nice gain. Comments and reviews are welcome. Thanks. The main implementation and details can refer to the commit log of patch 1. In this series, I have changed the following four helpers, the following table shows the impact of the overhead of those helpers. +------------------+-----------------------+ | APIs | head page | tail page | +------------------+-----------+-----------+ | PageHead() | Y | N | +------------------+-----------+-----------+ | PageTail() | Y | N | +------------------+-----------+-----------+ | PageCompound() | N | N | +------------------+-----------+-----------+ | compound_head() | Y | N | +------------------+-----------+-----------+ Y: Overhead is increased. N: Overhead is _NOT_ increased. It shows that the overhead of those helpers on a tail page don't change between "hugetlb_free_vmemmap=on" and "hugetlb_free_vmemmap=off". But the overhead on a head page will be increased when "hugetlb_free_vmemmap=on" (except PageCompound()). So I believe that Matthew Wilcox's folio series will help with this. The users of PageHead() and PageTail() are much less than compound_head() and most users of PageTail() are VM_BUG_ON(), so I have done some tests about the overhead of compound_head() on head pages. 
I have tested the overhead of calling compound_head() on a head page, which is 2.11ns (Measure the call time of 10 million times compound_head(), and then average). For a head page whose address is not aligned with PAGE_SIZE or a non-compound page, the overhead of compound_head() is 2.54ns which is increased by 20%. For a head page whose address is aligned with PAGE_SIZE, the overhead of compound_head() is 2.97ns which is increased by 40%. Most pages are the former. I do not think the overhead is significant since the overhead of compound_head() itself is low. This patch (of 5): This patch minimizes the overhead of struct page for 2MB HugeTLB pages significantly. It further reduces the overhead of struct page by 12.5% for a 2MB HugeTLB compared to the previous approach, which means 2GB per 1TB HugeTLB (2MB type). After the feature of "Free sonme vmemmap pages of HugeTLB page" is enabled, the mapping of the vmemmap addresses associated with a 2MB HugeTLB page becomes the figure below. HugeTLB struct pages(8 pages) page frame(8 pages) +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+---> PG_head | | | 0 | -------------> | 0 | | | +-----------+ +-----------+ | | | 1 | -------------> | 1 | | | +-----------+ +-----------+ | | | 2 | ----------------^ ^ ^ ^ ^ ^ | | +-----------+ | | | | | | | | 3 | ------------------+ | | | | | | +-----------+ | | | | | | | 4 | --------------------+ | | | | 2MB | +-----------+ | | | | | | 5 | ----------------------+ | | | | +-----------+ | | | | | 6 | ------------------------+ | | | +-----------+ | | | | 7 | --------------------------+ | | +-----------+ | | | | | | +-----------+ As we can see, the 2nd vmemmap page frame (indexed by 1) is reused and remaped. However, the 2nd vmemmap page frame is also can be freed to the buddy allocator, then we can change the mapping from the figure above to the figure below. HugeTLB struct pages(8 pages) page frame(8 pages) +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+---> PG_head | | | 0 | -------------> | 0 | | | +-----------+ +-----------+ | | | 1 | ---------------^ ^ ^ ^ ^ ^ ^ | | +-----------+ | | | | | | | | | 2 | -----------------+ | | | | | | | +-----------+ | | | | | | | | 3 | -------------------+ | | | | | | +-----------+ | | | | | | | 4 | ---------------------+ | | | | 2MB | +-----------+ | | | | | | 5 | -----------------------+ | | | | +-----------+ | | | | | 6 | -------------------------+ | | | +-----------+ | | | | 7 | ---------------------------+ | | +-----------+ | | | | | | +-----------+ After we do this, all tail vmemmap pages (1-7) are mapped to the head vmemmap page frame (0). In other words, there are more than one page struct with PG_head associated with each HugeTLB page. We __know__ that there is only one head page struct, the tail page structs with PG_head are fake head page structs. We need an approach to distinguish between those two different types of page structs so that compound_head(), PageHead() and PageTail() can work properly if the parameter is the tail page struct but with PG_head. The following code snippet describes how to distinguish between real and fake head page struct. if (test_bit(PG_head, &page->flags)) { unsigned long head = READ_ONCE(page[1].compound_head); if (head & 1) { if (head == (unsigned long)page + 1) ==> head page struct else ==> tail page struct } else ==> head page struct } We can safely access the field of the @page[1] with PG_head because the @page is a compound page composed with at least two contiguous pages. 
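As a quick arithmetic check on those numbers (assuming 4KB base pages
and 64-byte struct pages): a 2MB HugeTLB page has 512 struct pages =
32KB = 8 base pages of vmemmap. Freeing one more of those pages is
4KB / 32KB = 12.5% of the struct page overhead, and across 1TB of 2MB
HugeTLB pages it is (1TB / 2MB) * 4KB = 524288 * 4KB = 2GB, matching the
savings quoted above.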
Link: https://lkml.kernel.org/r/20211101031651.75851-1-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20211101031651.75851-2-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Barry Song Cc: Mike Kravetz Cc: Oscar Salvador Cc: Michal Hocko Cc: David Hildenbrand Cc: Chen Huang Cc: Bodeddula Balasubramaniam Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Xiongchun Duan Cc: Fam Zheng Cc: Qi Zheng Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- .../admin-guide/kernel-parameters.txt | 2 +- include/linux/page-flags.h | 78 ++++++++++++++++++- mm/hugetlb_vmemmap.c | 62 ++++++++------- mm/sparse-vmemmap.c | 21 +++++ 4 files changed, 130 insertions(+), 33 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f5a27f067db9e..85f096fddad9d 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1625,7 +1625,7 @@ [KNL] Reguires CONFIG_HUGETLB_PAGE_FREE_VMEMMAP enabled. Allows heavy hugetlb users to free up some more - memory (6 * PAGE_SIZE for each 2MB hugetlb page). + memory (7 * PAGE_SIZE for each 2MB hugetlb page). Format: { on | off (default) } on: enable the feature diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 1c3b6e5c8bfd3..111e453f23d22 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -190,13 +190,69 @@ enum pageflags { #ifndef __GENERATING_BOUNDS_H +#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP +extern bool hugetlb_free_vmemmap_enabled; + +/* + * If the feature of freeing some vmemmap pages associated with each HugeTLB + * page is enabled, the head vmemmap page frame is reused and all of the tail + * vmemmap addresses map to the head vmemmap page frame (furture details can + * refer to the figure at the head of the mm/hugetlb_vmemmap.c). In other + * words, there are more than one page struct with PG_head associated with each + * HugeTLB page. We __know__ that there is only one head page struct, the tail + * page structs with PG_head are fake head page structs. We need an approach + * to distinguish between those two different types of page structs so that + * compound_head() can return the real head page struct when the parameter is + * the tail page struct but with PG_head. + * + * The page_fixed_fake_head() returns the real head page struct if the @page is + * fake page head, otherwise, returns @page which can either be a true page + * head or tail. + */ +static __always_inline const struct page *page_fixed_fake_head(const struct page *page) +{ + if (!hugetlb_free_vmemmap_enabled) + return page; + + /* + * Only addresses aligned with PAGE_SIZE of struct page may be fake head + * struct page. The alignment check aims to avoid access the fields ( + * e.g. compound_head) of the @page[1]. It can avoid touch a (possibly) + * cold cacheline in some cases. + */ + if (IS_ALIGNED((unsigned long)page, PAGE_SIZE) && + test_bit(PG_head, &page->flags)) { + /* + * We can safely access the field of the @page[1] with PG_head + * because the @page is a compound page composed with at least + * two contiguous pages. 
+ */ + unsigned long head = READ_ONCE(page[1].compound_head); + + if (likely(head & 1)) + return (const struct page *)(head - 1); + } + return page; +} +#else +static inline const struct page *page_fixed_fake_head(const struct page *page) +{ + return page; +} +#endif + +static __always_inline int page_is_fake_head(struct page *page) +{ + return page_fixed_fake_head(page) != page; +} + static inline unsigned long _compound_head(const struct page *page) { unsigned long head = READ_ONCE(page->compound_head); if (unlikely(head & 1)) return head - 1; - return (unsigned long)page; + return (unsigned long)page_fixed_fake_head(page); } #define compound_head(page) ((typeof(page))_compound_head(page)) @@ -231,12 +287,13 @@ static inline unsigned long _compound_head(const struct page *page) static __always_inline int PageTail(struct page *page) { - return READ_ONCE(page->compound_head) & 1; + return READ_ONCE(page->compound_head) & 1 || page_is_fake_head(page); } static __always_inline int PageCompound(struct page *page) { - return test_bit(PG_head, &page->flags) || PageTail(page); + return test_bit(PG_head, &page->flags) || + READ_ONCE(page->compound_head) & 1; } #define PAGE_POISON_PATTERN -1l @@ -695,7 +752,20 @@ static inline bool test_set_page_writeback(struct page *page) return set_page_writeback(page); } -__PAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY) +static __always_inline bool folio_test_head(struct folio *folio) +{ + return test_bit(PG_head, folio_flags(folio, FOLIO_PF_ANY)); +} + +static __always_inline int PageHead(struct page *page) +{ + PF_POISONED_CHECK(page); + return test_bit(PG_head, &page->flags) && !page_is_fake_head(page); +} + +__SETPAGEFLAG(Head, head, PF_ANY) +__CLEARPAGEFLAG(Head, head, PF_ANY) +CLEARPAGEFLAG(Head, head, PF_ANY) /** * folio_test_large() - Does this folio contain more than one page? diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index c540c21e26f5b..4977f5a520c22 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -124,9 +124,9 @@ * page of page structs (page 0) associated with the HugeTLB page contains the 4 * page structs necessary to describe the HugeTLB. The only use of the remaining * pages of page structs (page 1 to page 7) is to point to page->compound_head. - * Therefore, we can remap pages 2 to 7 to page 1. Only 2 pages of page structs + * Therefore, we can remap pages 1 to 7 to page 0. Only 1 page of page structs * will be used for each HugeTLB page. This will allow us to free the remaining - * 6 pages to the buddy allocator. + * 7 pages to the buddy allocator. * * Here is how things look after remapping. 
* @@ -134,30 +134,30 @@ * +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+ * | | | 0 | -------------> | 0 | * | | +-----------+ +-----------+ - * | | | 1 | -------------> | 1 | - * | | +-----------+ +-----------+ - * | | | 2 | ----------------^ ^ ^ ^ ^ ^ - * | | +-----------+ | | | | | - * | | | 3 | ------------------+ | | | | - * | | +-----------+ | | | | - * | | | 4 | --------------------+ | | | - * | PMD | +-----------+ | | | - * | level | | 5 | ----------------------+ | | - * | mapping | +-----------+ | | - * | | | 6 | ------------------------+ | - * | | +-----------+ | - * | | | 7 | --------------------------+ + * | | | 1 | ---------------^ ^ ^ ^ ^ ^ ^ + * | | +-----------+ | | | | | | + * | | | 2 | -----------------+ | | | | | + * | | +-----------+ | | | | | + * | | | 3 | -------------------+ | | | | + * | | +-----------+ | | | | + * | | | 4 | ---------------------+ | | | + * | PMD | +-----------+ | | | + * | level | | 5 | -----------------------+ | | + * | mapping | +-----------+ | | + * | | | 6 | -------------------------+ | + * | | +-----------+ | + * | | | 7 | ---------------------------+ * | | +-----------+ * | | * | | * | | * +-----------+ * - * When a HugeTLB is freed to the buddy system, we should allocate 6 pages for + * When a HugeTLB is freed to the buddy system, we should allocate 7 pages for * vmemmap pages and restore the previous mapping relationship. * * For the HugeTLB page of the pud level mapping. It is similar to the former. - * We also can use this approach to free (PAGE_SIZE - 2) vmemmap pages. + * We also can use this approach to free (PAGE_SIZE - 1) vmemmap pages. * * Apart from the HugeTLB page of the pmd/pud level mapping, some architectures * (e.g. aarch64) provides a contiguous bit in the translation table entries @@ -166,7 +166,13 @@ * * The contiguous bit is used to increase the mapping size at the pmd and pte * (last) level. So this type of HugeTLB page can be optimized only when its - * size of the struct page structs is greater than 2 pages. + * size of the struct page structs is greater than 1 page. + * + * Notice: The head vmemmap page is not freed to the buddy allocator and all + * tail vmemmap pages are mapped to the head vmemmap page frame. So we can see + * more than one struct page struct with PG_head (e.g. 8 per 2 MB HugeTLB page) + * associated with each HugeTLB page. The compound_head() can handle this + * correctly (more details refer to the comment above compound_head()). */ #define pr_fmt(fmt) "HugeTLB: " fmt @@ -175,19 +181,21 @@ /* * There are a lot of struct page structures associated with each HugeTLB page. * For tail pages, the value of compound_head is the same. So we can reuse first - * page of tail page structures. We map the virtual addresses of the remaining - * pages of tail page structures to the first tail page struct, and then free - * these page frames. Therefore, we need to reserve two pages as vmemmap areas. + * page of head page structures. We map the virtual addresses of all the pages + * of tail page structures to the head page struct, and then free these page + * frames. Therefore, we need to reserve one pages as vmemmap areas. 
 */
-#define RESERVE_VMEMMAP_NR		2U
+#define RESERVE_VMEMMAP_NR		1U
 #define RESERVE_VMEMMAP_SIZE		(RESERVE_VMEMMAP_NR << PAGE_SHIFT)

-bool hugetlb_free_vmemmap_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON);
+bool hugetlb_free_vmemmap_enabled __read_mostly =
+	IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON);
+EXPORT_SYMBOL(hugetlb_free_vmemmap_enabled);

 static int __init early_hugetlb_free_vmemmap_param(char *buf)
 {
 	/* We cannot optimize if a "struct page" crosses page boundaries. */
-	if ((!is_power_of_2(sizeof(struct page)))) {
+	if (!is_power_of_2(sizeof(struct page))) {
 		pr_warn("cannot free vmemmap pages because \"struct page\" crosses page boundaries\n");
 		return 0;
 	}
@@ -236,7 +244,6 @@ int alloc_huge_page_vmemmap(struct hstate *h, struct page *head)
 	 */
 	ret = vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse,
 				  GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE);
-
 	if (!ret)
 		ClearHPageVmemmapOptimized(head);
@@ -282,9 +289,8 @@ void __init hugetlb_vmemmap_init(struct hstate *h)
 	vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT;
 	/*
-	 * The head page and the first tail page are not to be freed to buddy
-	 * allocator, the other pages will map to the first tail page, so they
-	 * can be freed.
+	 * The head page is not to be freed to buddy allocator, the other tail
+	 * pages will map to the head page, so they can be freed.
 	 *
 	 * Could RESERVE_VMEMMAP_NR be greater than @vmemmap_pages? It is true
 	 * on some architectures (e.g. aarch64). See Documentation/arm64/
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index db6df27c852a7..e881f5db70915 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -245,6 +245,26 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
 	set_pte_at(&init_mm, addr, pte, entry);
 }

+/*
+ * How many struct page structs need to be reset.  When we reuse the head
+ * struct page, the special metadata (e.g. page->flags or page->mapping)
+ * cannot be copied to the tail struct page structs.  The invalid value will
+ * be checked in free_tail_pages_check().  In order to avoid the message of
+ * "corrupted mapping in tail page", we need to reset at least 3 struct page
+ * structs (one head struct page struct and two tail struct page structs).
+ */
+#define NR_RESET_STRUCT_PAGE	3
+
+static inline void reset_struct_pages(struct page *start)
+{
+	int i;
+	struct page *from = start + NR_RESET_STRUCT_PAGE;
+
+	for (i = 0; i < NR_RESET_STRUCT_PAGE; i++)
+		memcpy(start + i, from, sizeof(*from));
+}
+
 static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
 				struct vmemmap_remap_walk *walk)
@@ -258,6 +278,7 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
 	list_del(&page->lru);
 	to = page_to_virt(page);
 	copy_page(to, (void *)walk->reuse_addr);
+	reset_struct_pages(to);

 	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
 }

From d513cc29f6ace6c8f639c1a80a460ecddf447a75 Mon Sep 17 00:00:00 2001
From: Muchun Song
Date: Thu, 27 Jan 2022 16:26:01 +1100
Subject: [PATCH 094/154] mm: hugetlb: replace hugetlb_free_vmemmap_enabled with a static_key

The page_fixed_fake_head() is used throughout memory management, and the conditional check requires reading a global variable.  Although the overhead of this check may be small, it increases when the memory cache comes under pressure.  Also, the global variable will not be modified after system boot, so it is very appropriate to use the static key mechanism.
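For context, the generic pattern being adopted looks roughly like this (a
sketch of the jump-label API in its simplest form; the patch itself uses the
*_MAYBE variants so the default state can follow
CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON):

	/* Key defaults to false; the disabled path costs a patched nop. */
	DEFINE_STATIC_KEY_FALSE(feature_key);

	static int __init feature_param(char *buf)
	{
		if (buf && !strcmp(buf, "on"))
			static_branch_enable(&feature_key); /* rewrites branch sites */
		return 0;
	}

	static inline bool feature_enabled(void)
	{
		/* Compiles to a nop (or jump), not a load plus compare. */
		return static_branch_unlikely(&feature_key);
	}

Unlike reading a global bool, the enabled check performs no memory access at
all once the branch sites are patched, which is why static keys suit flags
that never change after boot.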
Link: https://lkml.kernel.org/r/20211101031651.75851-3-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Barry Song Cc: Bodeddula Balasubramaniam Cc: Chen Huang Cc: David Hildenbrand Cc: Fam Zheng Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Qi Zheng Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/hugetlb.h | 6 ------ include/linux/page-flags.h | 16 ++++++++++++++-- mm/hugetlb_vmemmap.c | 12 ++++++------ mm/memory_hotplug.c | 2 +- 4 files changed, 21 insertions(+), 15 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index d1897a69c5406..a25312274ec03 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -1075,12 +1075,6 @@ static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr } #endif /* CONFIG_HUGETLB_PAGE */ -#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP -extern bool hugetlb_free_vmemmap_enabled; -#else -#define hugetlb_free_vmemmap_enabled false -#endif - static inline spinlock_t *huge_pte_lock(struct hstate *h, struct mm_struct *mm, pte_t *pte) { diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 111e453f23d22..340cb81565683 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -191,7 +191,14 @@ enum pageflags { #ifndef __GENERATING_BOUNDS_H #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP -extern bool hugetlb_free_vmemmap_enabled; +DECLARE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, + hugetlb_free_vmemmap_enabled_key); + +static __always_inline bool hugetlb_free_vmemmap_enabled(void) +{ + return static_branch_maybe(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, + &hugetlb_free_vmemmap_enabled_key); +} /* * If the feature of freeing some vmemmap pages associated with each HugeTLB @@ -211,7 +218,7 @@ extern bool hugetlb_free_vmemmap_enabled; */ static __always_inline const struct page *page_fixed_fake_head(const struct page *page) { - if (!hugetlb_free_vmemmap_enabled) + if (!hugetlb_free_vmemmap_enabled()) return page; /* @@ -239,6 +246,11 @@ static inline const struct page *page_fixed_fake_head(const struct page *page) { return page; } + +static inline bool hugetlb_free_vmemmap_enabled(void) +{ + return false; +} #endif static __always_inline int page_is_fake_head(struct page *page) diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 4977f5a520c22..791626983c2e1 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -188,9 +188,9 @@ #define RESERVE_VMEMMAP_NR 1U #define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT) -bool hugetlb_free_vmemmap_enabled __read_mostly = - IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON); -EXPORT_SYMBOL(hugetlb_free_vmemmap_enabled); +DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, + hugetlb_free_vmemmap_enabled_key); +EXPORT_SYMBOL(hugetlb_free_vmemmap_enabled_key); static int __init early_hugetlb_free_vmemmap_param(char *buf) { @@ -204,9 +204,9 @@ static int __init early_hugetlb_free_vmemmap_param(char *buf) return -EINVAL; if (!strcmp(buf, "on")) - hugetlb_free_vmemmap_enabled = true; + static_branch_enable(&hugetlb_free_vmemmap_enabled_key); else if (!strcmp(buf, "off")) - hugetlb_free_vmemmap_enabled = false; + static_branch_disable(&hugetlb_free_vmemmap_enabled_key); else return -EINVAL; @@ -284,7 +284,7 @@ void __init hugetlb_vmemmap_init(struct hstate *h) BUILD_BUG_ON(__NR_USED_SUBPAGE >= RESERVE_VMEMMAP_SIZE / sizeof(struct page)); - if 
(!hugetlb_free_vmemmap_enabled)
+	if (!hugetlb_free_vmemmap_enabled())
 		return;

 	vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 2a9627dc784c3..0139b77c51d5d 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1327,7 +1327,7 @@ bool mhp_supports_memmap_on_memory(unsigned long size)
 	 * populate a single PMD.
 	 */
 	return memmap_on_memory &&
-	       !hugetlb_free_vmemmap_enabled &&
+	       !hugetlb_free_vmemmap_enabled() &&
 	       IS_ENABLED(CONFIG_MHP_MEMMAP_ON_MEMORY) &&
 	       size == memory_block_size_bytes() &&
 	       IS_ALIGNED(vmemmap_size, PMD_SIZE) &&

From 9fef63ef5ac1311dfb0edc090f6e561af10f4948 Mon Sep 17 00:00:00 2001
From: Muchun Song
Date: Thu, 27 Jan 2022 16:26:02 +1100
Subject: [PATCH 095/154] mm: sparsemem: use page table lock to protect kernel pmd operations

The init_mm.page_table_lock is used to protect kernel page tables, so we can use it to serialize splitting vmemmap PMD mappings instead of the mmap write lock, which can increase the concurrency of vmemmap_remap_free().  In practice, it increases the concurrency between allocations of HugeTLB pages.  But that is not the only benefit.  There are a lot of users of the mmap read lock of init_mm.  The mmap write lock used to be held throughout vmemmap_remap_free(); removing that usage ensures it does not affect other users of the mmap read lock.  It does not make anything worse and is always a win.

Now the kernel page table walker does not hold the page_table_lock when walking pmd entries, so there may be a consistency issue for a pmd entry, because a pmd entry might change from a huge pmd entry to a PTE page table.  There is only one user of the kernel page table walker, namely ptdump.  The ptdump already handles this by using a local variable to cache the value of the pmd entry.  But we also need to update ->action to ACTION_CONTINUE to make sure the walker does not walk every pte entry again when a concurrent thread has split the huge pmd.
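In simplified form, the resulting callback pattern is (a sketch condensed from
the ptdump hunks below; the real callbacks also handle effective_prot() and
the other page table levels):

	static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr,
				    unsigned long next, struct mm_walk *walk)
	{
		struct ptdump_state *st = walk->private;
		pmd_t val = READ_ONCE(*pmd);	/* snapshot once; a concurrent
						 * split can turn *pmd into a
						 * PTE page table under us */

		if (pmd_leaf(val)) {
			st->note_page(st, addr, 3, pmd_val(val));
			/* Skip the pte level: do not descend into a PTE table
			 * that only appeared after the snapshot above. */
			walk->action = ACTION_CONTINUE;
		}
		return 0;
	}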
Link: https://lkml.kernel.org/r/20211101031651.75851-4-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: Barry Song Cc: Bodeddula Balasubramaniam Cc: Chen Huang Cc: David Hildenbrand Cc: Fam Zheng Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Qi Zheng Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/ptdump.c | 16 +++++++++++---- mm/sparse-vmemmap.c | 47 ++++++++++++++++++++++++++++++--------------- 2 files changed, 43 insertions(+), 20 deletions(-) diff --git a/mm/ptdump.c b/mm/ptdump.c index da751448d0e4e..eea3d28d173c2 100644 --- a/mm/ptdump.c +++ b/mm/ptdump.c @@ -40,8 +40,10 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr, if (st->effective_prot) st->effective_prot(st, 0, pgd_val(val)); - if (pgd_leaf(val)) + if (pgd_leaf(val)) { st->note_page(st, addr, 0, pgd_val(val)); + walk->action = ACTION_CONTINUE; + } return 0; } @@ -61,8 +63,10 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr, if (st->effective_prot) st->effective_prot(st, 1, p4d_val(val)); - if (p4d_leaf(val)) + if (p4d_leaf(val)) { st->note_page(st, addr, 1, p4d_val(val)); + walk->action = ACTION_CONTINUE; + } return 0; } @@ -82,8 +86,10 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr, if (st->effective_prot) st->effective_prot(st, 2, pud_val(val)); - if (pud_leaf(val)) + if (pud_leaf(val)) { st->note_page(st, addr, 2, pud_val(val)); + walk->action = ACTION_CONTINUE; + } return 0; } @@ -101,8 +107,10 @@ static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr, if (st->effective_prot) st->effective_prot(st, 3, pmd_val(val)); - if (pmd_leaf(val)) + if (pmd_leaf(val)) { st->note_page(st, addr, 3, pmd_val(val)); + walk->action = ACTION_CONTINUE; + } return 0; } diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index e881f5db70915..c64d1aa3c4b50 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -53,8 +53,7 @@ struct vmemmap_remap_walk { struct list_head *vmemmap_pages; }; -static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, - struct vmemmap_remap_walk *walk) +static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) { pmd_t __pmd; int i; @@ -76,15 +75,34 @@ static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, set_pte_at(&init_mm, addr, pte, entry); } - /* Make pte visible before pmd. See comment in pmd_install(). */ - smp_wmb(); - pmd_populate_kernel(&init_mm, pmd, pgtable); - - flush_tlb_kernel_range(start, start + PMD_SIZE); + spin_lock(&init_mm.page_table_lock); + if (likely(pmd_leaf(*pmd))) { + /* Make pte visible before pmd. See comment in pmd_install(). 
*/
+		smp_wmb();
+		pmd_populate_kernel(&init_mm, pmd, pgtable);
+		flush_tlb_kernel_range(start, start + PMD_SIZE);
+	} else {
+		pte_free_kernel(&init_mm, pgtable);
+	}
+	spin_unlock(&init_mm.page_table_lock);

 	return 0;
 }

+static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
+{
+	int leaf;
+
+	spin_lock(&init_mm.page_table_lock);
+	leaf = pmd_leaf(*pmd);
+	spin_unlock(&init_mm.page_table_lock);
+
+	if (!leaf)
+		return 0;
+
+	return __split_vmemmap_huge_pmd(pmd, start);
+}
+
 static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
 			      unsigned long end,
 			      struct vmemmap_remap_walk *walk)
@@ -121,13 +139,12 @@ static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
 	pmd = pmd_offset(pud, addr);
 	do {
-		if (pmd_leaf(*pmd)) {
-			int ret;
+		int ret;
+
+		ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK);
+		if (ret)
+			return ret;

-			ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK, walk);
-			if (ret)
-				return ret;
-		}
 		next = pmd_addr_end(addr, end);
 		vmemmap_pte_range(pmd, addr, next, walk);
 	} while (pmd++, addr = next, addr != end);
@@ -321,10 +338,8 @@ int vmemmap_remap_free(unsigned long start, unsigned long end,
 	 */
 	BUG_ON(start - reuse != PAGE_SIZE);

-	mmap_write_lock(&init_mm);
+	mmap_read_lock(&init_mm);
 	ret = vmemmap_remap_range(reuse, end, &walk);
-	mmap_write_downgrade(&init_mm);
-
 	if (ret && walk.nr_walked) {
 		end = reuse + walk.nr_walked * PAGE_SIZE;
 		/*

From 695b2a044ec4f4afadd43139cfa19e301f5001c0 Mon Sep 17 00:00:00 2001
From: Muchun Song
Date: Thu, 27 Jan 2022 16:26:02 +1100
Subject: [PATCH 096/154] selftests: vm: add a hugetlb test case

Since the head vmemmap page frame associated with each HugeTLB page is reused, we should hide the PG_head flag of tail struct pages from the user.  Add a test case to check whether they work properly.  The test steps are as follows.

  1) alloc 2MB hugeTLB
  2) get each page frame
  3) apply those APIs on each page frame
  4) verify that those APIs work exactly the same as before

Reading the flags of a page via /proc/kpageflags is done in stable_page_flags(), which invokes PageHead(), PageTail(), PageCompound() and compound_head().  If those APIs work properly, the head page must have bits 15 and 17 set, and tail pages must have bits 16 and 17 set but bit 15 unset.  Those flags are checked in check_page_flags().
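For reference, one plausible way to build and run the test standalone is shown
below (illustrative; requires root, since /proc/kpageflags is root-only, and
at least one free default-sized 2MB hugepage).  Bits 15, 16 and 17 correspond
to KPF_COMPOUND_HEAD, KPF_COMPOUND_TAIL and KPF_HUGE in the /proc/kpageflags
encoding documented in Documentation/admin-guide/mm/pagemap.rst:

	# make -C tools/testing/selftests/vm
	# echo 8 > /proc/sys/vm/nr_hugepages
	# ./tools/testing/selftests/vm/hugepage-vmemmap

Booting with hugetlb_free_vmemmap=on exercises the fake-head path; the flags
checks are expected to pass whether or not the feature is enabled.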
Link: https://lkml.kernel.org/r/20211101031651.75851-5-songmuchun@bytedance.com
Signed-off-by: Muchun Song
Reviewed-by: Barry Song
Cc: Bodeddula Balasubramaniam
Cc: Chen Huang
Cc: David Hildenbrand
Cc: Fam Zheng
Cc: Jonathan Corbet
Cc: Matthew Wilcox
Cc: Michal Hocko
Cc: Mike Kravetz
Cc: Oscar Salvador
Cc: Qi Zheng
Cc: Xiongchun Duan
Signed-off-by: Andrew Morton
Signed-off-by: Stephen Rothwell
---
 tools/testing/selftests/vm/.gitignore         |   1 +
 tools/testing/selftests/vm/Makefile           |   1 +
 tools/testing/selftests/vm/hugepage-vmemmap.c | 144 ++++++++++++++++++
 tools/testing/selftests/vm/run_vmtests.sh     |  11 ++
 4 files changed, 157 insertions(+)
 create mode 100644 tools/testing/selftests/vm/hugepage-vmemmap.c

diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore
index 2e7e86e852828..3b5faec3c04f4 100644
--- a/tools/testing/selftests/vm/.gitignore
+++ b/tools/testing/selftests/vm/.gitignore
@@ -2,6 +2,7 @@
 hugepage-mmap
 hugepage-mremap
 hugepage-shm
+hugepage-vmemmap
 khugepaged
 map_hugetlb
 map_populate
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index 1607322a112c9..7d100a7dc4624 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -31,6 +31,7 @@ TEST_GEN_FILES += hmm-tests
 TEST_GEN_FILES += hugepage-mmap
 TEST_GEN_FILES += hugepage-mremap
 TEST_GEN_FILES += hugepage-shm
+TEST_GEN_FILES += hugepage-vmemmap
 TEST_GEN_FILES += khugepaged
 TEST_GEN_FILES += madv_populate
 TEST_GEN_FILES += map_fixed_noreplace
diff --git a/tools/testing/selftests/vm/hugepage-vmemmap.c b/tools/testing/selftests/vm/hugepage-vmemmap.c
new file mode 100644
index 0000000000000..557bdbd4f87e8
--- /dev/null
+++ b/tools/testing/selftests/vm/hugepage-vmemmap.c
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A test case of using hugepage memory in a user application using the
+ * mmap system call with MAP_HUGETLB flag.  Before running this program
+ * make sure the administrator has allocated enough default sized huge
+ * pages to cover the 2 MB allocation.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+
+#define MAP_LENGTH		(2UL * 1024 * 1024)
+
+#ifndef MAP_HUGETLB
+#define MAP_HUGETLB		0x40000	/* arch specific */
+#endif
+
+#define PAGE_SIZE		4096
+
+#define PAGE_COMPOUND_HEAD	(1UL << 15)
+#define PAGE_COMPOUND_TAIL	(1UL << 16)
+#define PAGE_HUGE		(1UL << 17)
+
+#define HEAD_PAGE_FLAGS		(PAGE_COMPOUND_HEAD | PAGE_HUGE)
+#define TAIL_PAGE_FLAGS		(PAGE_COMPOUND_TAIL | PAGE_HUGE)
+
+#define PM_PFRAME_BITS		55
+#define PM_PFRAME_MASK		~((1UL << PM_PFRAME_BITS) - 1)
+
+/*
+ * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages.
+ * That means the addresses starting with 0x800000... will need to be
+ * specified.  Specifying a fixed address is not required on ppc64, i386
+ * or x86_64.
+ */
+#ifdef __ia64__
+#define MAP_ADDR		(void *)(0x8000000000000000UL)
+#define MAP_FLAGS		(MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED)
+#else
+#define MAP_ADDR		NULL
+#define MAP_FLAGS		(MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB)
+#endif
+
+static void write_bytes(char *addr, size_t length)
+{
+	unsigned long i;
+
+	for (i = 0; i < length; i++)
+		*(addr + i) = (char)i;
+}
+
+static unsigned long virt_to_pfn(void *addr)
+{
+	int fd;
+	unsigned long pagemap;
+
+	fd = open("/proc/self/pagemap", O_RDONLY);
+	if (fd < 0)
+		return -1UL;
+
+	lseek(fd, (unsigned long)addr / PAGE_SIZE * sizeof(pagemap), SEEK_SET);
+	read(fd, &pagemap, sizeof(pagemap));
+	close(fd);
+
+	return pagemap & ~PM_PFRAME_MASK;
+}
+
+static int check_page_flags(unsigned long pfn)
+{
+	int fd, i;
+	unsigned long pageflags;
+
+	fd = open("/proc/kpageflags", O_RDONLY);
+	if (fd < 0)
+		return -1;
+
+	lseek(fd, pfn * sizeof(pageflags), SEEK_SET);
+
+	read(fd, &pageflags, sizeof(pageflags));
+	if ((pageflags & HEAD_PAGE_FLAGS) != HEAD_PAGE_FLAGS) {
+		close(fd);
+		printf("Head page flags (%lx) is invalid\n", pageflags);
+		return -1;
+	}
+
+	/*
+	 * Pages other than the first page must be tail and shouldn't be head;
+	 * this also verifies that the kernel has correctly set the fake page
+	 * head to tail while hugetlb_free_vmemmap is enabled.
+	 */
+	for (i = 1; i < MAP_LENGTH / PAGE_SIZE; i++) {
+		read(fd, &pageflags, sizeof(pageflags));
+		if ((pageflags & TAIL_PAGE_FLAGS) != TAIL_PAGE_FLAGS ||
+		    (pageflags & HEAD_PAGE_FLAGS) == HEAD_PAGE_FLAGS) {
+			close(fd);
+			printf("Tail page flags (%lx) is invalid\n", pageflags);
+			return -1;
+		}
+	}
+
+	close(fd);
+
+	return 0;
+}
+
+int main(int argc, char **argv)
+{
+	void *addr;
+	unsigned long pfn;
+
+	addr = mmap(MAP_ADDR, MAP_LENGTH, PROT_READ | PROT_WRITE, MAP_FLAGS, -1, 0);
+	if (addr == MAP_FAILED) {
+		perror("mmap");
+		exit(1);
+	}
+
+	/* Trigger allocation of HugeTLB page. */
+	write_bytes(addr, MAP_LENGTH);
+
+	pfn = virt_to_pfn(addr);
+	if (pfn == -1UL) {
+		munmap(addr, MAP_LENGTH);
+		perror("virt_to_pfn");
+		exit(1);
+	}
+
+	printf("Returned address is %p whose pfn is %lx\n", addr, pfn);
+
+	if (check_page_flags(pfn) < 0) {
+		munmap(addr, MAP_LENGTH);
+		perror("check_page_flags");
+		exit(1);
+	}
+
+	/* munmap() length of MAP_HUGETLB memory must be hugepage aligned */
+	if (munmap(addr, MAP_LENGTH)) {
+		perror("munmap");
+		exit(1);
+	}
+
+	return 0;
+}
diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh
index 75d4017413944..e09040a3dc089 100755
--- a/tools/testing/selftests/vm/run_vmtests.sh
+++ b/tools/testing/selftests/vm/run_vmtests.sh
@@ -119,6 +119,17 @@ else
 	echo "[PASS]"
 fi

+echo "------------------------"
+echo "running hugepage-vmemmap"
+echo "------------------------"
+./hugepage-vmemmap
+if [ $? -ne 0 ]; then
+	echo "[FAIL]"
+	exitcode=1
+else
+	echo "[PASS]"
+fi
+
 echo "NOTE: The above hugetlb tests provide minimal coverage. Use"
 echo "      https://github.com/libhugetlbfs/libhugetlbfs.git for"
 echo "      hugetlb regression testing."

From 475f7c701c5154f2961d36fa75efbf6e2734a766 Mon Sep 17 00:00:00 2001
From: Muchun Song
Date: Thu, 27 Jan 2022 16:26:02 +1100
Subject: [PATCH 097/154] mm: sparsemem: move vmemmap related to HugeTLB to CONFIG_HUGETLB_PAGE_FREE_VMEMMAP

The vmemmap_remap_free/alloc are relevant to HugeTLB, so move those functions to the scope of CONFIG_HUGETLB_PAGE_FREE_VMEMMAP.
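No !CONFIG_HUGETLB_PAGE_FREE_VMEMMAP stubs are needed here because the only
callers already sit under the same option.  Had there been unconditional
callers, the usual kernel header idiom would be a static inline stub, e.g.
(an illustrative sketch, not part of this patch):

	#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
	int vmemmap_remap_free(unsigned long start, unsigned long end,
			       unsigned long reuse);
	#else
	static inline int vmemmap_remap_free(unsigned long start,
					     unsigned long end,
					     unsigned long reuse)
	{
		return 0;	/* feature compiled out: nothing to remap */
	}
	#endif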
Link: https://lkml.kernel.org/r/20211101031651.75851-6-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Barry Song Cc: Bodeddula Balasubramaniam Cc: Chen Huang Cc: David Hildenbrand Cc: Fam Zheng Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Qi Zheng Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/mm.h | 2 ++ mm/sparse-vmemmap.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 213cc569b1922..95d90970aec08 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3151,10 +3151,12 @@ static inline void print_vma_addr(char *prefix, unsigned long rip) } #endif +#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP int vmemmap_remap_free(unsigned long start, unsigned long end, unsigned long reuse); int vmemmap_remap_alloc(unsigned long start, unsigned long end, unsigned long reuse, gfp_t gfp_mask); +#endif void *sparse_buffer_alloc(unsigned long size); struct page * __populate_section_memmap(unsigned long pfn, diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index c64d1aa3c4b50..8aecd6b3896c7 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -34,6 +34,7 @@ #include #include +#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP /** * struct vmemmap_remap_walk - walk vmemmap page table * @@ -419,6 +420,7 @@ int vmemmap_remap_alloc(unsigned long start, unsigned long end, return 0; } +#endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */ /* * Allocate a block of memory to be used to back the virtual memory map From 5c8a754f386008a8e0cf48ed208fafb91b221e6e Mon Sep 17 00:00:00 2001 From: Xiyu Yang Date: Thu, 27 Jan 2022 16:26:02 +1100 Subject: [PATCH 098/154] mm/mempolicy: convert from atomic_t to refcount_t on mempolicy->refcnt refcount_t type and corresponding API can protect refcounters from accidental underflow and overflow and further use-after-free situations. Link: https://lkml.kernel.org/r/1626683671-64407-1-git-send-email-xiyuyang19@fudan.edu.cn Signed-off-by: Xiyu Yang Signed-off-by: Xin Tan Acked-by: Ben Widawsky Reviewed-by: Muchun Song Cc: Feng Tang Cc: Mike Kravetz Cc: Muchun Song Cc: Yanfei Xu Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/mempolicy.h | 5 +++-- mm/mempolicy.c | 8 ++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 668389b4b53d7..44383ab8af554 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -6,6 +6,7 @@ #ifndef _LINUX_MEMPOLICY_H #define _LINUX_MEMPOLICY_H 1 +#include #include #include #include @@ -42,7 +43,7 @@ struct mm_struct; * to 1, representing the caller of mpol_dup(). 
*/ struct mempolicy { - atomic_t refcnt; + refcount_t refcnt; unsigned short mode; /* See MPOL_* above */ unsigned short flags; /* See set_mempolicy() MPOL_F_* above */ nodemask_t nodes; /* interleave/bind/perfer */ @@ -94,7 +95,7 @@ static inline struct mempolicy *mpol_dup(struct mempolicy *pol) static inline void mpol_get(struct mempolicy *pol) { if (pol) - atomic_inc(&pol->refcnt); + refcount_inc(&pol->refcnt); } extern bool __mpol_equal(struct mempolicy *a, struct mempolicy *b); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 028e8dd82b442..d23d2c660de9b 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -295,7 +295,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!policy) return ERR_PTR(-ENOMEM); - atomic_set(&policy->refcnt, 1); + refcount_set(&policy->refcnt, 1); policy->mode = mode; policy->flags = flags; policy->home_node = NUMA_NO_NODE; @@ -306,7 +306,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, /* Slow path of a mpol destructor. */ void __mpol_put(struct mempolicy *p) { - if (!atomic_dec_and_test(&p->refcnt)) + if (!refcount_dec_and_test(&p->refcnt)) return; kmem_cache_free(policy_cache, p); } @@ -2409,7 +2409,7 @@ struct mempolicy *__mpol_dup(struct mempolicy *old) nodemask_t mems = cpuset_mems_allowed(current); mpol_rebind_policy(new, &mems); } - atomic_set(&new->refcnt, 1); + refcount_set(&new->refcnt, 1); return new; } @@ -2706,7 +2706,7 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start, goto alloc_new; *mpol_new = *n->policy; - atomic_set(&mpol_new->refcnt, 1); + refcount_set(&mpol_new->refcnt, 1); sp_node_init(n_new, end, n->end, mpol_new); n->end = start; sp_insert(sp, n_new); From 7bd9c10ce5ac6521bb7b6f1550e0ee29d43d1612 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 27 Jan 2022 16:26:03 +1100 Subject: [PATCH 099/154] mm-mempolicy-convert-from-atomic_t-to-refcount_t-on-mempolicy-refcnt-fix fix warnings mm/mempolicy.c:125:42: warning: missing braces around initializer [-Wmissing-braces] 125 | static struct mempolicy default_policy = { | ^ mm/mempolicy.c:125:42: warning: missing braces around initializer [-Wmissing-braces] mm/mempolicy.c: In function 'numa_policy_init': mm/mempolicy.c:2815:32: warning: missing braces around initializer [-Wmissing-braces] 2815 | preferred_node_policy[nid] = (struct mempolicy) { | ^ mm/mempolicy.c:2815:32: warning: missing braces around initializer [-Wmissing-braces] Cc: Xiyu Yang Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/mempolicy.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d23d2c660de9b..a86590b2507da 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -123,7 +123,7 @@ enum zone_type policy_zone = 0; * run-time system-wide default policy => local allocation */ static struct mempolicy default_policy = { - .refcnt = ATOMIC_INIT(1), /* never free it */ + .refcnt = { ATOMIC_INIT(1), }, /* never free it */ .mode = MPOL_LOCAL, }; @@ -2900,7 +2900,7 @@ void __init numa_policy_init(void) for_each_node(nid) { preferred_node_policy[nid] = (struct mempolicy) { - .refcnt = ATOMIC_INIT(1), + .refcnt = { ATOMIC_INIT(1), }, .mode = MPOL_PREFERRED, .flags = MPOL_F_MOF | MPOL_F_MORON, .nodes = nodemask_of_node(nid), From 3f2eb4a4240db58ad5581bc98157550c0c156281 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Thu, 27 Jan 2022 16:26:03 +1100 Subject: [PATCH 100/154] mm/cma: provide option to opt 
out from exposing pages on activation failure Patch series "powerpc/fadump: handle CMA activation failure appropriately", v3. Commit 072355c1cf2d ("mm/cma: expose all pages to the buddy if activation of an area fails") started exposing all pages to buddy allocator on CMA activation failure. But there can be CMA users that want to handle the reserved memory differently on CMA allocation failure. Provide an option to opt out from exposing pages to buddy for such cases. Link: https://lkml.kernel.org/r/20220117075246.36072-1-hbathini@linux.ibm.com Link: https://lkml.kernel.org/r/20220117075246.36072-2-hbathini@linux.ibm.com Signed-off-by: Hari Bathini Reviewed-by: David Hildenbrand Cc: Oscar Salvador Cc: Mike Kravetz Cc: Mahesh Salgaonkar Cc: Sourabh Jain Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/cma.h | 2 ++ mm/cma.c | 11 +++++++++-- mm/cma.h | 1 + 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/include/linux/cma.h b/include/linux/cma.h index bd801023504b2..51d540eee18ac 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -50,4 +50,6 @@ extern bool cma_pages_valid(struct cma *cma, const struct page *pages, unsigned extern bool cma_release(struct cma *cma, const struct page *pages, unsigned long count); extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data); + +extern void cma_reserve_pages_on_error(struct cma *cma); #endif diff --git a/mm/cma.c b/mm/cma.c index bc9ca8f3c4871..766f1b82b5322 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -131,8 +131,10 @@ static void __init cma_activate_area(struct cma *cma) bitmap_free(cma->bitmap); out_error: /* Expose all pages to the buddy, they are useless for CMA. */ - for (pfn = base_pfn; pfn < base_pfn + cma->count; pfn++) - free_reserved_page(pfn_to_page(pfn)); + if (!cma->reserve_pages_on_error) { + for (pfn = base_pfn; pfn < base_pfn + cma->count; pfn++) + free_reserved_page(pfn_to_page(pfn)); + } totalcma_pages -= cma->count; cma->count = 0; pr_err("CMA area %s could not be activated\n", cma->name); @@ -150,6 +152,11 @@ static int __init cma_init_reserved_areas(void) } core_initcall(cma_init_reserved_areas); +void __init cma_reserve_pages_on_error(struct cma *cma) +{ + cma->reserve_pages_on_error = true; +} + /** * cma_init_reserved_mem() - create custom contiguous area from reserved memory * @base: Base address of the reserved area diff --git a/mm/cma.h b/mm/cma.h index 2c775877eae24..88a0595670b76 100644 --- a/mm/cma.h +++ b/mm/cma.h @@ -30,6 +30,7 @@ struct cma { /* kobject requires dynamic object */ struct cma_kobject *cma_kobj; #endif + bool reserve_pages_on_error; }; extern struct cma cma_areas[MAX_CMA_AREAS]; From 09710f817db56c89a3405b1df09a97bc8c7d8b79 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Thu, 27 Jan 2022 16:26:03 +1100 Subject: [PATCH 101/154] powerpc/fadump: opt out from freeing pages on cma activation failure With commit a4e92ce8e4c8 ("powerpc/fadump: Reservationless firmware assisted dump"), Linux kernel's Contiguous Memory Allocator (CMA) based reservation was introduced in fadump. That change was aimed at using CMA to let applications utilize the memory reserved for fadump while blocking it from being used for kernel pages. The assumption was, even if CMA activation fails for whatever reason, the memory still remains reserved to avoid it from being used for kernel pages. 
But commit 072355c1cf2d ("mm/cma: expose all pages to the buddy if activation of an area fails") breaks this assumption as it started exposing all pages to the buddy allocator on CMA activation failure.  It led to warning messages like below while running crash-utility on the vmcore of a kernel having the above two commits:

  crash: seek error: kernel virtual address:

To fix this problem, opt out from exposing pages to the buddy allocator on CMA activation failure for fadump reserved memory.

Link: https://lkml.kernel.org/r/20220117075246.36072-3-hbathini@linux.ibm.com
Signed-off-by: Hari Bathini
Acked-by: David Hildenbrand
Acked-by: Michael Ellerman
Cc: Mahesh Salgaonkar
Cc: Mike Kravetz
Cc: Oscar Salvador
Cc: Sourabh Jain
Signed-off-by: Andrew Morton
Signed-off-by: Stephen Rothwell
---
 arch/powerpc/kernel/fadump.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index d03e488cfe9ca..d0ad86b67e665 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -112,6 +112,12 @@ static int __init fadump_cma_init(void)
 		return 1;
 	}

+	/*
+	 * If CMA activation fails, keep the pages reserved, instead of
+	 * exposing them to buddy allocator. Same as 'fadump=nocma' case.
+	 */
+	cma_reserve_pages_on_error(fadump_cma);
+
 	/*
 	 * So we now have successfully initialized cma area for fadump.
 	 */

From 8436f451c3322154d1f618b9d06f9a36b8bedb5d Mon Sep 17 00:00:00 2001
From: Yang Yang
Date: Thu, 27 Jan 2022 16:26:04 +1100
Subject: [PATCH 102/154] mm/vmstat: add event for ksm swapping in copy

When a fault brings in from swap what used to be a KSM page, and that page had been swapped in before, the system has to make a copy and leave remerging the pages to a later pass of ksmd.  That is not good for performance; we had better reduce this kind of copying.  There are some ways to reduce it, for example lessening swappiness or applying madvise(, , MADV_MERGEABLE) to the range.  So add this event to support doing this tuning.  This follows the precedent of the patch "mm, THP, swap: add THP swapping out fallback counting".
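Once applied, the counter appears in /proc/vmstat alongside the other swap
events and can be sampled before and after a workload, e.g. (illustrative;
an actual value depends entirely on the workload):

	# grep ksm_swpin_copy /proc/vmstat
	ksm_swpin_copy 0

A rising value indicates KSM pages being copied on swapin, i.e. candidates
for the tuning described above.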
Link: https://lkml.kernel.org/r/20220113023839.758845-1-yang.yang29@zte.com.cn Signed-off-by: Yang Yang Reviewed-by: Ran Xiaokai Cc: Hugh Dickins Cc: Yang Shi Cc: Dave Hansen Cc: Saravanan D Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/vm_event_item.h | 3 +++ mm/ksm.c | 3 +++ mm/vmstat.c | 3 +++ 3 files changed, 9 insertions(+) diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 7b2363388bfa2..16a0a4fd000be 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -129,6 +129,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_SWAP SWAP_RA, SWAP_RA_HIT, +#ifdef CONFIG_KSM + KSM_SWPIN_COPY, +#endif #endif #ifdef CONFIG_X86 DIRECT_MAP_LEVEL2_SPLIT, diff --git a/mm/ksm.c b/mm/ksm.c index c20bd4d9a0d9e..4a7f8614e57d3 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2595,6 +2595,9 @@ struct page *ksm_might_need_to_copy(struct page *page, SetPageDirty(new_page); __SetPageUptodate(new_page); __SetPageLocked(new_page); +#ifdef CONFIG_SWAP + count_vm_event(KSM_SWPIN_COPY); +#endif } return new_page; diff --git a/mm/vmstat.c b/mm/vmstat.c index 4057372745d04..d2b9f27eb1c48 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1385,6 +1385,9 @@ const char * const vmstat_text[] = { #ifdef CONFIG_SWAP "swap_ra", "swap_ra_hit", +#ifdef CONFIG_KSM + "ksm_swpin_copy", +#endif #endif #ifdef CONFIG_X86 "direct_map_level2_splits", From 7ed183bf833cf2f58f5ddf78e13774dcf9a9be93 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 27 Jan 2022 16:26:04 +1100 Subject: [PATCH 103/154] mm/balloon_compaction: make balloon page compaction callbacks static Since commit b1123ea6d3b3 ("mm: balloon: use general non-lru movable page feature"), these functions are called via balloon_aops callbacks. They're not called directly outside this file. So make them static and clean up the relevant code. Link: https://lkml.kernel.org/r/20220125132221.2220-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Muchun Song Acked-by: Rafael Aquini Acked-by: Michael S. 
Tsirkin Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/balloon_compaction.h | 22 ---------------------- mm/balloon_compaction.c | 6 +++--- 2 files changed, 3 insertions(+), 25 deletions(-) diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index 338aa27e4773b..edb7f6d41faa0 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -80,12 +80,6 @@ static inline void balloon_devinfo_init(struct balloon_dev_info *balloon) #ifdef CONFIG_BALLOON_COMPACTION extern const struct address_space_operations balloon_aops; -extern bool balloon_page_isolate(struct page *page, - isolate_mode_t mode); -extern void balloon_page_putback(struct page *page); -extern int balloon_page_migrate(struct address_space *mapping, - struct page *newpage, - struct page *page, enum migrate_mode mode); /* * balloon_page_insert - insert a page into the balloon's page list and make @@ -155,22 +149,6 @@ static inline void balloon_page_delete(struct page *page) list_del(&page->lru); } -static inline bool balloon_page_isolate(struct page *page) -{ - return false; -} - -static inline void balloon_page_putback(struct page *page) -{ - return; -} - -static inline int balloon_page_migrate(struct page *newpage, - struct page *page, enum migrate_mode mode) -{ - return 0; -} - static inline gfp_t balloon_mapping_gfp_mask(void) { return GFP_HIGHUSER; diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index 907fefde25728..4b8eab4b3f456 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -203,7 +203,7 @@ EXPORT_SYMBOL_GPL(balloon_page_dequeue); #ifdef CONFIG_BALLOON_COMPACTION -bool balloon_page_isolate(struct page *page, isolate_mode_t mode) +static bool balloon_page_isolate(struct page *page, isolate_mode_t mode) { struct balloon_dev_info *b_dev_info = balloon_page_device(page); @@ -217,7 +217,7 @@ bool balloon_page_isolate(struct page *page, isolate_mode_t mode) return true; } -void balloon_page_putback(struct page *page) +static void balloon_page_putback(struct page *page) { struct balloon_dev_info *b_dev_info = balloon_page_device(page); unsigned long flags; @@ -230,7 +230,7 @@ void balloon_page_putback(struct page *page) /* move_to_new_page() counterpart for a ballooned page */ -int balloon_page_migrate(struct address_space *mapping, +static int balloon_page_migrate(struct address_space *mapping, struct page *newpage, struct page *page, enum migrate_mode mode) { From ffaeeac2ed48a5c7792f89bea72d57bba676ab90 Mon Sep 17 00:00:00 2001 From: Mauricio Faria de Oliveira Date: Thu, 27 Jan 2022 16:26:04 +1100 Subject: [PATCH 104/154] mm: fix race between MADV_FREE reclaim and blkdev direct IO read Problem: ======= Userspace might read the zero-page instead of actual data from a direct IO read on a block device if the buffers have been called madvise(MADV_FREE) on earlier (this is discussed below) due to a race between page reclaim on MADV_FREE and blkdev direct IO read. Race condition: ============== During page reclaim, the MADV_FREE page check in try_to_unmap_one() checks if the page is not dirty, then discards its PTE (vs remap it back if the page is dirty). However, after try_to_unmap_one() returns to shrink_page_list(), it might keep the page _anyway_ if page_ref_freeze() fails (it expects a single page ref from the isolation). Well, blkdev_direct_IO() gets references for all pages, and on READ operations it sets them dirty later. 
So, if MADV_FREE pages (i.e., not dirty) are used as buffers (more later) for direct IO read from block devices and page reclaim runs during __blkdev_direct_IO[_simple]() AFTER bio_iov_iter_get_pages() but BEFORE it sets pages dirty, that situation happens. The direct IO read eventually completes. Now, when userspace reads the buffers, the PTE is no longer there and the page fault handler do_anonymous_page() services that with the zero-page, NOT the data! A synthetic reproducer is provided. Page faults: =========== The data read from the block device probably won't generate faults due to DMA (no MMU) but even in the case it wouldn't use DMA, that happens on different virtual addresses (not user-mapped addresses) because `struct bio_vec` stores `struct page` to figure addresses out (which are different from/unrelated to user-mapped addresses) for the data read. Thus userspace reads (to user-mapped addresses) still fault, then do_anonymous_page() gets another `struct page` that would address/ map to other memory than the `struct page` used by `struct bio_vec` for the read (which runs correctly as the page wasn't freed due to page_ref_freeze(), and is reclaimed later) -- but even if the page addresses matched, that handler maps the zero-page in the PTE, not that page's memory (on read faults.) If page reclaim happens BEFORE bio_iov_iter_get_pages() the issue doesn't happen, because that faults-in all pages as writeable, so do_anonymous_page() sets up a new page/rmap/PTE, and that is used by direct IO. The userspace reads don't fault as the PTE is there (thus zero-page is not used.) Solution: ======== One solution is to check for the expected page reference count in try_to_unmap_one() too, which should be exactly two: one from the isolation (checked by shrink_page_list()), and the other from the rmap (dropped by the discard: label). If that doesn't match, then remap the PTE back, just like page dirty does. The new check in try_to_unmap_one() should be safe in races with bio_iov_iter_get_pages() in get_user_pages() fast and slow paths, as it's done under the PTE lock. The fast path doesn't take that lock but it checks the PTE has changed, then drops the reference and leaves the page for the slow path (which does take that lock). - try_to_unmap_one() - page_vma_mapped_walk() - map_pte() # see pte_offset_map_lock(): pte_offset_map() spin_lock() - page_ref_count() # new check - page_vma_mapped_walk_done() # see pte_unmap_unlock(): pte_unmap() spin_unlock() - bio_iov_iter_get_pages() - __bio_iov_iter_get_pages() - iov_iter_get_pages() - get_user_pages_fast() - internal_get_user_pages_fast() # fast path - lockless_pages_from_mm() - gup_{pgd,p4d,pud,pmd,pte}_range() ptep = pte_offset_map() # not _lock() pte = ptep_get_lockless(ptep) page = pte_page(pte) try_grab_compound_head(page) # get ref if (pte_val(pte) != pte_val(*ptep)) put_compound_head(page) # put ref # leave page for slow path # slow path - __gup_longterm_unlocked() - get_user_pages_unlocked() - __get_user_pages_locked() - __get_user_pages() - follow_{page,p4d,pud,pmd}_mask() - follow_page_pte() ptep = pte_offset_map_lock() pte = *ptep page = vm_normal_page(pte) try_grab_page(page) # get ref pte_unmap_unlock() Regarding transparent hugepages, that number shouldn't change, as MADV_FREE (aka lazyfree) pages are PageAnon() && !PageSwapBacked() (madvise_free_pte_range() -> mark_page_lazyfree() -> lru_lazyfree_fn()) thus should reach shrink_page_list() -> split_huge_page_to_list() before try_to_unmap[_one](), so it deals with normal pages only. 
(And in case unlikely/TTU_SPLIT_HUGE_PMD/split_huge_pmd_address() happens, which it should not or be rare, the page refcount is not two, as the head page counts tail pages, and tail pages have zero. That also prevents checking the head `page` then incorrectly call page_remove_rmap(subpage) for a tail page, that isn't even in the shrink_page_list()'s page_list (an effect of split huge pmd/pmvw), as it might happen today in this unlikely scenario.) MADV_FREE'd buffers: =================== So, back to the "if MADV_FREE pages are used as buffers" note. The case is arguable, and subject to multiple interpretations. The madvise(2) manual page on the MADV_FREE advice value says: - 'After a successful MADV_FREE ... data will be lost when the kernel frees the pages.' - 'the free operation will be canceled if the caller writes into the page' / 'subsequent writes ... will succeed and then [the] kernel cannot free those dirtied pages' - 'If there is no subsequent write, the kernel can free the pages at any time.' Thoughts, questions, considerations... - Since the kernel didn't actually free the page (page_ref_freeze() failed), should the data not have been lost? (on userspace read.) - Should writes performed by the direct IO read be able to cancel the free operation? - Should the direct IO read be considered as 'the caller' too, as it's been requested by 'the caller'? - Should the bio technique to dirty pages on return to userspace (bio_check_pages_dirty() is called/used by __blkdev_direct_IO()) be considered in another/special way here? - Should an upcoming write from a previously requested direct IO read be considered as a subsequent write, so the kernel should not free the pages? (as it's known at the time of page reclaim.) Technically, the last point would seem a reasonable consideration and balance, as the madvise(2) manual page apparently (and fairly) seem to assume that 'writes' are memory access from the userspace process (not explicitly considering writes from the kernel or its corner cases; again, fairly).. plus the kernel fix implementation for the corner case of the largely 'non-atomic write' encompassed by a direct IO read operation, is relatively simple; and it helps. Reproducer: ========== @ test.c (simplified, but works) #define _GNU_SOURCE #include #include #include #include int main() { int fd, i; char *buf; fd = open(DEV, O_RDONLY | O_DIRECT); buf = mmap(NULL, BUF_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); for (i = 0; i < BUF_SIZE; i += PAGE_SIZE) buf[i] = 1; // init to non-zero madvise(buf, BUF_SIZE, MADV_FREE); read(fd, buf, BUF_SIZE); for (i = 0; i < BUF_SIZE; i += PAGE_SIZE) printf("%p: 0x%x ", &buf[i], buf[i]); return 0; } @ block/fops.c (formerly fs/block_dev.c) +#include ... ... __blkdev_direct_IO[_simple](...) { ... + if (!strcmp(current->comm, "good")) + shrink_all_memory(ULONG_MAX); + ret = bio_iov_iter_get_pages(...); + + if (!strcmp(current->comm, "bad")) + shrink_all_memory(ULONG_MAX); ... 
} @ shell # yes | dd of=test.img bs=1k count=16 # DEV=$(losetup -f --show test.img) # gcc -DDEV=\"$DEV\" -DBUF_SIZE=16384 -DPAGE_SIZE=4096 test.c -o test # od -tx1 $DEV 0000000 79 0a 79 0a 79 0a 79 0a 79 0a 79 0a 79 0a 79 0a * 0040000 # mv test good # ./good 0x7f1509206000: 0x79 0x7f1509207000: 0x79 0x7f1509208000: 0x79 0x7f1509209000: 0x79 # mv good bad # ./bad 0x7fd87272f000: 0x0 0x7fd872730000: 0x0 0x7fd872731000: 0x0 0x7fd872732000: 0x0 Ceph/TCMalloc: ============= For documentation purposes, the use case driving the analysis/fix is Ceph on Ubuntu 18.04, as the TCMalloc library there still uses MADV_FREE to release unused memory to the system from the mmap'ed page heap (might be committed back/used again; it's not munmap'ed.) - PageHeap::DecommitSpan() -> TCMalloc_SystemRelease() -> madvise() - PageHeap::CommitSpan() -> TCMalloc_SystemCommit() -> do nothing. Note: TCMalloc switched back to MADV_DONTNEED a few commits after the release in Ubuntu 18.04 (google-perftools/gperftools 2.5), so the issue just 'disappeared' on Ceph on later Ubuntu releases but is still present in the kernel, and can be hit by other use cases. The observed issue seems to be the old Ceph bug #22464 [1], where checksum mismatches are observed (and instrumentation with buffer dumps shows zero-pages read from mmap'ed/MADV_FREE'd page ranges). The issue in Ceph was reasonably deemed a kernel bug (comment #50) and mostly worked around with a retry mechanism, but other parts of Ceph could still hit that (rocksdb). Anyway, it's less likely to be hit again as TCMalloc switched out of MADV_FREE by default. (Some kernel versions/reports from the Ceph bug, and relation with the MADV_FREE introduction/changes; TCMalloc versions not checked.) - 4.4 good - 4.5 (madv_free: introduction) - 4.9 bad - 4.10 good? maybe a swapless system - 4.12 (madv_free: no longer free instantly on swapless systems) - 4.13 bad [1] https://tracker.ceph.com/issues/22464 Thanks: ====== Several people contributed to analysis/discussions/tests/reproducers in the first stages when drilling down on ceph/tcmalloc/linux kernel: - Dan Hill - Dan Streetman - Dongdong Tao - Gavin Guo - Gerald Yang - Heitor Alves de Siqueira - Ioanna Alifieraki - Jay Vosburgh - Matthew Ruffell - Ponnuvel Palaniyappan Link: https://lkml.kernel.org/r/20211211022115.1547617-1-mfo@canonical.com Signed-off-by: Mauricio Faria de Oliveira Cc: Dan Hill Cc: Dan Streetman Cc: Dongdong Tao Cc: Gavin Guo Cc: Gerald Yang Cc: Heitor Alves de Siqueira Cc: Ioanna Alifieraki Cc: Jay Vosburgh Cc: Matthew Ruffell Cc: Ponnuvel Palaniyappan Cc: Minchan Kim Cc: Huang Ying Cc: Miaohe Lin Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/rmap.c | 13 ++++++++++++- mm/vmscan.c | 2 +- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 6a1e8c7f62136..1882ee0780e31 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1599,7 +1599,18 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, /* MADV_FREE page check */ if (!PageSwapBacked(page)) { - if (!PageDirty(page)) { + int refcount = page_ref_count(page); + + /* + * The only page refs must be from the isolation + * (checked by the caller shrink_page_list() too) + * and the (single) rmap (dropped by discard:). + * + * Check the reference count before dirty flag + * with memory barrier; see __remove_mapping(). 
+ */ + smp_rmb(); + if (refcount == 2 && !PageDirty(page)) { /* Invalidate as we cleared the pte */ mmu_notifier_invalidate_range(mm, address, address + PAGE_SIZE); diff --git a/mm/vmscan.c b/mm/vmscan.c index 090bfb605ecf0..0dbfa3a695675 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1729,7 +1729,7 @@ static unsigned int shrink_page_list(struct list_head *page_list, mapping = page_mapping(page); } } else if (unlikely(PageTransHuge(page))) { - /* Split file THP */ + /* Split file/lazyfree THP */ if (split_huge_page_to_list(page, page_list)) goto keep_locked; } From e1f12069eb336333d3ddf7977aaeb9a375df4a0e Mon Sep 17 00:00:00 2001 From: Xiyu Yang Date: Thu, 27 Jan 2022 16:26:04 +1100 Subject: [PATCH 105/154] mm/rmap: convert from atomic_t to refcount_t on anon_vma->refcount refcount_t type and corresponding API can protect refcounters from accidental underflow and overflow and further use-after-free situations. Link: https://lkml.kernel.org/r/1626665029-49104-1-git-send-email-xiyuyang19@fudan.edu.cn Signed-off-by: Xiyu Yang Signed-off-by: Xin Tan Cc: Alistair Popple Cc: Yang Shi Cc: Shakeel Butt Cc: Hugh Dickins Cc: Xiyu Yang Cc: Miaohe Lin Cc: Cc: Xin Tan Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/rmap.h | 8 +++++--- mm/rmap.c | 14 +++++++------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index e704b1a4c06c0..221c3c6438a78 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -12,6 +12,8 @@ #include #include +#include + /* * The anon_vma heads a list of private "related" vmas, to scan if * an anonymous page pointing to this anon_vma needs to be unmapped: @@ -36,7 +38,7 @@ struct anon_vma { * the reference is responsible for clearing up the * anon_vma if they are the last user on release */ - atomic_t refcount; + refcount_t refcount; /* * Count of child anon_vmas and VMAs which points to this anon_vma. 
@@ -100,14 +102,14 @@ enum ttu_flags { #ifdef CONFIG_MMU static inline void get_anon_vma(struct anon_vma *anon_vma) { - atomic_inc(&anon_vma->refcount); + refcount_inc(&anon_vma->refcount); } void __put_anon_vma(struct anon_vma *anon_vma); static inline void put_anon_vma(struct anon_vma *anon_vma) { - if (atomic_dec_and_test(&anon_vma->refcount)) + if (refcount_dec_and_test(&anon_vma->refcount)) __put_anon_vma(anon_vma); } diff --git a/mm/rmap.c b/mm/rmap.c index 1882ee0780e31..b0fd9dc19eba8 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -89,7 +89,7 @@ static inline struct anon_vma *anon_vma_alloc(void) anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); if (anon_vma) { - atomic_set(&anon_vma->refcount, 1); + refcount_set(&anon_vma->refcount, 1); anon_vma->degree = 1; /* Reference for first vma */ anon_vma->parent = anon_vma; /* @@ -104,7 +104,7 @@ static inline struct anon_vma *anon_vma_alloc(void) static inline void anon_vma_free(struct anon_vma *anon_vma) { - VM_BUG_ON(atomic_read(&anon_vma->refcount)); + VM_BUG_ON(refcount_read(&anon_vma->refcount)); /* * Synchronize against page_lock_anon_vma_read() such that @@ -446,7 +446,7 @@ static void anon_vma_ctor(void *data) struct anon_vma *anon_vma = data; init_rwsem(&anon_vma->rwsem); - atomic_set(&anon_vma->refcount, 0); + refcount_set(&anon_vma->refcount, 0); anon_vma->rb_root = RB_ROOT_CACHED; } @@ -496,7 +496,7 @@ struct anon_vma *page_get_anon_vma(struct page *page) goto out; anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); - if (!atomic_inc_not_zero(&anon_vma->refcount)) { + if (!refcount_inc_not_zero(&anon_vma->refcount)) { anon_vma = NULL; goto out; } @@ -555,7 +555,7 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page) } /* trylock failed, we got to sleep */ - if (!atomic_inc_not_zero(&anon_vma->refcount)) { + if (!refcount_inc_not_zero(&anon_vma->refcount)) { anon_vma = NULL; goto out; } @@ -570,7 +570,7 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page) rcu_read_unlock(); anon_vma_lock_read(anon_vma); - if (atomic_dec_and_test(&anon_vma->refcount)) { + if (refcount_dec_and_test(&anon_vma->refcount)) { /* * Oops, we held the last refcount, release the lock * and bail -- can't simply use put_anon_vma() because @@ -2268,7 +2268,7 @@ void __put_anon_vma(struct anon_vma *anon_vma) struct anon_vma *root = anon_vma->root; anon_vma_free(anon_vma); - if (root != anon_vma && atomic_dec_and_test(&root->refcount)) + if (root != anon_vma && refcount_dec_and_test(&root->refcount)) anon_vma_free(root); } From d71ed116cfcc5f3f5c53654f8ad54d899e1187cb Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Thu, 27 Jan 2022 16:26:05 +1100 Subject: [PATCH 106/154] mm/zswap.c: allow handling just same-value filled pages Zswap has an ability to efficiently store same-value filled pages, which can be turned on and off using the "same_filled_pages_enabled" parameter. However, there is currently no way to enable just this (lightweight) functionality, while not making use of the whole compressed page storage machinery. Add a "non_same_filled_pages_enabled" parameter which allows disabling handling of pages that aren't same-value filled. This way zswap can be run in such lightweight same-value filled pages only mode. Link: https://lkml.kernel.org/r/7dbafa963e8bab43608189abbe2067f4b9287831.1641247624.git.maciej.szmigiero@oracle.com Signed-off-by: Maciej S. 
Szmigiero Cc: Seth Jennings Cc: Dan Streetman Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- Documentation/admin-guide/mm/zswap.rst | 22 +++++++++++++++++++--- mm/zswap.c | 15 ++++++++++++++- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/mm/zswap.rst b/Documentation/admin-guide/mm/zswap.rst index 8edb8d578caf7..6e6f7b0d6562b 100644 --- a/Documentation/admin-guide/mm/zswap.rst +++ b/Documentation/admin-guide/mm/zswap.rst @@ -130,9 +130,25 @@ attribute, e.g.:: echo 1 > /sys/module/zswap/parameters/same_filled_pages_enabled When zswap same-filled page identification is disabled at runtime, it will stop -checking for the same-value filled pages during store operation. However, the -existing pages which are marked as same-value filled pages remain stored -unchanged in zswap until they are either loaded or invalidated. +checking for the same-value filled pages during store operation. +In other words, every page will be then considered non-same-value filled. +However, the existing pages which are marked as same-value filled pages remain +stored unchanged in zswap until they are either loaded or invalidated. + +In some circumstances it might be advantageous to make use of just the zswap +ability to efficiently store same-filled pages without enabling the whole +compressed page storage. +In this case the handling of non-same-value pages by zswap (enabled by default) +can be disabled by setting the ``non_same_filled_pages_enabled`` attribute +to 0, e.g. ``zswap.non_same_filled_pages_enabled=0``. +It can also be enabled and disabled at runtime using the sysfs +``non_same_filled_pages_enabled`` attribute, e.g.:: + + echo 1 > /sys/module/zswap/parameters/non_same_filled_pages_enabled + +Disabling both ``zswap.same_filled_pages_enabled`` and +``zswap.non_same_filled_pages_enabled`` effectively disables accepting any new +pages by zswap. To prevent zswap from shrinking pool when zswap is full and there's a high pressure on swap (this will result in flipping pages in and out zswap pool diff --git a/mm/zswap.c b/mm/zswap.c index cdf6950fcb2e3..3efd8cae315e7 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -120,11 +120,19 @@ static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */ module_param_named(accept_threshold_percent, zswap_accept_thr_percent, uint, 0644); -/* Enable/disable handling same-value filled pages (enabled by default) */ +/* + * Enable/disable handling same-value filled pages (enabled by default). + * If disabled every page is considered non-same-value filled. 
+ */ static bool zswap_same_filled_pages_enabled = true; module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled, bool, 0644); +/* Enable/disable handling non-same-value filled pages (enabled by default) */ +static bool zswap_non_same_filled_pages_enabled = true; +module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled, + bool, 0644); + /********************************* * data structures **********************************/ @@ -1147,6 +1155,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, kunmap_atomic(src); } + if (!zswap_non_same_filled_pages_enabled) { + ret = -EINVAL; + goto freepage; + } + /* if entry is successfully added, it keeps the reference */ entry->pool = zswap_pool_current_get(); if (!entry->pool) { From e3692d5e92dc7957279f1c2eaf02808650389397 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 27 Jan 2022 16:26:05 +1100 Subject: [PATCH 107/154] highmem: document kunmap_local() Some users of kmap() add an offset to the kmap() address to be used during the mapping. When converting to kmap_local_page() the base address does not need to be stored because any address within the page can be used in kunmap_local(). However, this was not clear from the documentation and caused some questions.[1] Document that any address in the page can be used in kunmap_local() to clarify this for future users. [1] https://lore.kernel.org/lkml/20211213154543.GM3538886@iweiny-DESK2.sc.intel.com/ Link: https://lkml.kernel.org/r/20220124013045.806718-1-ira.weiny@intel.com Signed-off-by: Ira Weiny Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/highmem-internal.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h index 0a0b2b09b1b8d..fb2d3e033c013 100644 --- a/include/linux/highmem-internal.h +++ b/include/linux/highmem-internal.h @@ -246,6 +246,17 @@ do { \ __kunmap_atomic(__addr); \ } while (0) +/** + * kunmap_local - Unmap a page mapped via kmap_local_page(). + * @__addr: An address within the page mapped + * + * __addr is often an address returned from kmap_local_page(). However, + * this address can be any address within the mapped page. It does not need to + * be the exact address returned from kmap_local_page() + * + * Unmapping should be done in the reverse order of the mapping. See + * kmap_local_page() for details. + */ #define kunmap_local(__addr) \ do { \ BUILD_BUG_ON(__same_type((__addr), struct page *)); \
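A minimal usage sketch of the documented behavior (illustrative only; page, offset, buf and len are assumed to exist in the caller):

	char *addr = kmap_local_page(page);	/* map the page */
	char *p = addr + offset;		/* any address inside the page */

	memcpy(buf, p, len);
	kunmap_local(p);	/* fine: need not be 'addr' itself; unmaps must
				 * still be done in reverse order of the maps */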
From 43a22a907d084c67102be51ab9032c990c98784f Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 27 Jan 2022 16:26:05 +1100 Subject: [PATCH 108/154] highmem-document-kunmap_local-v2 updates per Christoph Link: https://lkml.kernel.org/r/20220124182138.816693-1-ira.weiny@intel.com Signed-off-by: Ira Weiny Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/highmem-internal.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h index fb2d3e033c013..a77be56302094 100644 --- a/include/linux/highmem-internal.h +++ b/include/linux/highmem-internal.h @@ -250,9 +250,8 @@ do { \ * kunmap_local - Unmap a page mapped via kmap_local_page(). * @__addr: An address within the page mapped * - * __addr is often an address returned from kmap_local_page(). However, - * this address can be any address within the mapped page. It does not need to - * be the exact address returned from kmap_local_page() + * @__addr can be any address within the mapped page. Commonly it is the + * address returned from kmap_local_page(), but it can also include offsets. * * Unmapping should be done in the reverse order of the mapping. See * kmap_local_page() for details. From 929fcee239b3b9d5c251cc410311a80c4fd8776a Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 27 Jan 2022 16:26:05 +1100 Subject: [PATCH 109/154] mm/highmem: remove unnecessary done label Remove the unnecessary done label to simplify the code. Link: https://lkml.kernel.org/r/20220126092542.64659-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Muchun Song Reviewed-by: David Hildenbrand Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/highmem.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/highmem.c b/mm/highmem.c index 762679050c9a0..0cc0c4da7ed9f 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -736,11 +736,11 @@ void *page_address(const struct page *page) list_for_each_entry(pam, &pas->lh, list) { if (pam->page == page) { ret = pam->virtual; - goto done; + break; } } } -done: + spin_unlock_irqrestore(&pas->lock, flags); return ret; } @@ -773,13 +773,12 @@ void set_page_address(struct page *page, void *virtual) list_for_each_entry(pam, &pas->lh, list) { if (pam->page == page) { list_del(&pam->list); - spin_unlock_irqrestore(&pas->lock, flags); - goto done; + break; } } spin_unlock_irqrestore(&pas->lock, flags); } -done: + return; } From 4e337b4a7a4f26305dbeffbca9ed8615eb152fc7 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 27 Jan 2022 16:26:06 +1100 Subject: [PATCH 110/154] mm/hmm.c: remove unneeded local variable ret The local variable ret is always 0. Remove it to make the code tighter. Link: https://lkml.kernel.org/r/20220125124833.39718-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/hmm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/hmm.c b/mm/hmm.c index bd56641c79d4e..af71aac3140e4 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -417,7 +417,6 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, struct hmm_range *range = hmm_vma_walk->range; unsigned long addr = start; pud_t pud; - int ret = 0; spinlock_t *ptl = pud_trans_huge_lock(pudp, walk->vma); if (!ptl) @@ -466,7 +465,7 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, out_unlock: spin_unlock(ptl); - return ret; + return 0; } #else #define hmm_vma_walk_pud NULL From 967de868fdf6cf1bfc510551de4dc136f1faa160 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 27 Jan 2022 16:26:06 +1100 Subject: [PATCH 111/154] mm/damon/dbgfs/init_regions: use target index instead of target id Patch series "Remove the type-unclear target id concept". DAMON asks each monitoring target ('struct damon_target') to have one 'unsigned long' integer called 'id', which should be unique among the targets of the same monitoring context. Its meaning is, however, totally up to the monitoring primitives that are registered to the monitoring context. For example, the virtual address spaces monitoring primitives treat the id as a 'struct pid' pointer. This makes the code flexible but ugly, not well-documented, and type-unsafe[1]. Also, identification of each target can be done via its index. For this reason, this patchset removes the concept and uses a clear type definition. [1] https://lore.kernel.org/linux-mm/20211013154535.4aaeaaf9d0182922e405dd1e@linux-foundation.org/
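A minimal sketch of the index-based lookup this patchset moves to (illustrative only; nth_target() is a hypothetical helper, while damon_for_each_target() is the real iterator from include/linux/damon.h):

static struct damon_target *nth_target(struct damon_ctx *ctx, int target_idx)
{
	struct damon_target *t;
	int idx = 0;

	damon_for_each_target(t, ctx) {
		if (idx++ == target_idx)
			return t;	/* the target_idx'th target */
	}
	return NULL;			/* no such target */
}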
This patch (of 4): The target id is an 'unsigned long' value which can be interpreted differently by each monitoring primitive. For example, it means 'struct pid *' for the virtual address spaces monitoring, while it means nothing but an integer to be displayed to debugfs interface users for the physical address space monitoring. It's flexible but makes the code ugly and type-unsafe[1]. To be prepared for eventual removal of the concept, this commit removes a use case of the concept in 'init_regions' debugfs file handling. In detail, this commit replaces use of the id with the index of each target in the context's targets list. [1] https://lore.kernel.org/linux-mm/20211013154535.4aaeaaf9d0182922e405dd1e@linux-foundation.org/ Link: https://lkml.kernel.org/r/20211230100723.2238-1-sj@kernel.org Link: https://lkml.kernel.org/r/20211230100723.2238-2-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/damon/dbgfs-test.h | 20 ++++++++++---------- mm/damon/dbgfs.c | 25 ++++++++++++------------- 2 files changed, 22 insertions(+), 23 deletions(-) diff --git a/mm/damon/dbgfs-test.h b/mm/damon/dbgfs-test.h index 86b9f9528231e..00bff058fe08f 100644 --- a/mm/damon/dbgfs-test.h +++ b/mm/damon/dbgfs-test.h @@ -113,19 +113,19 @@ static void damon_dbgfs_test_set_init_regions(struct kunit *test) { struct damon_ctx *ctx = damon_new_ctx(); unsigned long ids[] = {1, 2, 3}; - /* Each line represents one region in ``<target id> <start address> <end address>`` */ - char * const valid_inputs[] = {"2 10 20\n 2 20 30\n2 35 45", - "2 10 20\n", - "2 10 20\n1 39 59\n1 70 134\n 2 20 25\n", + /* Each line represents one region in ``<target idx> <start address> <end address>`` */ + char * const valid_inputs[] = {"1 10 20\n 1 20 30\n1 35 45", + "1 10 20\n", + "1 10 20\n0 39 59\n0 70 134\n 1 20 25\n", ""}; /* Reading the file again will show sorted, clean output */ - char * const valid_expects[] = {"2 10 20\n2 20 30\n2 35 45\n", - "2 10 20\n", - "1 39 59\n1 70 134\n2 10 20\n2 20 25\n", + char * const valid_expects[] = {"1 10 20\n1 20 30\n1 35 45\n", + "1 10 20\n", + "0 39 59\n0 70 134\n1 10 20\n1 20 25\n", ""}; - char * const invalid_inputs[] = {"4 10 20\n", /* target not exists */ - "2 10 20\n 2 14 26\n", /* regions overlap */ - "1 10 20\n2 30 40\n 1 5 8"}; /* not sorted by address */ + char * const invalid_inputs[] = {"3 10 20\n", /* target not exists */ + "1 10 20\n 1 14 26\n", /* regions overlap */ + "0 10 20\n1 30 40\n 0 5 8"}; /* not sorted by address */ char *input, *expect; int i, rc; char buf[256]; diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 5b899601e56c3..3f65af04e4e60 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -440,18 +440,20 @@ static ssize_t sprint_init_regions(struct damon_ctx *c, char *buf, ssize_t len) { struct damon_target *t; struct damon_region *r; + int target_idx = 0; int written = 0; int rc; damon_for_each_target(t, c) { damon_for_each_region(r, t) { rc = scnprintf(&buf[written], len - written, - "%lu %lu %lu\n", - t->id, r->ar.start, r->ar.end); + "%d %lu %lu\n", + target_idx, r->ar.start, r->ar.end); if (!rc) return -ENOMEM; written += rc; } + target_idx++; } return written; } @@ -485,22 +487,19 @@ static ssize_t dbgfs_init_regions_read(struct file *file, char __user *buf, return len; } -static int add_init_region(struct damon_ctx *c, - unsigned long target_id, struct damon_addr_range *ar) +static int add_init_region(struct damon_ctx *c, int target_idx, + struct damon_addr_range *ar) { struct damon_target *t; struct damon_region *r, *prev; - unsigned long id; + unsigned long idx = 0; int rc = -EINVAL; if (ar->start >= ar->end) return -EINVAL; damon_for_each_target(t, c) { - id = t->id; - if (targetid_is_pid(c)) - id = (unsigned long)pid_vnr((struct pid *)id); - if (id == target_id) { + if (idx++ == target_idx) { r = damon_new_region(ar->start, ar->end); if (!r) return -ENOMEM; @@ -523,7 +522,7 @@ static int set_init_regions(struct damon_ctx *c, const char *str, ssize_t len) struct damon_target *t; struct damon_region *r, *next; int pos = 0, parsed, ret; - unsigned long target_id; + int target_idx; struct damon_addr_range ar; int err; @@ -533,11 +532,11 @@ static int set_init_regions(struct damon_ctx *c, const char *str, ssize_t len) } while (pos < len) { - ret = sscanf(&str[pos], "%lu %lu %lu%n", - &target_id, &ar.start, &ar.end, &parsed); + ret = sscanf(&str[pos], "%d %lu %lu%n", + &target_idx, &ar.start, &ar.end, &parsed); if (ret != 3) break; - err = add_init_region(c, target_id, &ar); + err = add_init_region(c, target_idx, &ar); if (err) goto fail; pos += parsed;
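The new input format is consumed with sscanf()'s %n conversion to advance through the buffer; a condensed sketch of that parsing loop, mirroring set_init_regions() above (illustrative only; the literal input string and the pr_info() reporting are made up for the example):

	const char *str = "0 10 20\n1 30 40\n";	/* "<idx> <start> <end>" lines */
	int pos = 0, target_idx, parsed;
	unsigned long start, end;

	while (sscanf(&str[pos], "%d %lu %lu%n",
		      &target_idx, &start, &end, &parsed) == 3) {
		pos += parsed;	/* %n: number of characters consumed so far */
		pr_info("target %d: [%lu, %lu)\n", target_idx, start, end);
	}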
From 717ac6f1bc297d9a1ba501d0d36848b962713fcc Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 27 Jan 2022 16:26:06 +1100 Subject: [PATCH 112/154] Docs/admin-guide/mm/damon/usage: update for changed init_regions file input A previous commit made the init_regions debugfs file use the target index instead of the target id for specifying the target of the init regions. This commit updates the usage document to reflect the change. Link: https://lkml.kernel.org/r/20211230100723.2238-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- Documentation/admin-guide/mm/damon/usage.rst | 24 ++++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 59b84904a8543..1e06435b8ff67 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -108,19 +108,23 @@ In such cases, users can explicitly set the initial monitoring target regions as they want, by writing proper values to the ``init_regions`` file. Each line of the input should represent one region in below form.:: - <target id> <start address> <end address> + <target idx> <start address> <end address> -The ``target id`` should already in ``target_ids`` file, and the regions should -be passed in address order. For example, below commands will set a couple of -address ranges, ``1-100`` and ``100-200`` as the initial monitoring target -region of process 42, and another couple of address ranges, ``20-40`` and -``50-100`` as that of process 4242.:: +The ``target idx`` should be the index of the target in ``target_ids`` file, +starting from ``0``, and the regions should be passed in address order. For +example, below commands will set a couple of address ranges, ``1-100`` and +``100-200`` as the initial monitoring target region of pid 42, which is the +first one (index ``0``) in ``target_ids``, and another couple of address +ranges, ``20-40`` and ``50-100`` as that of pid 4242, which is the second one +(index ``1``) in ``target_ids``.:: # cd /damon - # echo "42 1 100 - 42 100 200 - 4242 20 40 - 4242 50 100" > init_regions + # cat target_ids + 42 4242 + # echo "0 1 100 + 0 100 200 + 1 20 40 + 1 50 100" > init_regions Note that this sets the initial monitoring target regions only.
In case of virtual memory monitoring, DAMON will automatically updates the boundary of the From 94ccaf24bca598f8a7398739d240852c5b7b1327 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 27 Jan 2022 16:26:07 +1100 Subject: [PATCH 113/154] mm/damon/core: move damon_set_targets() into dbgfs The damon_set_targets() function is defined in the core for general use cases, but it is called only from dbgfs. Also, because the function is for general use cases, dbgfs does additional handling of the pid-type target id case. To make the situation simpler, this commit moves the function into dbgfs and makes it do the pid-type case handling on its own. Link: https://lkml.kernel.org/r/20211230100723.2238-4-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/damon.h | 2 -- mm/damon/core-test.h | 5 +++- mm/damon/core.c | 32 -------------------------- mm/damon/dbgfs-test.h | 14 ++++++------ mm/damon/dbgfs.c | 53 +++++++++++++++++++++++++++++++++---------- 5 files changed, 52 insertions(+), 54 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 5e1e3a128b77a..bd021af5db3d1 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -484,8 +484,6 @@ unsigned int damon_nr_regions(struct damon_target *t); struct damon_ctx *damon_new_ctx(void); void damon_destroy_ctx(struct damon_ctx *ctx); -int damon_set_targets(struct damon_ctx *ctx, - unsigned long *ids, ssize_t nr_ids); int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, unsigned long aggr_int, unsigned long primitive_upd_int, unsigned long min_nr_reg, unsigned long max_nr_reg); diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h index 7008c3735e99f..4a6141ddd6fcf 100644 --- a/mm/damon/core-test.h +++ b/mm/damon/core-test.h @@ -86,7 +86,10 @@ static void damon_test_aggregate(struct kunit *test) struct damon_region *r; int it, ir; - damon_set_targets(ctx, target_ids, 3); + for (it = 0; it < 3; it++) { + t = damon_new_target(target_ids[it]); + damon_add_target(ctx, t); + } it = 0; damon_for_each_target(t, ctx) { diff --git a/mm/damon/core.c b/mm/damon/core.c index 1dd153c31c9e2..3fef5c667a31d 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -245,38 +245,6 @@ void damon_destroy_ctx(struct damon_ctx *ctx) kfree(ctx); } -/** - * damon_set_targets() - Set monitoring targets. - * @ctx: monitoring context - * @ids: array of target ids - * @nr_ids: number of entries in @ids - * - * This function should not be called while the kdamond is running. - * - * Return: 0 on success, negative error code otherwise. - */ -int damon_set_targets(struct damon_ctx *ctx, - unsigned long *ids, ssize_t nr_ids) -{ - ssize_t i; - struct damon_target *t, *next; - - damon_destroy_targets(ctx); - - for (i = 0; i < nr_ids; i++) { - t = damon_new_target(ids[i]); - if (!t) { - /* The caller should do cleanup of the ids itself */ - damon_for_each_target_safe(t, next, ctx) - damon_destroy_target(t); - return -ENOMEM; - } - damon_add_target(ctx, t); - } - - return 0; -} - /** * damon_set_attrs() - Set attributes for the monitoring.
* @ctx: monitoring context diff --git a/mm/damon/dbgfs-test.h b/mm/damon/dbgfs-test.h index 00bff058fe08f..c1c988b607bc9 100644 --- a/mm/damon/dbgfs-test.h +++ b/mm/damon/dbgfs-test.h @@ -86,23 +86,23 @@ static void damon_dbgfs_test_set_targets(struct kunit *test) ctx->primitive.target_valid = NULL; ctx->primitive.cleanup = NULL; - damon_set_targets(ctx, ids, 3); + dbgfs_set_targets(ctx, ids, 3); sprint_target_ids(ctx, buf, 64); KUNIT_EXPECT_STREQ(test, (char *)buf, "1 2 3\n"); - damon_set_targets(ctx, NULL, 0); + dbgfs_set_targets(ctx, NULL, 0); sprint_target_ids(ctx, buf, 64); KUNIT_EXPECT_STREQ(test, (char *)buf, "\n"); - damon_set_targets(ctx, (unsigned long []){1, 2}, 2); + dbgfs_set_targets(ctx, (unsigned long []){1, 2}, 2); sprint_target_ids(ctx, buf, 64); KUNIT_EXPECT_STREQ(test, (char *)buf, "1 2\n"); - damon_set_targets(ctx, (unsigned long []){2}, 1); + dbgfs_set_targets(ctx, (unsigned long []){2}, 1); sprint_target_ids(ctx, buf, 64); KUNIT_EXPECT_STREQ(test, (char *)buf, "2\n"); - damon_set_targets(ctx, NULL, 0); + dbgfs_set_targets(ctx, NULL, 0); sprint_target_ids(ctx, buf, 64); KUNIT_EXPECT_STREQ(test, (char *)buf, "\n"); @@ -130,7 +130,7 @@ static void damon_dbgfs_test_set_init_regions(struct kunit *test) int i, rc; char buf[256]; - damon_set_targets(ctx, ids, 3); + dbgfs_set_targets(ctx, ids, 3); /* Put valid inputs and check the results */ for (i = 0; i < ARRAY_SIZE(valid_inputs); i++) { @@ -158,7 +158,7 @@ static void damon_dbgfs_test_set_init_regions(struct kunit *test) KUNIT_EXPECT_STREQ(test, (char *)buf, ""); } - damon_set_targets(ctx, NULL, 0); + dbgfs_set_targets(ctx, NULL, 0); damon_destroy_ctx(ctx); } diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 3f65af04e4e60..58867b9666350 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -358,11 +358,48 @@ static void dbgfs_put_pids(unsigned long *ids, int nr_ids) put_pid((struct pid *)ids[i]); } +/* + * dbgfs_set_targets() - Set monitoring targets. + * @ctx: monitoring context + * @ids: array of target ids + * @nr_ids: number of entries in @ids + * + * This function should not be called while the kdamond is running. + * + * Return: 0 on success, negative error code otherwise. 
+ */ +static int dbgfs_set_targets(struct damon_ctx *ctx, + unsigned long *ids, ssize_t nr_ids) +{ + ssize_t i; + struct damon_target *t, *next; + + damon_for_each_target_safe(t, next, ctx) { + if (targetid_is_pid(ctx)) + put_pid((struct pid *)t->id); + damon_destroy_target(t); + } + + for (i = 0; i < nr_ids; i++) { + t = damon_new_target(ids[i]); + if (!t) { + /* The caller should do cleanup of the ids itself */ + damon_for_each_target_safe(t, next, ctx) + damon_destroy_target(t); + if (targetid_is_pid(ctx)) + dbgfs_put_pids(ids, nr_ids); + return -ENOMEM; + } + damon_add_target(ctx, t); + } + + return 0; +} + static ssize_t dbgfs_target_ids_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct damon_ctx *ctx = file->private_data; - struct damon_target *t, *next_t; bool id_is_pid = true; char *kbuf; unsigned long *targets; @@ -407,11 +444,7 @@ static ssize_t dbgfs_target_ids_write(struct file *file, } /* remove previously set targets */ - damon_for_each_target_safe(t, next_t, ctx) { - if (targetid_is_pid(ctx)) - put_pid((struct pid *)t->id); - damon_destroy_target(t); - } + dbgfs_set_targets(ctx, NULL, 0); /* Configure the context for the address space type */ if (id_is_pid) @@ -419,13 +452,9 @@ static ssize_t dbgfs_target_ids_write(struct file *file, else damon_pa_set_primitives(ctx); - ret = damon_set_targets(ctx, targets, nr_targets); - if (ret) { - if (id_is_pid) - dbgfs_put_pids(targets, nr_targets); - } else { + ret = dbgfs_set_targets(ctx, targets, nr_targets); + if (!ret) ret = count; unlock_out: mutex_unlock(&ctx->kdamond_lock); From bd90901718c305b2f0eee76da325f8ca38ab32b4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 27 Jan 2022 16:26:07 +1100 Subject: [PATCH 114/154] mm/damon: remove the target id concept DAMON asks each monitoring target ('struct damon_target') to have one 'unsigned long' integer called 'id', which should be unique among the targets of the same monitoring context. Its meaning is, however, totally up to the monitoring primitives that are registered to the monitoring context. For example, the virtual address spaces monitoring primitives treat the id as a 'struct pid' pointer. This makes the code flexible, but ugly, not well-documented, and type-unsafe[1]. Also, identification of each target can be done via its index. For this reason, this commit removes the concept and uses a clear type definition. For now, only the 'struct pid' pointer is used for the virtual address spaces monitoring. If DAMON is extended in the future so that we need to put another identifier field in the struct, we will use a union for such primitives-dependent fields and document which primitives are using which type. [1] https://lore.kernel.org/linux-mm/20211013154535.4aaeaaf9d0182922e405dd1e@linux-foundation.org/ Link: https://lkml.kernel.org/r/20211230100723.2238-5-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/damon.h | 11 ++- mm/damon/core-test.h | 18 +++-- mm/damon/core.c | 4 +- mm/damon/dbgfs-test.h | 63 ++++++----------- mm/damon/dbgfs.c | 152 +++++++++++++++++++++++++----------------- mm/damon/reclaim.c | 3 +- mm/damon/vaddr-test.h | 6 +- mm/damon/vaddr.c | 4 +- 8 files changed, 133 insertions(+), 128 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index bd021af5db3d1..7c1d915b35875 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -60,19 +60,18 @@ struct damon_region { /** * struct damon_target - Represents a monitoring target.
- * @id: Unique identifier for this target. + * @pid: The PID of the virtual address space to monitor. * @nr_regions: Number of monitoring target regions of this target. * @regions_list: Head of the monitoring target regions of this target. * @list: List head for siblings. * * Each monitoring context could have multiple targets. For example, a context * for virtual memory address spaces could have multiple target processes. The - * @id of each target should be unique among the targets of the context. For - * example, in the virtual address monitoring context, it could be a pidfd or - * an address of an mm_struct. + * @pid should be set for appropriate address space monitoring primitives + * including the virtual address spaces monitoring primitives. */ struct damon_target { - unsigned long id; + struct pid *pid; unsigned int nr_regions; struct list_head regions_list; struct list_head list; @@ -475,7 +474,7 @@ struct damos *damon_new_scheme( void damon_add_scheme(struct damon_ctx *ctx, struct damos *s); void damon_destroy_scheme(struct damos *s); -struct damon_target *damon_new_target(unsigned long id); +struct damon_target *damon_new_target(void); void damon_add_target(struct damon_ctx *ctx, struct damon_target *t); bool damon_targets_empty(struct damon_ctx *ctx); void damon_free_target(struct damon_target *t); diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h index 4a6141ddd6fcf..b4085deb9fa05 100644 --- a/mm/damon/core-test.h +++ b/mm/damon/core-test.h @@ -24,7 +24,7 @@ static void damon_test_regions(struct kunit *test) KUNIT_EXPECT_EQ(test, 2ul, r->ar.end); KUNIT_EXPECT_EQ(test, 0u, r->nr_accesses); - t = damon_new_target(42); + t = damon_new_target(); KUNIT_EXPECT_EQ(test, 0u, damon_nr_regions(t)); damon_add_region(r, t); @@ -52,8 +52,7 @@ static void damon_test_target(struct kunit *test) struct damon_ctx *c = damon_new_ctx(); struct damon_target *t; - t = damon_new_target(42); - KUNIT_EXPECT_EQ(test, 42ul, t->id); + t = damon_new_target(); KUNIT_EXPECT_EQ(test, 0u, nr_damon_targets(c)); damon_add_target(c, t); @@ -78,7 +77,6 @@ static void damon_test_target(struct kunit *test) static void damon_test_aggregate(struct kunit *test) { struct damon_ctx *ctx = damon_new_ctx(); - unsigned long target_ids[] = {1, 2, 3}; unsigned long saddr[][3] = {{10, 20, 30}, {5, 42, 49}, {13, 33, 55} }; unsigned long eaddr[][3] = {{15, 27, 40}, {31, 45, 55}, {23, 44, 66} }; unsigned long accesses[][3] = {{42, 95, 84}, {10, 20, 30}, {0, 1, 2} }; @@ -87,7 +85,7 @@ static void damon_test_aggregate(struct kunit *test) int it, ir; for (it = 0; it < 3; it++) { - t = damon_new_target(target_ids[it]); + t = damon_new_target(); damon_add_target(ctx, t); } @@ -125,7 +123,7 @@ static void damon_test_split_at(struct kunit *test) struct damon_target *t; struct damon_region *r; - t = damon_new_target(42); + t = damon_new_target(); r = damon_new_region(0, 100); damon_add_region(r, t); damon_split_region_at(c, t, r, 25); @@ -146,7 +144,7 @@ static void damon_test_merge_two(struct kunit *test) struct damon_region *r, *r2, *r3; int i; - t = damon_new_target(42); + t = damon_new_target(); r = damon_new_region(0, 100); r->nr_accesses = 10; damon_add_region(r, t); @@ -194,7 +192,7 @@ static void damon_test_merge_regions_of(struct kunit *test) unsigned long eaddrs[] = {112, 130, 156, 170, 230}; int i; - t = damon_new_target(42); + t = damon_new_target(); for (i = 0; i < ARRAY_SIZE(sa); i++) { r = damon_new_region(sa[i], ea[i]); r->nr_accesses = nrs[i]; @@ -218,14 +216,14 @@ static void damon_test_split_regions_of(struct 
kunit *test) struct damon_target *t; struct damon_region *r; - t = damon_new_target(42); + t = damon_new_target(); r = damon_new_region(0, 22); damon_add_region(r, t); damon_split_regions_of(c, t, 2); KUNIT_EXPECT_LE(test, damon_nr_regions(t), 2u); damon_free_target(t); - t = damon_new_target(42); + t = damon_new_target(); r = damon_new_region(0, 220); damon_add_region(r, t); damon_split_regions_of(c, t, 4); diff --git a/mm/damon/core.c b/mm/damon/core.c index 3fef5c667a31d..bf495236d741b 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -144,7 +144,7 @@ void damon_destroy_scheme(struct damos *s) * * Returns the pointer to the new struct if success, or NULL otherwise */ -struct damon_target *damon_new_target(unsigned long id) +struct damon_target *damon_new_target(void) { struct damon_target *t; @@ -152,7 +152,7 @@ struct damon_target *damon_new_target(unsigned long id) if (!t) return NULL; - t->id = id; + t->pid = NULL; t->nr_regions = 0; INIT_LIST_HEAD(&t->regions_list); diff --git a/mm/damon/dbgfs-test.h b/mm/damon/dbgfs-test.h index c1c988b607bc9..0d3a14c00acfb 100644 --- a/mm/damon/dbgfs-test.h +++ b/mm/damon/dbgfs-test.h @@ -12,66 +12,58 @@ #include -static void damon_dbgfs_test_str_to_target_ids(struct kunit *test) +static void damon_dbgfs_test_str_to_ints(struct kunit *test) { char *question; - unsigned long *answers; - unsigned long expected[] = {12, 35, 46}; + int *answers; + int expected[] = {12, 35, 46}; ssize_t nr_integers = 0, i; question = "123"; - answers = str_to_target_ids(question, strlen(question), - &nr_integers); + answers = str_to_ints(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers); - KUNIT_EXPECT_EQ(test, 123ul, answers[0]); + KUNIT_EXPECT_EQ(test, 123, answers[0]); kfree(answers); question = "123abc"; - answers = str_to_target_ids(question, strlen(question), - &nr_integers); + answers = str_to_ints(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers); - KUNIT_EXPECT_EQ(test, 123ul, answers[0]); + KUNIT_EXPECT_EQ(test, 123, answers[0]); kfree(answers); question = "a123"; - answers = str_to_target_ids(question, strlen(question), - &nr_integers); + answers = str_to_ints(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); kfree(answers); question = "12 35"; - answers = str_to_target_ids(question, strlen(question), - &nr_integers); + answers = str_to_ints(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)2, nr_integers); for (i = 0; i < nr_integers; i++) KUNIT_EXPECT_EQ(test, expected[i], answers[i]); kfree(answers); question = "12 35 46"; - answers = str_to_target_ids(question, strlen(question), - &nr_integers); + answers = str_to_ints(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)3, nr_integers); for (i = 0; i < nr_integers; i++) KUNIT_EXPECT_EQ(test, expected[i], answers[i]); kfree(answers); question = "12 35 abc 46"; - answers = str_to_target_ids(question, strlen(question), - &nr_integers); + answers = str_to_ints(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)2, nr_integers); for (i = 0; i < 2; i++) KUNIT_EXPECT_EQ(test, expected[i], answers[i]); kfree(answers); question = ""; - answers = str_to_target_ids(question, strlen(question), - &nr_integers); + answers = str_to_ints(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); kfree(answers); question = "\n"; - answers = str_to_target_ids(question, strlen(question), - 
&nr_integers); + answers = str_to_ints(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); kfree(answers); } @@ -79,30 +71,20 @@ static void damon_dbgfs_test_str_to_target_ids(struct kunit *test) static void damon_dbgfs_test_set_targets(struct kunit *test) { struct damon_ctx *ctx = dbgfs_new_ctx(); - unsigned long ids[] = {1, 2, 3}; char buf[64]; - /* Make DAMON consider target id as plain number */ - ctx->primitive.target_valid = NULL; - ctx->primitive.cleanup = NULL; + /* Make DAMON consider target has no pid */ + ctx->primitive = (struct damon_primitive){}; - dbgfs_set_targets(ctx, ids, 3); - sprint_target_ids(ctx, buf, 64); - KUNIT_EXPECT_STREQ(test, (char *)buf, "1 2 3\n"); - - dbgfs_set_targets(ctx, NULL, 0); + dbgfs_set_targets(ctx, 0, NULL); sprint_target_ids(ctx, buf, 64); KUNIT_EXPECT_STREQ(test, (char *)buf, "\n"); - dbgfs_set_targets(ctx, (unsigned long []){1, 2}, 2); - sprint_target_ids(ctx, buf, 64); - KUNIT_EXPECT_STREQ(test, (char *)buf, "1 2\n"); - - dbgfs_set_targets(ctx, (unsigned long []){2}, 1); + dbgfs_set_targets(ctx, 1, NULL); sprint_target_ids(ctx, buf, 64); - KUNIT_EXPECT_STREQ(test, (char *)buf, "2\n"); + KUNIT_EXPECT_STREQ(test, (char *)buf, "42\n"); - dbgfs_set_targets(ctx, NULL, 0); + dbgfs_set_targets(ctx, 0, NULL); sprint_target_ids(ctx, buf, 64); KUNIT_EXPECT_STREQ(test, (char *)buf, "\n"); @@ -112,7 +94,6 @@ static void damon_dbgfs_test_set_targets(struct kunit *test) static void damon_dbgfs_test_set_init_regions(struct kunit *test) { struct damon_ctx *ctx = damon_new_ctx(); - unsigned long ids[] = {1, 2, 3}; /* Each line represents one region in `` `` */ char * const valid_inputs[] = {"1 10 20\n 1 20 30\n1 35 45", "1 10 20\n", @@ -130,7 +111,7 @@ static void damon_dbgfs_test_set_init_regions(struct kunit *test) int i, rc; char buf[256]; - dbgfs_set_targets(ctx, ids, 3); + dbgfs_set_targets(ctx, 3, NULL); /* Put valid inputs and check the results */ for (i = 0; i < ARRAY_SIZE(valid_inputs); i++) { @@ -158,12 +139,12 @@ static void damon_dbgfs_test_set_init_regions(struct kunit *test) KUNIT_EXPECT_STREQ(test, (char *)buf, ""); } - dbgfs_set_targets(ctx, NULL, 0); + dbgfs_set_targets(ctx, 0, NULL); damon_destroy_ctx(ctx); } static struct kunit_case damon_test_cases[] = { - KUNIT_CASE(damon_dbgfs_test_str_to_target_ids), + KUNIT_CASE(damon_dbgfs_test_str_to_ints), KUNIT_CASE(damon_dbgfs_test_set_targets), KUNIT_CASE(damon_dbgfs_test_set_init_regions), {}, diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 58867b9666350..78ff645433c64 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -275,7 +275,7 @@ static ssize_t dbgfs_schemes_write(struct file *file, const char __user *buf, return ret; } -static inline bool targetid_is_pid(const struct damon_ctx *ctx) +static inline bool target_has_pid(const struct damon_ctx *ctx) { return ctx->primitive.target_valid == damon_va_target_valid; } @@ -283,17 +283,19 @@ static inline bool targetid_is_pid(const struct damon_ctx *ctx) static ssize_t sprint_target_ids(struct damon_ctx *ctx, char *buf, ssize_t len) { struct damon_target *t; - unsigned long id; + int id; int written = 0; int rc; damon_for_each_target(t, ctx) { - id = t->id; - if (targetid_is_pid(ctx)) + if (target_has_pid(ctx)) /* Show pid numbers to debugfs users */ - id = (unsigned long)pid_vnr((struct pid *)id); + id = pid_vnr(t->pid); + else + /* Show 42 for physical address space, just for fun */ + id = 42; - rc = scnprintf(&buf[written], len - written, "%lu ", id); + rc = scnprintf(&buf[written], len - written, 
"%d ", id); if (!rc) return -ENOMEM; written += rc; @@ -321,75 +323,114 @@ static ssize_t dbgfs_target_ids_read(struct file *file, } /* - * Converts a string into an array of unsigned long integers + * Converts a string into an integers array * - * Returns an array of unsigned long integers if the conversion success, or - * NULL otherwise. + * Returns an array of integers array if the conversion success, or NULL + * otherwise. */ -static unsigned long *str_to_target_ids(const char *str, ssize_t len, - ssize_t *nr_ids) +static int *str_to_ints(const char *str, ssize_t len, ssize_t *nr_ints) { - unsigned long *ids; - const int max_nr_ids = 32; - unsigned long id; + int *array; + const int max_nr_ints = 32; + int nr; int pos = 0, parsed, ret; - *nr_ids = 0; - ids = kmalloc_array(max_nr_ids, sizeof(id), GFP_KERNEL); - if (!ids) + *nr_ints = 0; + array = kmalloc_array(max_nr_ints, sizeof(*array), GFP_KERNEL); + if (!array) return NULL; - while (*nr_ids < max_nr_ids && pos < len) { - ret = sscanf(&str[pos], "%lu%n", &id, &parsed); + while (*nr_ints < max_nr_ints && pos < len) { + ret = sscanf(&str[pos], "%d%n", &nr, &parsed); pos += parsed; if (ret != 1) break; - ids[*nr_ids] = id; - *nr_ids += 1; + array[*nr_ints] = nr; + *nr_ints += 1; } - return ids; + return array; } -static void dbgfs_put_pids(unsigned long *ids, int nr_ids) +static void dbgfs_put_pids(struct pid **pids, int nr_pids) { int i; - for (i = 0; i < nr_ids; i++) - put_pid((struct pid *)ids[i]); + for (i = 0; i < nr_pids; i++) + put_pid(pids[i]); +} + +/* + * Converts a string into an struct pid pointers array + * + * Returns an array of struct pid pointers if the conversion success, or NULL + * otherwise. + */ +static struct pid **str_to_pids(const char *str, ssize_t len, ssize_t *nr_pids) +{ + int *ints; + ssize_t nr_ints; + struct pid **pids; + + *nr_pids = 0; + + ints = str_to_ints(str, len, &nr_ints); + if (!ints) + return NULL; + + pids = kmalloc_array(nr_ints, sizeof(*pids), GFP_KERNEL); + if (!pids) + goto out; + + for (; *nr_pids < nr_ints; (*nr_pids)++) { + pids[*nr_pids] = find_get_pid(ints[*nr_pids]); + if (!pids[*nr_pids]) { + dbgfs_put_pids(pids, *nr_pids); + kfree(ints); + kfree(pids); + return NULL; + } + } + +out: + kfree(ints); + return pids; } /* * dbgfs_set_targets() - Set monitoring targets. * @ctx: monitoring context - * @ids: array of target ids - * @nr_ids: number of entries in @ids + * @nr_targets: number of targets + * @pids: array of target pids (size is same to @nr_targets) * - * This function should not be called while the kdamond is running. + * This function should not be called while the kdamond is running. @pids is + * ignored if the context is not configured to have pid in each target. On + * failure, reference counts of all pids in @pids are decremented. * * Return: 0 on success, negative error code otherwise. 
*/ -static int dbgfs_set_targets(struct damon_ctx *ctx, - unsigned long *ids, ssize_t nr_ids) +static int dbgfs_set_targets(struct damon_ctx *ctx, ssize_t nr_targets, + struct pid **pids) { ssize_t i; struct damon_target *t, *next; damon_for_each_target_safe(t, next, ctx) { - if (targetid_is_pid(ctx)) - put_pid((struct pid *)t->id); + if (target_has_pid(ctx)) + put_pid(t->pid); damon_destroy_target(t); } - for (i = 0; i < nr_ids; i++) { - t = damon_new_target(ids[i]); + for (i = 0; i < nr_targets; i++) { + t = damon_new_target(); if (!t) { - /* The caller should do cleanup of the ids itself */ damon_for_each_target_safe(t, next, ctx) damon_destroy_target(t); - if (targetid_is_pid(ctx)) - dbgfs_put_pids(ids, nr_ids); + if (target_has_pid(ctx)) + dbgfs_put_pids(pids, nr_targets); return -ENOMEM; } + if (target_has_pid(ctx)) + t->pid = pids[i]; damon_add_target(ctx, t); } @@ -402,10 +443,9 @@ static ssize_t dbgfs_target_ids_write(struct file *file, struct damon_ctx *ctx = file->private_data; bool id_is_pid = true; char *kbuf; - unsigned long *targets; + struct pid **target_pids = NULL; ssize_t nr_targets; ssize_t ret; - int i; kbuf = user_input_str(buf, count, ppos); if (IS_ERR(kbuf)) @@ -413,38 +453,27 @@ static ssize_t dbgfs_target_ids_write(struct file *file, if (!strncmp(kbuf, "paddr\n", count)) { id_is_pid = false; - /* target id is meaningless here, but we set it just for fun */ - scnprintf(kbuf, count, "42 "); - } - - targets = str_to_target_ids(kbuf, count, &nr_targets); - if (!targets) { - ret = -ENOMEM; - goto out; + nr_targets = 1; } if (id_is_pid) { - for (i = 0; i < nr_targets; i++) { - targets[i] = (unsigned long)find_get_pid( - (int)targets[i]); - if (!targets[i]) { - dbgfs_put_pids(targets, i); - ret = -EINVAL; - goto free_targets_out; - } + target_pids = str_to_pids(kbuf, count, &nr_targets); + if (!target_pids) { + ret = -ENOMEM; + goto out; } } mutex_lock(&ctx->kdamond_lock); if (ctx->kdamond) { if (id_is_pid) - dbgfs_put_pids(targets, nr_targets); + dbgfs_put_pids(target_pids, nr_targets); ret = -EBUSY; goto unlock_out; } /* remove previously set targets */ - dbgfs_set_targets(ctx, NULL, 0); + dbgfs_set_targets(ctx, 0, NULL); /* Configure the context for the address space type */ if (id_is_pid) @@ -452,14 +481,13 @@ static ssize_t dbgfs_target_ids_write(struct file *file, else damon_pa_set_primitives(ctx); - ret = dbgfs_set_targets(ctx, targets, nr_targets); + ret = dbgfs_set_targets(ctx, nr_targets, target_pids); if (!ret) ret = count; unlock_out: mutex_unlock(&ctx->kdamond_lock); -free_targets_out: - kfree(targets); + kfree(target_pids); out: kfree(kbuf); return ret; @@ -688,12 +716,12 @@ static void dbgfs_before_terminate(struct damon_ctx *ctx) { struct damon_target *t, *next; - if (!targetid_is_pid(ctx)) + if (!target_has_pid(ctx)) return; mutex_lock(&ctx->kdamond_lock); damon_for_each_target_safe(t, next, ctx) { - put_pid((struct pid *)t->id); + put_pid(t->pid); damon_destroy_target(t); } mutex_unlock(&ctx->kdamond_lock); diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index bc476cef688e8..29da37192e4a0 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -387,8 +387,7 @@ static int __init damon_reclaim_init(void) damon_pa_set_primitives(ctx); ctx->callback.after_aggregation = damon_reclaim_after_aggregation; - /* 4242 means nothing but fun */ - target = damon_new_target(4242); + target = damon_new_target(); if (!target) { damon_destroy_ctx(ctx); return -ENOMEM; diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h index 6a1b9272ea123..f0d0ba591792c 
100644 --- a/mm/damon/vaddr-test.h +++ b/mm/damon/vaddr-test.h @@ -139,7 +139,7 @@ static void damon_do_test_apply_three_regions(struct kunit *test, struct damon_region *r; int i; - t = damon_new_target(42); + t = damon_new_target(); for (i = 0; i < nr_regions / 2; i++) { r = damon_new_region(regions[i * 2], regions[i * 2 + 1]); damon_add_region(r, t); @@ -251,7 +251,7 @@ static void damon_test_apply_three_regions4(struct kunit *test) static void damon_test_split_evenly_fail(struct kunit *test, unsigned long start, unsigned long end, unsigned int nr_pieces) { - struct damon_target *t = damon_new_target(42); + struct damon_target *t = damon_new_target(); struct damon_region *r = damon_new_region(start, end); damon_add_region(r, t); @@ -270,7 +270,7 @@ static void damon_test_split_evenly_fail(struct kunit *test, static void damon_test_split_evenly_succ(struct kunit *test, unsigned long start, unsigned long end, unsigned int nr_pieces) { - struct damon_target *t = damon_new_target(42); + struct damon_target *t = damon_new_target(); struct damon_region *r = damon_new_region(start, end); unsigned long expected_width = (end - start) / nr_pieces; unsigned long i = 0; diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 89b6468da2b9b..f98edb90a873c 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -23,12 +23,12 @@ #endif /* - * 't->id' should be the pointer to the relevant 'struct pid' having reference + * 't->pid' should be the pointer to the relevant 'struct pid' having reference * count. Caller must put the returned task, unless it is NULL. */ static inline struct task_struct *damon_get_task_struct(struct damon_target *t) { - return get_pid_task((struct pid *)t->id, PIDTYPE_PID); + return get_pid_task(t->pid, PIDTYPE_PID); } /* From 98aae7ec99f31a4f9304897666c6789ab4786e76 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 27 Jan 2022 16:26:07 +1100 Subject: [PATCH 115/154] mm/damon: remove redundant page validation pte_page() will never return a NULL page, as discussed in thread [1], so remove the redundant page validation to fix the Smatch static checker warning below. mm/damon/vaddr.c:405 damon_hugetlb_mkold() warn: 'page' can't be NULL. [1] https://lore.kernel.org/linux-mm/20220106091200.GA14564@kili/ Link: https://lkml.kernel.org/r/6d32f7d201b8970d53f51b6c5717d472aed2987c.1642386715.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reported-by: Dan Carpenter Reviewed-by: SeongJae Park Acked-by: David Rientjes Acked-by: Souptick Joarder Reviewed-by: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/damon/vaddr.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index f98edb90a873c..6d3454dd3204b 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -402,9 +402,6 @@ static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm, pte_t entry = huge_ptep_get(pte); struct page *page = pte_page(entry); - if (!page) - return; - get_page(page); if (pte_young(entry)) { @@ -564,9 +561,6 @@ static int damon_young_hugetlb_entry(pte_t *pte, unsigned long hmask, goto out; page = pte_page(entry); - if (!page) - goto out; - get_page(page); if (pte_young(entry) || !page_is_idle(page) || From 7d9d26e8134764df7314b030bc3c3faa8224fcc7 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 27 Jan 2022 16:26:07 +1100 Subject: [PATCH 116/154] fs/buffer.c: add debug print for __getblk_gfp() stall problem Among syzbot's unresolved hung task reports, 18 out of 65 reports contain the __getblk_gfp() line in the backtrace. Since there is a comment block saying that __getblk_gfp() will lock up the machine if the try_to_free_buffers() attempt from grow_dev_page() keeps failing, let's start by checking whether syzbot is hitting that case.
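The patch instruments the retry loop with a per-task bitmask so the eventual stall report shows which paths were taken; a condensed sketch of the pattern (illustrative only; try_fast_path(), try_slow_path() and done() are hypothetical stand-ins for the grow_buffers()/try_to_free_buffers() paths):

	unsigned int executed = 0;
	unsigned long stamp = jiffies;

	for (;;) {
		if (try_fast_path())
			executed |= 0x01;	/* record which branch ran */
		else if (!try_slow_path())
			executed |= 0x02;
		if (done())
			break;
		if (time_after(jiffies, stamp + 3 * HZ)) {
			pr_err("stall: executed=%x\n", executed);
			executed = 0;		/* reset for the next report */
			stamp = jiffies;
		}
	}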
This change will be removed after the bug is fixed. Link: http://lkml.kernel.org/r/9b9fcdda-c347-53ee-fdbb-8a7d11cf430e@I-love.SAKURA.ne.jp Signed-off-by: Tetsuo Handa Cc: Dmitry Vyukov Cc: Al Viro Cc: Mel Gorman Cc: Michal Hocko Cc: Andi Kleen Cc: Jan Kara Cc: Jeff Layton Cc: Tim Chen Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/buffer.c | 50 +++++++++++++++++++++++++++++++++++++++++-- include/linux/sched.h | 7 ++++++ lib/Kconfig.debug | 6 ++++++ 3 files changed, 61 insertions(+), 2 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 8e112b6bd3719..a427edf2664c6 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -956,10 +956,20 @@ grow_dev_page(struct block_device *bdev, sector_t block, end_block = init_page_buffers(page, bdev, (sector_t)index << sizebits, size); +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x01; +#endif goto done; } - if (!try_to_free_buffers(page)) + if (!try_to_free_buffers(page)) { +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x02; +#endif goto failed; + } +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x04; +#endif } /* @@ -979,6 +989,9 @@ grow_dev_page(struct block_device *bdev, sector_t block, spin_unlock(&inode->i_mapping->private_lock); done: ret = (block < end_block) ? 1 : -ENXIO; +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x08; +#endif failed: unlock_page(page); put_page(page); @@ -1030,6 +1043,12 @@ __getblk_slow(struct block_device *bdev, sector_t block, return NULL; } +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_stamp = jiffies; + current->getblk_executed = 0; + current->getblk_bh_count = 0; + current->getblk_bh_state = 0; +#endif for (;;) { struct buffer_head *bh; int ret; @@ -1041,6 +1060,18 @@ __getblk_slow(struct block_device *bdev, sector_t block, ret = grow_buffers(bdev, block, size, gfp); if (ret < 0) return NULL; + +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + if (!time_after(jiffies, current->getblk_stamp + 3 * HZ)) + continue; + printk(KERN_ERR "%s(%u): getblk(): executed=%x bh_count=%d bh_state=%lx\n", + current->comm, current->pid, current->getblk_executed, + current->getblk_bh_count, current->getblk_bh_state); + current->getblk_executed = 0; + current->getblk_bh_count = 0; + current->getblk_bh_state = 0; + current->getblk_stamp = jiffies; +#endif } } @@ -3192,6 +3223,11 @@ EXPORT_SYMBOL(sync_dirty_buffer); */ static inline int buffer_busy(struct buffer_head *bh) { +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x80; + current->getblk_bh_count = atomic_read(&bh->b_count); + current->getblk_bh_state = bh->b_state; +#endif return atomic_read(&bh->b_count) | (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock))); } @@ -3230,11 +3266,18 @@ int try_to_free_buffers(struct page *page) int ret = 0; BUG_ON(!PageLocked(page)); - if (PageWriteback(page)) + if (PageWriteback(page)) { +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x10; +#endif return 0; + } if (mapping == NULL) { /* can this still happen?
*/ ret = drop_buffers(page, &buffers_to_free); +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x20; +#endif goto out; } @@ -3258,6 +3301,9 @@ int try_to_free_buffers(struct page *page) if (ret) cancel_dirty_page(page); spin_unlock(&mapping->private_lock); +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x40; +#endif out: if (buffers_to_free) { struct buffer_head *bh = buffers_to_free; diff --git a/include/linux/sched.h b/include/linux/sched.h index f5b2be39a78cb..04a990004d75d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1492,6 +1492,13 @@ struct task_struct { struct callback_head l1d_flush_kill; #endif +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + unsigned long getblk_stamp; + unsigned int getblk_executed; + unsigned int getblk_bh_count; + unsigned long getblk_bh_state; +#endif + /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 14b89aa37c5c9..f8319dbd76283 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1789,6 +1789,12 @@ config IO_STRICT_DEVMEM menu "$(SRCARCH) Debugging" +config DEBUG_AID_FOR_SYZBOT + bool "Additional debug code for syzbot" + default n + help + This option is intended for testing by syzbot. + source "arch/$(SRCARCH)/Kconfig.debug" endmenu From b3c3defe6bd665762d78187c0a964fa1cc1f20be Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 27 Jan 2022 16:26:08 +1100 Subject: [PATCH 117/154] fs/buffer.c: dump more info for __getblk_gfp() stall problem We need to dump more variables on top of "fs/buffer.c: add debug print for __getblk_gfp() stall problem". Link: http://lkml.kernel.org/r/12239545-7d8a-820f-48ba-952e2e98a05c@i-love.sakura.ne.jp Signed-off-by: Tetsuo Handa Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/buffer.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index a427edf2664c6..54fd7c94fdb24 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1064,9 +1064,15 @@ __getblk_slow(struct block_device *bdev, sector_t block, #ifdef CONFIG_DEBUG_AID_FOR_SYZBOT if (!time_after(jiffies, current->getblk_stamp + 3 * HZ)) continue; - printk(KERN_ERR "%s(%u): getblk(): executed=%x bh_count=%d bh_state=%lx\n", + printk(KERN_ERR "%s(%u): getblk(): executed=%x bh_count=%d bh_state=%lx bdev_super_blocksize=%ld size=%u bdev_super_blocksize_bits=%d bdev_inode_blkbits=%d\n", current->comm, current->pid, current->getblk_executed, - current->getblk_bh_count, current->getblk_bh_state); + current->getblk_bh_count, current->getblk_bh_state, + IS_ERR_OR_NULL(bdev->bd_super) ? -1L : + bdev->bd_super->s_blocksize, size, + IS_ERR_OR_NULL(bdev->bd_super) ? -1 : + bdev->bd_super->s_blocksize_bits, + IS_ERR_OR_NULL(bdev->bd_inode) ? -1 : + bdev->bd_inode->i_blkbits); current->getblk_executed = 0; current->getblk_bh_count = 0; current->getblk_bh_state = 0; From 7f204d5498897b90b2354a71db823772b015d2cc Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 27 Jan 2022 16:26:08 +1100 Subject: [PATCH 118/154] kernel/hung_task.c: Monitor killed tasks. syzbot's current top report is "no output from test machine" where the userspace process failed to spawn a new test process for 300 seconds for some reason. One of the reasons which can result in this report is that an already spawned test process was unable to terminate (e.g. trapped in an unkillable retry loop due to some bug) after SIGKILL was sent to that process. Therefore, reporting when a thread fails to terminate despite a pending fatal signal would give us more useful information.
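Condensed, the check this patch adds boils down to the following sketch (illustrative only; the real code below also handles frozen tasks and timestamp initialization):

	/* t has a fatal signal pending but has not reached do_exit() yet */
	if (fatal_signal_pending(t) && !(t->flags & PF_EXITING) &&
	    time_after(jiffies, t->killed_time + timeout * HZ)) {
		pr_err("INFO: task %s:%d can't die for more than %ld seconds.\n",
		       t->comm, t->pid, (jiffies - t->killed_time) / HZ);
		sched_show_task(t);
	}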
In the context of syzbot's testing, where there are only 2 CPUs in the target VM (which means only a small number of threads and not much memory) and threads get SIGKILL after 5 seconds from fork(), being unable to reach do_exit() within 10 seconds is likely a sign that something went wrong. Therefore, I would like to try this patch in linux-next.git to test whether it helps find more bugs and reproducers for such bugs, by bringing "unable to terminate threads" reports out of "no output from test machine" reports. A potential bad effect of this patch is that kernel code becomes killable without the root cause of being unable to terminate getting addressed, because use of a killable wait bypasses both the TASK_UNINTERRUPTIBLE stall test and the SIGKILL-after-5-seconds behavior; this would result in failing to detect such problems on real systems, where SIGKILL is not sent after 5 seconds when something goes wrong. This version shares the existing sysctl settings (e.g. check interval, timeout, whether to panic) used for detecting TASK_UNINTERRUPTIBLE threads. We will likely want to use different sysctl settings for monitoring killed threads. But let's start as a linux-next.git patch without introducing new sysctl settings. We can add sysctl settings before sending to linux.git. Link: http://lkml.kernel.org/r/60d1d7f6-b201-3dcb-a51b-76a31bcfa919@i-love.sakura.ne.jp Signed-off-by: Tetsuo Handa Cc: Dmitry Vyukov Cc: Petr Mladek Cc: Ingo Molnar Cc: Peter Zijlstra Cc: "Paul E. McKenney" Cc: Vitaly Kuznetsov Cc: Liu Chuansheng Cc: Valdis Kletnieks Cc: linux-kernel@vger.kernel.org Cc: Dmitry Vyukov Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/sched.h | 1 + kernel/hung_task.c | 44 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index 04a990004d75d..f1a969a71ebb4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1065,6 +1065,7 @@ struct task_struct { #ifdef CONFIG_DETECT_HUNG_TASK unsigned long last_switch_count; unsigned long last_switch_time; + unsigned long killed_time; #endif /* Filesystem information: */ struct fs_struct *fs; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 52501e5f76554..40220dfd6fa93 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -147,6 +147,47 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) touch_nmi_watchdog(); } +static void check_killed_task(struct task_struct *t, unsigned long timeout) +{ + unsigned long stamp = t->killed_time; + + /* + * Ensure the task is not frozen. + * Also, skip vfork and any other user process that freezer should skip. + */ + if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP))) + return; + /* + * Skip threads which are already inside do_exit(), for exit_mm() etc. + * might take many seconds. + */ + if (t->flags & PF_EXITING) + return; + if (!stamp) { + stamp = jiffies; + if (!stamp) + stamp++; + t->killed_time = stamp; + return; + } + if (time_is_after_jiffies(stamp + timeout * HZ)) + return; + trace_sched_process_hang(t); + if (sysctl_hung_task_panic) { + console_verbose(); + hung_task_call_panic = true; + } + /* + * This thread failed to terminate for more than + * sysctl_hung_task_timeout_secs seconds, complain: + */ + pr_err("INFO: task %s:%d can't die for more than %ld seconds.\n", + t->comm, t->pid, (jiffies - stamp) / HZ); + sched_show_task(t); + hung_task_show_lock = true; + touch_nmi_watchdog(); +} + /* * To avoid extending the RCU grace period for an unbounded amount of time, * periodically exit the critical section and enter a new one. @@ -198,6 +239,9 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) goto unlock; last_break = jiffies; } + /* Check threads which are about to terminate. */ + if (unlikely(fatal_signal_pending(t))) + check_killed_task(t, timeout); /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ if (READ_ONCE(t->__state) == TASK_UNINTERRUPTIBLE) check_hung_task(t, timeout);
From 34148b981c0b118f8f6a60da9fd7d3d18ede2bf8 Mon Sep 17 00:00:00 2001 From: Hao Lee Date: Thu, 27 Jan 2022 16:26:08 +1100 Subject: [PATCH 119/154] proc: alloc PATH_MAX bytes for /proc/${pid}/fd/ symlinks Using __get_free_page() to allocate the path buffer directly is not a standard approach. We'd better use kmalloc() and PATH_MAX. PAGE_SIZE is different on different archs. An unlinked file with a very long canonical pathname will readlink differently because "(deleted)" eats into the buffer. --adobriyan Link: https://lkml.kernel.org/r/Ye1fCxyZZ0I5lgOL@localhost.localdomain Signed-off-by: Hao Lee Signed-off-by: Alexey Dobriyan Cc: Christian Brauner Cc: Kees Cook Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/proc/base.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index d654ce7150fdd..9e4d1e0e3ad79 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1764,25 +1764,25 @@ static const char *proc_pid_get_link(struct dentry *dentry, static int do_proc_readlink(struct path *path, char __user *buffer, int buflen) { - char *tmp = (char *)__get_free_page(GFP_KERNEL); + char *tmp = (char *)kmalloc(PATH_MAX, GFP_KERNEL); char *pathname; int len; if (!tmp) return -ENOMEM; - pathname = d_path(path, tmp, PAGE_SIZE); + pathname = d_path(path, tmp, PATH_MAX); len = PTR_ERR(pathname); if (IS_ERR(pathname)) goto out; - len = tmp + PAGE_SIZE - 1 - pathname; + len = tmp + PATH_MAX - 1 - pathname; if (len > buflen) len = buflen; if (copy_to_user(buffer, pathname, len)) len = -EFAULT; out: - free_page((unsigned long)tmp); + kfree(tmp); return len; }
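The length computation above relies on d_path() placing the string at the end of the buffer and returning a pointer into it; a minimal sketch (illustrative only; 'path' is assumed to be a valid struct path held by the caller):

	char *tmp = kmalloc(PATH_MAX, GFP_KERNEL);
	char *name;

	if (tmp) {
		name = d_path(&path, tmp, PATH_MAX);	/* fills from the end */
		if (!IS_ERR(name))
			/* string runs from 'name' to tmp + PATH_MAX - 1 */
			pr_info("path=%s len=%ld\n", name,
				(long)(tmp + PATH_MAX - 1 - name));
		kfree(tmp);
	}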
__user *buffer, int buflen) { - char *tmp = (char *)kmalloc(PATH_MAX, GFP_KERNEL); + char *tmp = kmalloc(PATH_MAX, GFP_KERNEL); char *pathname; int len; From 5c068e5a35b52ca1e285a56aab2675f7240fceb5 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 27 Jan 2022 16:26:09 +1100 Subject: [PATCH 121/154] proc/vmcore: fix possible deadlock on concurrent mmap and read Lockdep noticed that there is chance for a deadlock if we have concurrent mmap, concurrent read, and the addition/removal of a callback. As nicely explained by Boqun: " Lockdep warned about the above sequences because rw_semaphore is a fair read-write lock, and the following can cause a deadlock: TASK 1 TASK 2 TASK 3 ====== ====== ====== down_write(mmap_lock); down_read(vmcore_cb_rwsem) down_write(vmcore_cb_rwsem); // blocked down_read(vmcore_cb_rwsem); // cannot get the lock because of the fairness down_read(mmap_lock); // blocked IOW, a reader can block another read if there is a writer queued by the second reader and the lock is fair. " To fix, convert to srcu to make this deadlock impossible. We need srcu as our callbacks can sleep. With this change, I cannot trigger any lockdep warnings. [ 6.386519] ====================================================== [ 6.387203] WARNING: possible circular locking dependency detected [ 6.387965] 5.17.0-0.rc0.20220117git0c947b893d69.68.test.fc36.x86_64 #1 Not tainted [ 6.388899] ------------------------------------------------------ [ 6.389657] makedumpfile/542 is trying to acquire lock: [ 6.390308] ffffffff832d2eb8 (vmcore_cb_rwsem){.+.+}-{3:3}, at: mmap_vmcore+0x340/0x580 [ 6.391290] [ 6.391290] but task is already holding lock: [ 6.391978] ffff8880af226438 (&mm->mmap_lock#2){++++}-{3:3}, at: vm_mmap_pgoff+0x84/0x150 [ 6.392898] [ 6.392898] which lock already depends on the new lock. 
[ 6.392898] [ 6.393866] [ 6.393866] the existing dependency chain (in reverse order) is: [ 6.394762] [ 6.394762] -> #1 (&mm->mmap_lock#2){++++}-{3:3}: [ 6.395530] lock_acquire+0xc3/0x1a0 [ 6.396047] __might_fault+0x4e/0x70 [ 6.396562] _copy_to_user+0x1f/0x90 [ 6.397093] __copy_oldmem_page+0x72/0xc0 [ 6.397663] read_from_oldmem+0x77/0x1e0 [ 6.398229] read_vmcore+0x2c2/0x310 [ 6.398742] proc_reg_read+0x47/0xa0 [ 6.399265] vfs_read+0x101/0x340 [ 6.399751] __x64_sys_pread64+0x5d/0xa0 [ 6.400314] do_syscall_64+0x43/0x90 [ 6.400778] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 6.401390] [ 6.401390] -> #0 (vmcore_cb_rwsem){.+.+}-{3:3}: [ 6.402063] validate_chain+0x9f4/0x2670 [ 6.402560] __lock_acquire+0x8f7/0xbc0 [ 6.403054] lock_acquire+0xc3/0x1a0 [ 6.403509] down_read+0x4a/0x140 [ 6.403948] mmap_vmcore+0x340/0x580 [ 6.404403] proc_reg_mmap+0x3e/0x90 [ 6.404866] mmap_region+0x504/0x880 [ 6.405322] do_mmap+0x38a/0x520 [ 6.405744] vm_mmap_pgoff+0xc1/0x150 [ 6.406258] ksys_mmap_pgoff+0x178/0x200 [ 6.406823] do_syscall_64+0x43/0x90 [ 6.407339] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 6.407975] [ 6.407975] other info that might help us debug this: [ 6.407975] [ 6.408945] Possible unsafe locking scenario: [ 6.408945] [ 6.409684] CPU0 CPU1 [ 6.410196] ---- ---- [ 6.410703] lock(&mm->mmap_lock#2); [ 6.411121] lock(vmcore_cb_rwsem); [ 6.411792] lock(&mm->mmap_lock#2); [ 6.412465] lock(vmcore_cb_rwsem); [ 6.412873] [ 6.412873] *** DEADLOCK *** [ 6.412873] [ 6.413522] 1 lock held by makedumpfile/542: [ 6.414006] #0: ffff8880af226438 (&mm->mmap_lock#2){++++}-{3:3}, at: vm_mmap_pgoff+0x84/0x150 [ 6.414944] [ 6.414944] stack backtrace: [ 6.415432] CPU: 0 PID: 542 Comm: makedumpfile Not tainted 5.17.0-0.rc0.20220117git0c947b893d69.68.test.fc36.x86_64 #1 [ 6.416581] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 [ 6.417272] Call Trace: [ 6.417593] [ 6.417882] dump_stack_lvl+0x5d/0x78 [ 6.418346] print_circular_bug+0x5d7/0x5f0 [ 6.418821] ? stack_trace_save+0x3a/0x50 [ 6.419273] ? save_trace+0x3d/0x330 [ 6.419681] check_noncircular+0xd1/0xe0 [ 6.420217] validate_chain+0x9f4/0x2670 [ 6.420715] ? __lock_acquire+0x8f7/0xbc0 [ 6.421234] ? __lock_acquire+0x8f7/0xbc0 [ 6.421685] __lock_acquire+0x8f7/0xbc0 [ 6.422127] lock_acquire+0xc3/0x1a0 [ 6.422535] ? mmap_vmcore+0x340/0x580 [ 6.422965] ? lock_is_held_type+0xe2/0x140 [ 6.423432] ? mmap_vmcore+0x340/0x580 [ 6.423893] down_read+0x4a/0x140 [ 6.424321] ? mmap_vmcore+0x340/0x580 [ 6.424800] mmap_vmcore+0x340/0x580 [ 6.425237] ? vm_area_alloc+0x1c/0x60 [ 6.425661] ? trace_kmem_cache_alloc+0x30/0xe0 [ 6.426174] ? 
kmem_cache_alloc+0x1e0/0x2f0 [ 6.426641] proc_reg_mmap+0x3e/0x90 [ 6.427052] mmap_region+0x504/0x880 [ 6.427462] do_mmap+0x38a/0x520 [ 6.427842] vm_mmap_pgoff+0xc1/0x150 [ 6.428260] ksys_mmap_pgoff+0x178/0x200 [ 6.428701] do_syscall_64+0x43/0x90 [ 6.429126] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 6.429745] RIP: 0033:0x7fc7359b8fc7 [ 6.430157] Code: 00 00 00 89 ef e8 69 b3 ff ff eb e4 e8 c2 64 01 00 66 90 f3 0f 1e fa 41 89 ca 41 f7 c1 ff 0f 00 00 75 10 b8 09 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 21 c3 48 8b 05 21 7e 0e 00 64 c7 00 16 00 00 [ 6.432147] RSP: 002b:00007fff35b4c208 EFLAGS: 00000246 ORIG_RAX: 0000000000000009 [ 6.432970] RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007fc7359b8fc7 [ 6.433746] RDX: 0000000000000001 RSI: 0000000000400000 RDI: 0000000000000000 [ 6.434529] RBP: 000055a1125ecf10 R08: 0000000000000003 R09: 0000000000002000 [ 6.435310] R10: 0000000000000002 R11: 0000000000000246 R12: 0000000000002000 [ 6.436093] R13: 0000000000400000 R14: 000055a1124269e2 R15: 0000000000000000 [ 6.436887] Link: https://lkml.kernel.org/r/20220119193417.100385-1-david@redhat.com Fixes: cc5f2704c934 ("proc/vmcore: convert oldmem_pfn_is_ram callback to more generic vmcore callbacks") Signed-off-by: David Hildenbrand Reported-by: Baoquan He Acked-by: Baoquan He Cc: Vivek Goyal Cc: Dave Young Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Peter Zijlstra Cc: Boqun Feng Cc: Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/proc/vmcore.c | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 702754dd1daff..edeb01dfe05d3 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -62,7 +62,8 @@ core_param(novmcoredd, vmcoredd_disabled, bool, 0); /* Device Dump Size */ static size_t vmcoredd_orig_sz; -static DECLARE_RWSEM(vmcore_cb_rwsem); +static DEFINE_SPINLOCK(vmcore_cb_lock); +DEFINE_STATIC_SRCU(vmcore_cb_srcu); /* List of registered vmcore callbacks. */ static LIST_HEAD(vmcore_cb_list); /* Whether the vmcore has been opened once. 
*/ @@ -70,8 +71,8 @@ static bool vmcore_opened; void register_vmcore_cb(struct vmcore_cb *cb) { - down_write(&vmcore_cb_rwsem); INIT_LIST_HEAD(&cb->next); + spin_lock(&vmcore_cb_lock); list_add_tail(&cb->next, &vmcore_cb_list); /* * Registering a vmcore callback after the vmcore was opened is @@ -79,14 +80,14 @@ void register_vmcore_cb(struct vmcore_cb *cb) */ if (vmcore_opened) pr_warn_once("Unexpected vmcore callback registration\n"); - up_write(&vmcore_cb_rwsem); + spin_unlock(&vmcore_cb_lock); } EXPORT_SYMBOL_GPL(register_vmcore_cb); void unregister_vmcore_cb(struct vmcore_cb *cb) { - down_write(&vmcore_cb_rwsem); - list_del(&cb->next); + spin_lock(&vmcore_cb_lock); + list_del_rcu(&cb->next); /* * Unregistering a vmcore callback after the vmcore was opened is * very unusual (e.g., forced driver removal), but we cannot stop @@ -94,7 +95,9 @@ void unregister_vmcore_cb(struct vmcore_cb *cb) */ if (vmcore_opened) pr_warn_once("Unexpected vmcore callback unregistration\n"); - up_write(&vmcore_cb_rwsem); + spin_unlock(&vmcore_cb_lock); + + synchronize_srcu(&vmcore_cb_srcu); } EXPORT_SYMBOL_GPL(unregister_vmcore_cb); @@ -103,9 +106,8 @@ static bool pfn_is_ram(unsigned long pfn) struct vmcore_cb *cb; bool ret = true; - lockdep_assert_held_read(&vmcore_cb_rwsem); - - list_for_each_entry(cb, &vmcore_cb_list, next) { + list_for_each_entry_srcu(cb, &vmcore_cb_list, next, + srcu_read_lock_held(&vmcore_cb_srcu)) { if (unlikely(!cb->pfn_is_ram)) continue; ret = cb->pfn_is_ram(cb, pfn); @@ -118,9 +120,9 @@ static bool pfn_is_ram(unsigned long pfn) static int open_vmcore(struct inode *inode, struct file *file) { - down_read(&vmcore_cb_rwsem); + spin_lock(&vmcore_cb_lock); vmcore_opened = true; - up_read(&vmcore_cb_rwsem); + spin_unlock(&vmcore_cb_lock); return 0; } @@ -133,6 +135,7 @@ ssize_t read_from_oldmem(char *buf, size_t count, unsigned long pfn, offset; size_t nr_bytes; ssize_t read = 0, tmp; + int idx; if (!count) return 0; @@ -140,7 +143,7 @@ ssize_t read_from_oldmem(char *buf, size_t count, offset = (unsigned long)(*ppos % PAGE_SIZE); pfn = (unsigned long)(*ppos / PAGE_SIZE); - down_read(&vmcore_cb_rwsem); + idx = srcu_read_lock(&vmcore_cb_srcu); do { if (count > (PAGE_SIZE - offset)) nr_bytes = PAGE_SIZE - offset; @@ -165,7 +168,7 @@ ssize_t read_from_oldmem(char *buf, size_t count, offset, userbuf); } if (tmp < 0) { - up_read(&vmcore_cb_rwsem); + srcu_read_unlock(&vmcore_cb_srcu, idx); return tmp; } @@ -176,8 +179,8 @@ ssize_t read_from_oldmem(char *buf, size_t count, ++pfn; offset = 0; } while (count); + srcu_read_unlock(&vmcore_cb_srcu, idx); - up_read(&vmcore_cb_rwsem); return read; } @@ -568,18 +571,18 @@ static int vmcore_remap_oldmem_pfn(struct vm_area_struct *vma, unsigned long from, unsigned long pfn, unsigned long size, pgprot_t prot) { - int ret; + int ret, idx; /* - * Check if oldmem_pfn_is_ram was registered to avoid - * looping over all pages without a reason. + * Check if a callback was registered to avoid looping over all + * pages without a reason. 
*/ - down_read(&vmcore_cb_rwsem); + idx = srcu_read_lock(&vmcore_cb_srcu); if (!list_empty(&vmcore_cb_list)) ret = remap_oldmem_pfn_checked(vma, from, pfn, size, prot); else ret = remap_oldmem_pfn_range(vma, from, pfn, size, prot); - up_read(&vmcore_cb_rwsem); + srcu_read_unlock(&vmcore_cb_srcu, idx); return ret; } From de37a47ddb5e9209b81f51042cff8b6d7cc4a4cf Mon Sep 17 00:00:00 2001 From: Julius Hemanth Pitti Date: Thu, 27 Jan 2022 16:26:09 +1100 Subject: [PATCH 122/154] proc/sysctl: make protected_* world readable protected_* files have 600 permissions, which prevents non-superusers from reading them. Containers like "AWS greengrass" refuse to launch unless protected_hardlinks and protected_symlinks are set. When containers like these run with "userns-remap" or "--user" mapping the container's root to a non-superuser on the host, they fail to run due to denied read access to these files. As these protections are hardly a secret and reading them poses no security risk, make them world readable. Though the above greengrass use case needs read access only to the protected_hardlinks and protected_symlinks files, set all other protected_* files to 644 as well to keep things consistent. Link: http://lkml.kernel.org/r/20200709235115.56954-1-jpitti@cisco.com Fixes: 800179c9b8a1 ("fs: add link restrictions") Signed-off-by: Julius Hemanth Pitti Acked-by: Kees Cook Cc: Iurii Zaikin Cc: Luis Chamberlain Cc: Ingo Molnar Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/namei.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index b867a92c078ec..53a1a735d259f 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1031,7 +1031,7 @@ static struct ctl_table namei_sysctls[] = { .procname = "protected_symlinks", .data = &sysctl_protected_symlinks, .maxlen = sizeof(int), - .mode = 0600, + .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, @@ -1040,7 +1040,7 @@ static struct ctl_table namei_sysctls[] = { .procname = "protected_hardlinks", .data = &sysctl_protected_hardlinks, .maxlen = sizeof(int), - .mode = 0600, + .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, @@ -1049,7 +1049,7 @@ static struct ctl_table namei_sysctls[] = { .procname = "protected_fifos", .data = &sysctl_protected_fifos, .maxlen = sizeof(int), - .mode = 0600, + .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, @@ -1058,7 +1058,7 @@ static struct ctl_table namei_sysctls[] = { .procname = "protected_regular", .data = &sysctl_protected_regular, .maxlen = sizeof(int), - .mode = 0600, + .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, From 880f694bd1cef484e92065ea4e736850abce5352 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 27 Jan 2022 16:26:09 +1100 Subject: [PATCH 123/154] Kconfig.debug: make DEBUG_INFO selectable from a choice Currently it's not possible to enable DEBUG_INFO for an all*config build, since it is marked as "depends on !COMPILE_TEST". This generally makes sense because a debug build of an all*config target ends up taking much longer and the output is much larger. Having this be "default off" makes sense. However, there are cases where enabling DEBUG_INFO for such builds is useful for doing treewide A/B comparisons of build options, etc. Make DEBUG_INFO selectable from any of the DWARF version choice options, with DEBUG_INFO_NONE being the default for COMPILE_TEST.
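For illustration, a hypothetical .config fragment (not part of this patch) for an all*config A/B build that opts back into debug info would then look like:

	CONFIG_DEBUG_KERNEL=y
	# CONFIG_DEBUG_INFO_NONE is not set
	CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y
	CONFIG_DEBUG_INFO=y

DEBUG_INFO itself stops being directly user-selectable; it is select-ed by whichever non-"None" choice entry is picked.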
The mutually exclusive relationship between DWARF5 and BTF must be inverted, but the result remains the same. Additionally moves DEBUG_KERNEL and DEBUG_MISC up to the top of the menu because they were enabling features _above_ it, making it weird to navigate menuconfig. Link: https://lkml.kernel.org/r/20220125075126.891825-1-keescook@chromium.org Signed-off-by: Kees Cook Suggested-by: Arnd Bergmann Reviewed-by: Arnd Bergmann Reviewed-by: Nathan Chancellor Reviewed-by: Nick Desaulniers Tested-by: Nick Desaulniers Reviewed-by: Masahiro Yamada Cc: Nick Desaulniers Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- lib/Kconfig.debug | 140 +++++++++++++++++++++++++--------------------- 1 file changed, 75 insertions(+), 65 deletions(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index f8319dbd76283..4807637ec89eb 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -208,20 +208,88 @@ config DEBUG_BUGVERBOSE endmenu # "printk and dmesg options" +config DEBUG_KERNEL + bool "Kernel debugging" + help + Say Y here if you are developing drivers or trying to debug and + identify kernel problems. + +config DEBUG_MISC + bool "Miscellaneous debug code" + default DEBUG_KERNEL + depends on DEBUG_KERNEL + help + Say Y here if you need to enable miscellaneous debug code that should + be under a more specific debug option but isn't. + menu "Compile-time checks and compiler options" config DEBUG_INFO - bool "Compile the kernel with debug info" - depends on DEBUG_KERNEL && !COMPILE_TEST + bool help - If you say Y here the resulting kernel image will include - debugging info resulting in a larger kernel image. + A kernel debug info option other than "None" has been selected + in the "Debug information" choice below, indicating that debug + information will be generated for build targets. + +choice + prompt "Debug information" + depends on DEBUG_KERNEL + default DEBUG_INFO_NONE if COMPILE_TEST + help + Selecting something other than "None" results in a kernel image + that will include debugging info resulting in a larger kernel image. This adds debug symbols to the kernel and modules (gcc -g), and is needed if you intend to use kernel crashdump or binary object tools like crash, kgdb, LKCD, gdb, etc on the kernel. - Say Y here only if you plan to debug the kernel. - If unsure, say N. + Choose which version of DWARF debug info to emit. If unsure, + select "Toolchain default". + +config DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT + bool "Rely on the toolchain's implicit default DWARF version" + select DEBUG_INFO + help + The implicit default version of DWARF debug info produced by a + toolchain changes over time. + + This can break consumers of the debug info that haven't upgraded to + support newer revisions, and prevent testing newer versions, but + those should be less common scenarios. + +config DEBUG_INFO_DWARF4 + bool "Generate DWARF Version 4 debuginfo" + select DEBUG_INFO + help + Generate DWARF v4 debug info. This requires gcc 4.5+ and gdb 7.0+. + + If you have consumers of DWARF debug info that are not ready for + newer revisions of DWARF, you may wish to choose this or have your + config select this. + +config DEBUG_INFO_DWARF5 + bool "Generate DWARF Version 5 debuginfo" + select DEBUG_INFO + depends on !CC_IS_CLANG || (CC_IS_CLANG && (AS_IS_LLVM || (AS_IS_GNU && AS_VERSION >= 23502))) + help + Generate DWARF v5 debug info. 
Requires binutils 2.35.2, gcc 5.0+ (gcc + 5.0+ accepts the -gdwarf-5 flag but only had partial support for some + draft features until 7.0), and gdb 8.0+. + + Changes to the structure of debug info in Version 5 allow for around + 15-18% savings in resulting image and debug info section sizes as + compared to DWARF Version 4. DWARF Version 5 standardizes previous + extensions such as accelerators for symbol indexing and the format + for fission (.dwo/.dwp) files. Users may not want to select this + config if they rely on tooling that has not yet been updated to + support DWARF Version 5. + +config DEBUG_INFO_NONE + bool "Disable debug information" + help + Do not build the kernel with debugging information, which will + result in a faster and smaller build. + +endchoice # "Debug information" if DEBUG_INFO @@ -267,56 +335,12 @@ config DEBUG_INFO_SPLIT to know about the .dwo files and include them. Incompatible with older versions of ccache. -choice - prompt "DWARF version" - help - Which version of DWARF debug info to emit. - -config DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT - bool "Rely on the toolchain's implicit default DWARF version" - help - The implicit default version of DWARF debug info produced by a - toolchain changes over time. - - This can break consumers of the debug info that haven't upgraded to - support newer revisions, and prevent testing newer versions, but - those should be less common scenarios. - - If unsure, say Y. - -config DEBUG_INFO_DWARF4 - bool "Generate DWARF Version 4 debuginfo" - help - Generate DWARF v4 debug info. This requires gcc 4.5+ and gdb 7.0+. - - If you have consumers of DWARF debug info that are not ready for - newer revisions of DWARF, you may wish to choose this or have your - config select this. - -config DEBUG_INFO_DWARF5 - bool "Generate DWARF Version 5 debuginfo" - depends on !CC_IS_CLANG || (CC_IS_CLANG && (AS_IS_LLVM || (AS_IS_GNU && AS_VERSION >= 23502))) - depends on !DEBUG_INFO_BTF - help - Generate DWARF v5 debug info. Requires binutils 2.35.2, gcc 5.0+ (gcc - 5.0+ accepts the -gdwarf-5 flag but only had partial support for some - draft features until 7.0), and gdb 8.0+. - - Changes to the structure of debug info in Version 5 allow for around - 15-18% savings in resulting image and debug info section sizes as - compared to DWARF Version 4. DWARF Version 5 standardizes previous - extensions such as accelerators for symbol indexing and the format - for fission (.dwo/.dwp) files. Users may not want to select this - config if they rely on tooling that has not yet been updated to - support DWARF Version 5. - -endchoice # "DWARF version" - config DEBUG_INFO_BTF bool "Generate BTF typeinfo" depends on !DEBUG_INFO_SPLIT && !DEBUG_INFO_REDUCED depends on !GCC_PLUGIN_RANDSTRUCT || COMPILE_TEST depends on BPF_SYSCALL + depends on !DEBUG_INFO_DWARF5 help Generate deduplicated BTF type information from DWARF debug info. Turning this on expects presence of pahole tool, which will convert @@ -585,20 +609,6 @@ source "lib/Kconfig.kcsan" endmenu -config DEBUG_KERNEL - bool "Kernel debugging" - help - Say Y here if you are developing drivers or trying to debug and - identify kernel problems. - -config DEBUG_MISC - bool "Miscellaneous debug code" - default DEBUG_KERNEL - depends on DEBUG_KERNEL - help - Say Y here if you need to enable miscellaneous debug code that should - be under a more specific debug option but isn't. 
- menu "Networking Debugging" source "net/Kconfig.debug" From ba88f76472585ca1101c7110071fd91755ab9486 Mon Sep 17 00:00:00 2001 From: Guo Xuenan Date: Thu, 27 Jan 2022 16:26:09 +1100 Subject: [PATCH 124/154] lz4: fix LZ4_decompress_safe_partial read out of bound When partialDecoding, it is EOF if we've either, filled the output buffer or can't proceed with reading an offset for following match. In some extreme corner cases when compressed data is crusted corrupted, UAF will occur. As reported by KASAN [1], LZ4_decompress_safe_partial may lead to read out of bound problem during decoding. lz4 upstream has fixed it [2] and this issue has been disscussed here [3] before. current decompression routine was ported from lz4 v1.8.3, bumping lib/lz4 to v1.9.+ is certainly a huge work to be done later, so, we'd better fix it first. [1] https://lore.kernel.org/all/000000000000830d1205cf7f0477@google.com/ [2] https://github.com/lz4/lz4/commit/c5d6f8a8be3927c0bec91bcc58667a6cfad244ad# [3] https://lore.kernel.org/all/CC666AE8-4CA4-4951-B6FB-A2EFDE3AC03B@fb.com/ Link: https://lkml.kernel.org/r/20211111105048.2006070-1-guoxuenan@huawei.com Reported-by: syzbot+63d688f1d899c588fb71@syzkaller.appspotmail.com Signed-off-by: Guo Xuenan Reviewed-by: Nick Terrell Cc: Gao Xiang Cc: Yann Collet Cc: Chengyang Fan Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- lib/lz4/lz4_decompress.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c index 926f4823d5eac..fd1728d94babb 100644 --- a/lib/lz4/lz4_decompress.c +++ b/lib/lz4/lz4_decompress.c @@ -271,8 +271,12 @@ static FORCE_INLINE int LZ4_decompress_generic( ip += length; op += length; - /* Necessarily EOF, due to parsing restrictions */ - if (!partialDecoding || (cpy == oend)) + /* Necessarily EOF when !partialDecoding. + * When partialDecoding, it is EOF if we've either + * filled the output buffer or + * can't proceed with reading an offset for following match. + */ + if (!partialDecoding || (cpy == oend) || (ip >= (iend - 2))) break; } else { /* may overwrite up to WILDCOPYLENGTH beyond cpy */ From ff63009241b7946edfb500f8c76a3ea697f34b35 Mon Sep 17 00:00:00 2001 From: Akira Kawata Date: Thu, 27 Jan 2022 16:26:10 +1100 Subject: [PATCH 125/154] fs/binfmt_elf: fix AT_PHDR for unusual ELF files Patch series "fs/binfmt_elf: Fix AT_PHDR for unusual ELF files", v4. These patches fix a bug in AT_PHDR calculation. We cannot calculate AT_PHDR as the sum of load_addr and exec->e_phoff. This is because exec->e_phoff is the offset of PHDRs in the file and the address of PHDRs in the memory may differ from it. These patches fix the bug by calculating the address of program headers from PT_LOADs directly. This patch (of 2): As pointed out in the bugzilla discussion, we cannot calculate AT_PHDR as the sum of load_addr and exec->e_phoff. : The AT_PHDR of ELF auxiliary vectors should point to the memory address : of program header. But binfmt_elf.c calculates this address as follows: : : NEW_AUX_ENT(AT_PHDR, load_addr + exec->e_phoff); : : which is wrong since e_phoff is the file offset of program header and : load_addr is the memory base address from PT_LOAD entry. : : The ld.so uses AT_PHDR as the memory address of program header. In normal : case, since the e_phoff is usually 64 and in the first PT_LOAD region, it : is the correct program header address. : : But if the address of program header isn't equal to the first PT_LOAD : address + e_phoff (e.g. 
Put the program header in other non-consecutive : PT_LOAD region), ld.so will try to read program header from wrong address : then crash or use incorrect program header. This is because exec->e_phoff is the offset of PHDRs in the file and the address of PHDRs in the memory may differ from it. This patch fixes the bug by calculating the address of program headers from PT_LOADs directly. Link: https://lkml.kernel.org/r/20211212232414.1402199-1-akirakawata1@gmail.com Link: https://bugzilla.kernel.org/show_bug.cgi?id=197921 Link: https://lkml.kernel.org/r/20211212232414.1402199-2-akirakawata1@gmail.com Signed-off-by: Akira Kawata Reported-by: kernel test robot Cc: Alexey Dobriyan Cc: Al Viro Cc: Kees Cook Cc: Lukas Bulwahn Cc: Eric Biederman Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/binfmt_elf.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 605017eb9349e..2b8d5ca0acc82 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -170,8 +170,8 @@ static int padzero(unsigned long elf_bss) static int create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, - unsigned long load_addr, unsigned long interp_load_addr, - unsigned long e_entry) + unsigned long interp_load_addr, + unsigned long e_entry, unsigned long phdr_addr) { struct mm_struct *mm = current->mm; unsigned long p = bprm->p; @@ -257,7 +257,7 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP); NEW_AUX_ENT(AT_PAGESZ, ELF_EXEC_PAGESIZE); NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC); - NEW_AUX_ENT(AT_PHDR, load_addr + exec->e_phoff); + NEW_AUX_ENT(AT_PHDR, phdr_addr); NEW_AUX_ENT(AT_PHENT, sizeof(struct elf_phdr)); NEW_AUX_ENT(AT_PHNUM, exec->e_phnum); NEW_AUX_ENT(AT_BASE, interp_load_addr); @@ -823,7 +823,7 @@ static int parse_elf_properties(struct file *f, const struct elf_phdr *phdr, static int load_elf_binary(struct linux_binprm *bprm) { struct file *interpreter = NULL; /* to shut gcc up */ - unsigned long load_addr = 0, load_bias = 0; + unsigned long load_addr, load_bias = 0, phdr_addr = 0; int load_addr_set = 0; unsigned long error; struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL; @@ -1169,6 +1169,13 @@ static int load_elf_binary(struct linux_binprm *bprm) reloc_func_desc = load_bias; } } + + if (elf_ppnt->p_offset <= elf_ex->e_phoff && + elf_ex->e_phoff < elf_ppnt->p_offset + elf_ppnt->p_filesz) { + phdr_addr = elf_ex->e_phoff - elf_ppnt->p_offset + + elf_ppnt->p_vaddr; + } + k = elf_ppnt->p_vaddr; if ((elf_ppnt->p_flags & PF_X) && k < start_code) start_code = k; @@ -1204,6 +1211,7 @@ static int load_elf_binary(struct linux_binprm *bprm) } e_entry = elf_ex->e_entry + load_bias; + phdr_addr += load_bias; elf_bss += load_bias; elf_brk += load_bias; start_code += load_bias; @@ -1267,8 +1275,8 @@ static int load_elf_binary(struct linux_binprm *bprm) goto out; #endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */ - retval = create_elf_tables(bprm, elf_ex, - load_addr, interp_load_addr, e_entry); + retval = create_elf_tables(bprm, elf_ex, interp_load_addr, + e_entry, phdr_addr); if (retval < 0) goto out; From d97d683eb090e7fa6edda79b55fd111a69d6d34d Mon Sep 17 00:00:00 2001 From: Akira Kawata Date: Thu, 27 Jan 2022 16:26:10 +1100 Subject: [PATCH 126/154] fs/binfmt_elf: refactor load_elf_binary function I delete load_addr because it is not used anymore. And I rename load_addr_set to first_pt_load because it is used only to capture the first iteration of the loop. 
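In outline, the per-segment loop now reads (a simplified sketch, not the exact kernel code):

	int first_pt_load = 1;

	/* for each PT_LOAD program header: */ {
		if (!first_pt_load)
			elf_flags |= MAP_FIXED;
		error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
				elf_prot, elf_flags, total_size);
		if (first_pt_load) {
			first_pt_load = 0;
			if (elf_ex->e_type == ET_DYN) {
				load_bias += error -
					ELF_PAGESTART(load_bias + vaddr);
				reloc_func_desc = load_bias;
			}
		}
	}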
Link: https://lkml.kernel.org/r/20211212232414.1402199-3-akirakawata1@gmail.com Signed-off-by: Akira Kawata Cc: Alexey Dobriyan Cc: Al Viro Cc: Eric Biederman Cc: Kees Cook Cc: kernel test robot Cc: Lukas Bulwahn Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/binfmt_elf.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 2b8d5ca0acc82..b7864cdee67d3 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -823,8 +823,8 @@ static int parse_elf_properties(struct file *f, const struct elf_phdr *phdr, static int load_elf_binary(struct linux_binprm *bprm) { struct file *interpreter = NULL; /* to shut gcc up */ - unsigned long load_addr, load_bias = 0, phdr_addr = 0; - int load_addr_set = 0; + unsigned long load_bias = 0, phdr_addr = 0; + int first_pt_load = 1; unsigned long error; struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL; struct elf_phdr *elf_property_phdata = NULL; @@ -1074,12 +1074,12 @@ static int load_elf_binary(struct linux_binprm *bprm) vaddr = elf_ppnt->p_vaddr; /* - * The first time through the loop, load_addr_set is false: + * The first time through the loop, first_pt_load is true: * layout will be calculated. Once set, use MAP_FIXED since * we know we've already safely mapped the entire region with * MAP_FIXED_NOREPLACE in the once-per-binary logic following. */ - if (load_addr_set) { + if (!first_pt_load) { elf_flags |= MAP_FIXED; } else if (elf_ex->e_type == ET_EXEC) { /* @@ -1139,10 +1139,10 @@ static int load_elf_binary(struct linux_binprm *bprm) /* * Calculate the entire size of the ELF mapping (total_size). - * (Note that load_addr_set is set to true later once the + * (Note that first_pt_load is set to false later once the * initial mapping is performed.) */ - if (!load_addr_set) { + if (first_pt_load) { total_size = total_mapping_size(elf_phdata, elf_ex->e_phnum); if (!total_size) { @@ -1159,13 +1159,11 @@ static int load_elf_binary(struct linux_binprm *bprm) goto out_free_dentry; } - if (!load_addr_set) { - load_addr_set = 1; - load_addr = (elf_ppnt->p_vaddr - elf_ppnt->p_offset); + if (first_pt_load) { + first_pt_load = 0; if (elf_ex->e_type == ET_DYN) { load_bias += error - ELF_PAGESTART(load_bias + vaddr); - load_addr += load_bias; reloc_func_desc = load_bias; } } From 8ad743eab00a2aff284fc5e495894a5218784df5 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 27 Jan 2022 16:26:10 +1100 Subject: [PATCH 127/154] ELF: fix overflow in total mapping size calculation Kernel assumes that ELF program headers are ordered by mapping address, but doesn't enforce it. It is possible to make mapping size extremely huge by simply shuffling first and last PT_LOAD segments. As long as PT_LOAD segments do not overlap, it is silly to require sorting by v_addr anyway because mmap() doesn't care. Don't assume PT_LOAD segments are sorted and calculate min and max addresses correctly. 
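A worked example with hypothetical values: suppose the first PT_LOAD has p_vaddr = 0x800000 and the last has p_vaddr = 0x400000, each with p_memsz = 0x1000. The old calculation

	cmds[last].p_vaddr + cmds[last].p_memsz - ELF_PAGESTART(cmds[first].p_vaddr)
	= 0x400000 + 0x1000 - 0x800000

underflows and wraps to an enormous unsigned size, whereas taking min/max over all PT_LOAD entries yields the expected 0x801000 - 0x400000 = 0x401000.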
Link: https://lkml.kernel.org/r/YVmd7D0M6G/DcP4O@localhost.localdomain Signed-off-by: Alexey Dobriyan Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/binfmt_elf.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index b7864cdee67d3..8ebcb88e595e6 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -93,7 +93,7 @@ static int elf_core_dump(struct coredump_params *cprm); #define ELF_CORE_EFLAGS 0 #endif -#define ELF_PAGESTART(_v) ((_v) & ~(unsigned long)(ELF_MIN_ALIGN-1)) +#define ELF_PAGESTART(_v) ((_v) & ~(int)(ELF_MIN_ALIGN-1)) #define ELF_PAGEOFFSET(_v) ((_v) & (ELF_MIN_ALIGN-1)) #define ELF_PAGEALIGN(_v) (((_v) + ELF_MIN_ALIGN - 1) & ~(ELF_MIN_ALIGN - 1)) @@ -399,22 +399,21 @@ static unsigned long elf_map(struct file *filep, unsigned long addr, return(map_addr); } -static unsigned long total_mapping_size(const struct elf_phdr *cmds, int nr) +static unsigned long total_mapping_size(const struct elf_phdr *phdr, int nr) { - int i, first_idx = -1, last_idx = -1; + elf_addr_t min_addr = -1; + elf_addr_t max_addr = 0; + bool pt_load = false; + int i; for (i = 0; i < nr; i++) { - if (cmds[i].p_type == PT_LOAD) { - last_idx = i; - if (first_idx == -1) - first_idx = i; + if (phdr[i].p_type == PT_LOAD) { + min_addr = min(min_addr, ELF_PAGESTART(phdr[i].p_vaddr)); + max_addr = max(max_addr, phdr[i].p_vaddr + phdr[i].p_memsz); + pt_load = true; } } - if (first_idx == -1) - return 0; - - return cmds[last_idx].p_vaddr + cmds[last_idx].p_memsz - - ELF_PAGESTART(cmds[first_idx].p_vaddr); + return pt_load ? (max_addr - min_addr) : 0; } static int elf_read(struct file *file, void *buf, size_t len, loff_t pos) From e0f3a0f88b754aca6249cab9f4aa5497e218fcd9 Mon Sep 17 00:00:00 2001 From: Andrew Halaney Date: Thu, 27 Jan 2022 16:26:10 +1100 Subject: [PATCH 128/154] init/main.c: silence some -Wunused-parameter warnings There are a bunch of callbacks with unused arguments, go ahead and silence those so "make KCFLAGS=-W init/main.o" is a little quieter. 
Here's a little sample: init/main.c:182:43: warning: unused parameter 'str' [-Wunused-parameter] static int __init set_reset_devices(char *str) Link: https://lkml.kernel.org/r/20210519162341.1275452-1-ahalaney@redhat.com Signed-off-by: Andrew Halaney Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- init/main.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/init/main.c b/init/main.c index 65fa2e41a9c09..cf6452628ca1f 100644 --- a/init/main.c +++ b/init/main.c @@ -180,7 +180,7 @@ EXPORT_SYMBOL_GPL(static_key_initialized); unsigned int reset_devices; EXPORT_SYMBOL(reset_devices); -static int __init set_reset_devices(char *str) +static int __init set_reset_devices(char *str __always_unused) { reset_devices = 1; return 1; @@ -230,13 +230,13 @@ static bool __init obsolete_checksetup(char *line) unsigned long loops_per_jiffy = (1<<12); EXPORT_SYMBOL(loops_per_jiffy); -static int __init debug_kernel(char *str) +static int __init debug_kernel(char *str __always_unused) { console_loglevel = CONSOLE_LOGLEVEL_DEBUG; return 0; } -static int __init quiet_kernel(char *str) +static int __init quiet_kernel(char *str __always_unused) { console_loglevel = CONSOLE_LOGLEVEL_QUIET; return 0; @@ -473,7 +473,7 @@ static void __init setup_boot_config(void) get_boot_config_from_initrd(NULL, NULL); } -static int __init warn_bootconfig(char *str) +static int __init warn_bootconfig(char *str __always_unused) { pr_warn("WARNING: 'bootconfig' found on the kernel command line but CONFIG_BOOT_CONFIG is not set.\n"); return 0; @@ -502,7 +502,8 @@ static void __init repair_env_string(char *param, char *val) /* Anything after -- gets handed straight to init. */ static int __init set_init_arg(char *param, char *val, - const char *unused, void *arg) + const char *unused __always_unused, + void *arg __always_unused) { unsigned int i; @@ -527,7 +528,8 @@ static int __init set_init_arg(char *param, char *val, * unused parameters (modprobe will find them in /proc/cmdline). */ static int __init unknown_bootoption(char *param, char *val, - const char *unused, void *arg) + const char *unused __always_unused, + void *arg __always_unused) { size_t len = strlen(param); @@ -727,7 +729,8 @@ noinline void __ref rest_init(void) /* Check for early params. */ static int __init do_early_param(char *param, char *val, - const char *unused, void *arg) + const char *unused __always_unused, + void *arg __always_unused) { const struct obs_kernel_param *p; @@ -1352,8 +1355,10 @@ static const char *initcall_level_names[] __initdata = { "late", }; -static int __init ignore_unknown_bootoption(char *param, char *val, - const char *unused, void *arg) +static int __init ignore_unknown_bootoption(char *param __always_unused, + char *val __always_unused, + const char *unused __always_unused, + void *arg __always_unused) { return 0; } @@ -1490,7 +1495,7 @@ void __weak free_initmem(void) free_initmem_default(POISON_FREE_INITMEM); } -static int __ref kernel_init(void *unused) +static int __ref kernel_init(void *unused __always_unused) { int ret; From 3e4038909071591e4e1909062022e48e8874f5ed Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 27 Jan 2022 16:26:11 +1100 Subject: [PATCH 129/154] fs/pipe: use kvcalloc to allocate a pipe_buffer array Right now, kcalloc is used to allocate a pipe_buffer array. The size of the pipe_buffer struct is 40 bytes. kcalloc allows allocating reliably chunks with sizes less or equal to PAGE_ALLOC_COSTLY_ORDER (3). 
It means that the maximum pipe size is 3.2MB in this case (an order-3 allocation is 32768 bytes, enough for 819 40-byte pipe_buffer slots, each referencing one 4KB page, i.e. about 3.2MB of data). In CRIU, we use pipes to dump process memory. CRIU freezes a target process, injects parasite code into it, and then this code splices memory into pipes. If the maximum pipe size is small, we need to do many iterations or create many pipes. kvcalloc attempts to allocate physically contiguous memory, but upon failure falls back to a non-contiguous (vmalloc) allocation, and so it isn't limited by PAGE_ALLOC_COSTLY_ORDER. The maximum pipe size for non-root users is limited by the /proc/sys/fs/pipe-max-size sysctl, which is 1MB by default, so only the root user will be able to trigger vmalloc allocations. Link: https://lkml.kernel.org/r/20220104171058.22580-1-avagin@gmail.com Signed-off-by: Andrei Vagin Reviewed-by: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/pipe.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index cc28623a67b61..3e3413a4ccc29 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -803,7 +803,7 @@ struct pipe_inode_info *alloc_pipe_info(void) if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user()) goto out_revert_acct; - pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer), + pipe->bufs = kvcalloc(pipe_bufs, sizeof(struct pipe_buffer), GFP_KERNEL_ACCOUNT); if (pipe->bufs) { @@ -846,7 +846,7 @@ void free_pipe_info(struct pipe_inode_info *pipe) } if (pipe->tmp_page) __free_page(pipe->tmp_page); - kfree(pipe->bufs); + kvfree(pipe->bufs); kfree(pipe); } @@ -1261,8 +1261,7 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots) if (nr_slots < n) return -EBUSY; - bufs = kcalloc(nr_slots, sizeof(*bufs), - GFP_KERNEL_ACCOUNT | __GFP_NOWARN); + bufs = kvcalloc(nr_slots, sizeof(*bufs), GFP_KERNEL_ACCOUNT); if (unlikely(!bufs)) return -ENOMEM; @@ -1289,7 +1288,7 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots) head = n; tail = 0; - kfree(pipe->bufs); + kvfree(pipe->bufs); pipe->bufs = bufs; pipe->ring_size = nr_slots; if (pipe->max_usage > nr_slots) From 69a291fe57c5334ebf0ecb14191f9d48d5ad29f6 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 27 Jan 2022 16:26:11 +1100 Subject: [PATCH 130/154] fs/pipe.c: local vars have to match types of proper pipe_inode_info fields head, tail, ring_size are declared as unsigned int, so all local variables that operate on these fields have to be unsigned to avoid signed integer overflow. Right now, it isn't an issue because the maximum pipe size is limited by 1U<<31.
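A minimal sketch of the concern (illustrative, mirroring the FIONREAD path touched below): the ring indices are free-running counters, so the occupancy computation relies on well-defined unsigned wraparound:

	unsigned int count, head, tail, mask;

	count = 0;
	head = pipe->head;
	tail = pipe->tail;
	mask = pipe->ring_size - 1;
	while (tail != head) {
		count += pipe->bufs[tail & mask].len;
		tail++;
	}

With plain int locals, the same arithmetic could overflow a signed integer once very large rings become reachable, which is undefined behaviour.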
Link: https://lkml.kernel.org/r/20220106171946.36128-1-avagin@gmail.com Signed-off-by: Andrei Vagin Suggested-by: Dmitry Safonov <0x7f454c46@gmail.com> Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/pipe.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index 3e3413a4ccc29..71946832e33f9 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -606,7 +606,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct pipe_inode_info *pipe = filp->private_data; - int count, head, tail, mask; + unsigned int count, head, tail, mask; switch (cmd) { case FIONREAD: @@ -828,7 +828,7 @@ struct pipe_inode_info *alloc_pipe_info(void) void free_pipe_info(struct pipe_inode_info *pipe) { - int i; + unsigned int i; #ifdef CONFIG_WATCH_QUEUE if (pipe->watch_queue) { From c6fe9caa89f71bcf5f69f40a8a9a4a98495b027d Mon Sep 17 00:00:00 2001 From: Qinghua Jin Date: Thu, 27 Jan 2022 16:26:11 +1100 Subject: [PATCH 131/154] minix: fix bug when opening a file with O_DIRECT Testcase: 1. create a minix file system and mount it 2. open a file on the file system with O_RDWR|O_CREAT|O_TRUNC|O_DIRECT 3. open fails with -EINVAL but leaves an empty file behind. All other open() failures don't leave the failed open files behind. It is hard to check the direct_IO op before creating the inode. Just as ext4 and btrfs do, this patch resolves the issue by allowing the file to be created with O_DIRECT but returning an error when writing to it. Link: https://lkml.kernel.org/r/20220107133626.413379-1-qhjin.dev@gmail.com Signed-off-by: Qinghua Jin Reported-by: Colin Ian King Reviewed-by: Jan Kara Acked-by: Christian Brauner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/minix/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/minix/inode.c b/fs/minix/inode.c index a71f1cf894b9f..d4bd94234ef73 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -447,7 +447,8 @@ static const struct address_space_operations minix_aops = { .writepage = minix_writepage, .write_begin = minix_write_begin, .write_end = generic_write_end, - .bmap = minix_bmap + .bmap = minix_bmap, + .direct_IO = noop_direct_IO }; static const struct inode_operations minix_symlink_inode_operations = { From 616879c0eeacf052eeb46c6ce220427b141d9510 Mon Sep 17 00:00:00 2001 From: Ariadne Conill Date: Thu, 27 Jan 2022 16:26:12 +1100 Subject: [PATCH 132/154] fs/exec: require argv[0] presence in do_execveat_common() In several other operating systems, it is a hard requirement that the second argument to execve(2) be the name of a program, thus prohibiting a scenario where argc < 1. POSIX 2017 also recommends this behaviour, but it is not an explicit requirement[0]: The argument arg0 should point to a filename string that is associated with the process being started by one of the exec functions. To ensure that execve(2) with argc < 1 is not a useful tool for shellcode to use, we can validate this in do_execveat_common() and fail for this scenario, effectively blocking successful exploitation of CVE-2021-4034 and similar bugs which depend on execve(2) working with argc < 1. We use -EINVAL for this case, mirroring recent changes to FreeBSD and OpenBSD. -EINVAL is also used by QNX for this, while Solaris uses -EFAULT.
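A minimal userspace sketch of the now-rejected pattern (the target path is hypothetical, for illustration only):

	#include <unistd.h>

	int main(void)
	{
		char *argv[] = { NULL };	/* argc == 0 */

		/* previously succeeded; with this patch fails with EINVAL */
		return execve("/usr/bin/true", argv, NULL);
	}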
In earlier versions of the patch, it was proposed that we create a fake argv for applications to use when argc < 1, but it was concluded that it would be better to just fail the execve(2) in these cases, as launching a process with an empty or NULL argv[0] was likely to just cause more problems. Interestingly, Michael Kerrisk opened an issue about this in 2008[1], but there was no consensus to support fixing this issue then. Hopefully now that CVE-2021-4034 shows practical exploitative use[2] of this bug in a shellcode, we can reconsider. This issue is being tracked in the KSPP issue tracker[3]. There are a few[4][5] minor edge cases (primarily in test suites) that are caught by this, but we plan to work with the projects to fix those edge cases. [0]: https://pubs.opengroup.org/onlinepubs/9699919799/functions/exec.html [1]: https://bugzilla.kernel.org/show_bug.cgi?id=8408 [2]: https://www.qualys.com/2022/01/25/cve-2021-4034/pwnkit.txt [3]: https://github.com/KSPP/linux/issues/176 [4]: https://codesearch.debian.net/search?q=execve%5C+*%5C%28%5B%5E%2C%5D%2B%2C+*NULL&literal=0 [5]: https://codesearch.debian.net/search?q=execlp%3F%5Cs*%5C%28%5B%5E%2C%5D%2B%2C%5Cs*NULL&literal=0 Link: https://lkml.kernel.org/r/20220127000724.15106-1-ariadne@dereferenced.org Signed-off-by: Ariadne Conill Reported-by: Michael Kerrisk Cc: Matthew Wilcox Cc: Christian Brauner Cc: Rich Felker Cc: Eric Biederman Cc: Alexander Viro Cc: Kees Cook Cc: Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/exec.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/exec.c b/fs/exec.c index 79f2c9483302d..982730cfe3b86 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1897,6 +1897,10 @@ static int do_execveat_common(int fd, struct filename *filename, } retval = count(argv, MAX_ARG_STRINGS); + if (retval == 0) { + pr_warn_once("Attempted to run process '%s' with NULL argv\n", bprm->filename); + retval = -EINVAL; + } if (retval < 0) goto out_free; bprm->argc = retval; From a1d6aa15fad8c54584dc1c97f7263a8e715dd199 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Thu, 27 Jan 2022 16:26:12 +1100 Subject: [PATCH 133/154] kexec: make crashk_res, crashk_low_res and crash_notes symbols always visible Patch series "kexec: use IS_ENABLED(CONFIG_KEXEC_CORE) instead of #ifdef", v2. Replace the conditional compilation using "#ifdef CONFIG_KEXEC_CORE" by a check for "IS_ENABLED(CONFIG_KEXEC_CORE)", to simplify the code and increase compile coverage. I only modified x86, arm, arm64 and riscv, other architectures such as sh, powerpc and s390 are better to be kept kexec code as-is so they are not touched. This patch (of 5): Make the forward declarations of crashk_res, crashk_low_res and crash_notes always visible. Code referring to these symbols can then just check for IS_ENABLED(CONFIG_KEXEC_CORE), instead of requiring conditional compilation using an #ifdef, thus preparing to increase compile coverage and simplify the code. Link: https://lkml.kernel.org/r/20211206160514.2000-1-jszhang@kernel.org Link: https://lkml.kernel.org/r/20211206160514.2000-2-jszhang@kernel.org Signed-off-by: Jisheng Zhang Acked-by: Baoquan He Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Eric W. 
Biederman Cc: Alexandre Ghiti Cc: Palmer Dabbelt Cc: Russell King (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/kexec.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 0c994ae37729e..58d1b58a971e3 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -20,6 +20,12 @@ #include +/* Location of a reserved region to hold the crash kernel. + */ +extern struct resource crashk_res; +extern struct resource crashk_low_res; +extern note_buf_t __percpu *crash_notes; + #ifdef CONFIG_KEXEC_CORE #include #include @@ -350,12 +356,6 @@ extern int kexec_load_disabled; #define KEXEC_FILE_FLAGS (KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \ KEXEC_FILE_NO_INITRAMFS) -/* Location of a reserved region to hold the crash kernel. - */ -extern struct resource crashk_res; -extern struct resource crashk_low_res; -extern note_buf_t __percpu *crash_notes; - /* flag to track if kexec reboot is in progress */ extern bool kexec_in_progress; From a2060e4ef5d2f254f01d2c4f5b68899697987df4 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Thu, 27 Jan 2022 16:26:12 +1100 Subject: [PATCH 134/154] riscv: mm: init: use IS_ENABLED(CONFIG_KEXEC_CORE) instead of #ifdef Replace the conditional compilation using "#ifdef CONFIG_KEXEC_CORE" by a check for "IS_ENABLED(CONFIG_KEXEC_CORE)", to simplify the code and increase compile coverage. Link: https://lkml.kernel.org/r/20211206160514.2000-3-jszhang@kernel.org Signed-off-by: Jisheng Zhang Acked-by: Palmer Dabbelt Acked-by: Baoquan He Cc: Albert Ou Cc: Alexandre Ghiti Cc: Borislav Petkov Cc: Catalin Marinas Cc: Dave Hansen Cc: Eric W. Biederman Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King Cc: Russell King (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/riscv/mm/init.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index cf4d018b7d668..4f9cf5337b4f6 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -955,7 +955,6 @@ static inline void setup_vm_final(void) } #endif /* CONFIG_MMU */ -#ifdef CONFIG_KEXEC_CORE /* * reserve_crashkernel() - reserves memory for crash kernel * @@ -972,6 +971,8 @@ static void __init reserve_crashkernel(void) int ret = 0; + if (!IS_ENABLED(CONFIG_KEXEC_CORE)) + return; /* * Don't reserve a region for a crash kernel on a crash kernel * since it doesn't make much sense and we have limited memory @@ -1021,7 +1022,6 @@ static void __init reserve_crashkernel(void) crashk_res.start = crash_base; crashk_res.end = crash_base + crash_size - 1; } -#endif /* CONFIG_KEXEC_CORE */ void __init paging_init(void) { @@ -1035,9 +1035,7 @@ void __init misc_mem_init(void) arch_numa_init(); sparse_init(); zone_sizes_init(); -#ifdef CONFIG_KEXEC_CORE reserve_crashkernel(); -#endif memblock_dump_all(); } From ee47d088d003eb7aedc5fd2f29827ba9fb37a398 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Thu, 27 Jan 2022 16:26:12 +1100 Subject: [PATCH 135/154] x86/setup: use IS_ENABLED(CONFIG_KEXEC_CORE) instead of #ifdef Replace the conditional compilation using "#ifdef CONFIG_KEXEC_CORE" by a check for "IS_ENABLED(CONFIG_KEXEC_CORE)", to simplify the code and increase compile coverage. 
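In outline, each conversion in this series has the same shape (sketch based on the x86 hunk below):

	/* before */
	#ifdef CONFIG_KEXEC_CORE
	static void __init reserve_crashkernel(void)
	{
		/* ... */
	}
	#else
	static void __init reserve_crashkernel(void) {}
	#endif

	/* after */
	static void __init reserve_crashkernel(void)
	{
		if (!IS_ENABLED(CONFIG_KEXEC_CORE))
			return;
		/* ... */
	}

Because IS_ENABLED() is an ordinary constant expression, the body is always parsed and type-checked, and the compiler eliminates it as dead code when CONFIG_KEXEC_CORE is off.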
Link: https://lkml.kernel.org/r/20211206160514.2000-4-jszhang@kernel.org Signed-off-by: Jisheng Zhang Acked-by: Baoquan He Cc: Albert Ou Cc: Alexandre Ghiti Cc: Borislav Petkov Cc: Catalin Marinas Cc: Dave Hansen Cc: Eric W. Biederman Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Palmer Dabbelt Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King Cc: Russell King (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/x86/kernel/setup.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index f7a132eb794d8..af2d2dc438a20 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -391,8 +391,6 @@ static void __init memblock_x86_reserve_range_setup_data(void) * --------- Crashkernel reservation ------------------------------ */ -#ifdef CONFIG_KEXEC_CORE - /* 16M alignment for crash kernel regions */ #define CRASH_ALIGN SZ_16M @@ -470,6 +468,9 @@ static void __init reserve_crashkernel(void) bool high = false; int ret; + if (!IS_ENABLED(CONFIG_KEXEC_CORE)) + return; + total_mem = memblock_phys_mem_size(); /* crashkernel=XM */ @@ -535,11 +536,6 @@ static void __init reserve_crashkernel(void) crashk_res.end = crash_base + crash_size - 1; insert_resource(&iomem_resource, &crashk_res); } -#else -static void __init reserve_crashkernel(void) -{ -} -#endif static struct resource standard_io_resources[] = { { .name = "dma1", .start = 0x00, .end = 0x1f, From 03fc0f446ce09c61e8a4e25db768f5a063c9fdd1 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Thu, 27 Jan 2022 16:26:13 +1100 Subject: [PATCH 136/154] arm64: mm: use IS_ENABLED(CONFIG_KEXEC_CORE) instead of #ifdef Replace the conditional compilation using "#ifdef CONFIG_KEXEC_CORE" by a check for "IS_ENABLED(CONFIG_KEXEC_CORE)", to simplify the code and increase compile coverage. Link: https://lkml.kernel.org/r/20211206160514.2000-5-jszhang@kernel.org Signed-off-by: Jisheng Zhang Acked-by: Catalin Marinas Acked-by: Baoquan He Cc: Albert Ou Cc: Alexandre Ghiti Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric W. Biederman Cc: H. 
Peter Anvin Cc: Ingo Molnar Cc: Palmer Dabbelt Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King Cc: Russell King (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/arm64/mm/init.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index db63cc885771a..3973e305adc89 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -64,7 +64,6 @@ EXPORT_SYMBOL(memstart_addr); */ phys_addr_t arm64_dma_phys_limit __ro_after_init; -#ifdef CONFIG_KEXEC_CORE /* * reserve_crashkernel() - reserves memory for crash kernel * @@ -78,6 +77,9 @@ static void __init reserve_crashkernel(void) unsigned long long crash_max = arm64_dma_phys_limit; int ret; + if (!IS_ENABLED(CONFIG_KEXEC_CORE)) + return; + ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), &crash_size, &crash_base); /* no crashkernel= or invalid value specified */ @@ -110,11 +112,6 @@ static void __init reserve_crashkernel(void) crashk_res.start = crash_base; crashk_res.end = crash_base + crash_size - 1; } -#else -static void __init reserve_crashkernel(void) -{ -} -#endif /* CONFIG_KEXEC_CORE */ /* * Return the maximum physical address for a zone accessible by the given bits From 35f56ebeffca44e35eda0a5c6a6ce2cf632b03f9 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Thu, 27 Jan 2022 16:26:13 +1100 Subject: [PATCH 137/154] arm: use IS_ENABLED(CONFIG_KEXEC_CORE) instead of #ifdef Replace the conditional compilation using "#ifdef CONFIG_KEXEC_CORE" by a check for "IS_ENABLED(CONFIG_KEXEC_CORE)", to simplify the code and increase compile coverage. Link: https://lkml.kernel.org/r/20211206160514.2000-6-jszhang@kernel.org Signed-off-by: Jisheng Zhang Reviewed-by: Russell King (Oracle) Acked-by: Baoquan He Cc: Albert Ou Cc: Alexandre Ghiti Cc: Borislav Petkov Cc: Catalin Marinas Cc: Dave Hansen Cc: Eric W. Biederman Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Palmer Dabbelt Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/arm/kernel/setup.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index 284a80c0b6e10..f9a054a344c0c 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -973,7 +973,6 @@ static int __init init_machine_late(void) } late_initcall(init_machine_late); -#ifdef CONFIG_KEXEC /* * The crash region must be aligned to 128MB to avoid * zImage relocating below the reserved region. @@ -1001,6 +1000,9 @@ static void __init reserve_crashkernel(void) unsigned long long total_mem; int ret; + if (!IS_ENABLED(CONFIG_KEXEC_CORE)) + return; + total_mem = get_total_mem(); ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base); @@ -1056,9 +1058,6 @@ static void __init reserve_crashkernel(void) insert_resource(&iomem_resource, &crashk_boot_res); } } -#else -static inline void reserve_crashkernel(void) {} -#endif /* CONFIG_KEXEC */ void __init hyp_mode_check(void) { From 295f5a76ced111ad80438b2f8b51861a2f7feb3e Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Thu, 27 Jan 2022 16:26:13 +1100 Subject: [PATCH 138/154] docs: sysctl/kernel: add missing bit to panic_print Patch series "Some improvements on panic_print". This is a mix of a documentation fix with some additions to the "panic_print" syscall / parameter. 
The goal here is being able to collect all CPUs backtraces during a panic event and also to enable "panic_print" in a kdump event - details of the reasoning and design choices in the patches. This patch (of 3): Commit de6da1e8bcf0 ("panic: add an option to replay all the printk message in buffer") added a new bit to the sysctl/kernel parameter "panic_print", but the documentation was added only in kernel-parameters.txt, not in the sysctl guide. Fix it here by adding bit 5 to sysctl admin-guide documentation. Link: https://lkml.kernel.org/r/20211109202848.610874-1-gpiccoli@igalia.com Link: https://lkml.kernel.org/r/20211109202848.610874-2-gpiccoli@igalia.com Fixes: de6da1e8bcf0 ("panic: add an option to replay all the printk message in buffer") Signed-off-by: Guilherme G. Piccoli Reviewed-by: Feng Tang Cc: Luis Chamberlain Cc: Kees Cook Cc: Iurii Zaikin Cc: Samuel Iglesias Gonsalvez Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- Documentation/admin-guide/sysctl/kernel.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index d359bcfadd39a..1406aa3d14ce1 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -795,6 +795,7 @@ bit 1 print system memory info bit 2 print timer info bit 3 print locks info if ``CONFIG_LOCKDEP`` is on bit 4 print ftrace buffer +bit 5: print all printk messages in buffer ===== ============================================ So for example to print tasks and memory info on panic, user can:: From 4fa793206e0029cd0ac786e135c1dfa2cb103657 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Thu, 27 Jan 2022 16:26:13 +1100 Subject: [PATCH 139/154] panic: add option to dump all CPUs backtraces in panic_print Currently the "panic_print" parameter/sysctl allows some interesting debug information to be printed during a panic event. This is useful for example in cases the user cannot kdump due to resource limits, or if the user collects panic logs in a serial output (or pstore) and prefers a fast reboot instead of a kdump. Happens that currently there's no way to see all CPUs backtraces in a panic using "panic_print" on architectures that support that. We do have "oops_all_cpu_backtrace" sysctl, but although partially overlapping in the functionality, they are orthogonal in nature: "panic_print" is a panic tuning (and we have panics without oopses, like direct calls to panic() or maybe other paths that don't go through oops_enter() function), and the original purpose of "oops_all_cpu_backtrace" is to provide more information on oopses for cases in which the users desire to continue running the kernel even after an oops, i.e., used in non-panic scenarios. So, we hereby introduce an additional bit for "panic_print" to allow dumping the CPUs backtraces during a panic event. Link: https://lkml.kernel.org/r/20211109202848.610874-3-gpiccoli@igalia.com Signed-off-by: Guilherme G. 
Piccoli Reviewed-by: Feng Tang Cc: Iurii Zaikin Cc: Kees Cook Cc: Luis Chamberlain Cc: Samuel Iglesias Gonsalvez Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- Documentation/admin-guide/kernel-parameters.txt | 1 + Documentation/admin-guide/sysctl/kernel.rst | 1 + kernel/panic.c | 4 ++++ 3 files changed, 6 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 85f096fddad9d..a069d8fe2fee1 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3726,6 +3726,7 @@ bit 3: print locks info if CONFIG_LOCKDEP is on bit 4: print ftrace buffer bit 5: print all printk messages in buffer + bit 6: print all CPUs backtrace (if available in the arch) panic_on_taint= Bitmask for conditionally calling panic() in add_taint() Format: [,nousertaint] diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 1406aa3d14ce1..d663aebfeefe8 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -796,6 +796,7 @@ bit 2 print timer info bit 3 print locks info if ``CONFIG_LOCKDEP`` is on bit 4 print ftrace buffer bit 5: print all printk messages in buffer +bit 6: print all CPUs backtrace (if available in the arch) ===== ============================================ So for example to print tasks and memory info on panic, user can:: diff --git a/kernel/panic.c b/kernel/panic.c index 55b50e052ec3a..c578da84e7fb0 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -66,6 +66,7 @@ EXPORT_SYMBOL_GPL(panic_timeout); #define PANIC_PRINT_LOCK_INFO 0x00000008 #define PANIC_PRINT_FTRACE_INFO 0x00000010 #define PANIC_PRINT_ALL_PRINTK_MSG 0x00000020 +#define PANIC_PRINT_ALL_CPU_BT 0x00000040 unsigned long panic_print; ATOMIC_NOTIFIER_HEAD(panic_notifier_list); @@ -152,6 +153,9 @@ static void panic_print_sys_info(void) if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG) console_flush_on_panic(CONSOLE_REPLAY_ALL); + if (panic_print & PANIC_PRINT_ALL_CPU_BT) + trigger_all_cpu_backtrace(); + if (panic_print & PANIC_PRINT_TASK_INFO) show_state(); From 239699ae1da2f11e6a4f6582a5bbcd6118c761c4 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Thu, 27 Jan 2022 16:26:14 +1100 Subject: [PATCH 140/154] panic: allow printing extra panic information on kdump Currently we have the "panic_print" parameter/sysctl to allow some extra information to be printed in a panic event. On the other hand, the kdump mechanism allows to kexec a new kernel to collect a memory dump for the running kernel in case of panic. Right now these options are incompatible: the user either sets the kdump or makes use of "panic_print". The code path of "panic_print" isn't reached when kdump is configured. There are situations though in which this would be interesting: for example, in systems that are very memory constrained, a handcrafted tiny kernel/initrd for kdump might be used in order to only collect the dmesg in kdump kernel. Even more common, systems with no disk space for the full (compressed) memory dump might very well rely in this functionality too, dumping only the dmesg with the additional information provided by "panic_print". So, this is what the patch does: allows both functionality to co-exist; if "panic_print" is set and the system performs a kdump, the extra information is printed on dmesg before the kexec. 
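For example (hypothetical usage), a system carrying a tiny dmesg-only kdump kernel could request extra context via:

	# bit 0: print all tasks info, bit 1: print system memory info
	echo 0x3 > /proc/sys/kernel/panic_print

and, with a crash kernel loaded, those lines now land in the log just before the kexec.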
From 239699ae1da2f11e6a4f6582a5bbcd6118c761c4 Mon Sep 17 00:00:00 2001
From: "Guilherme G. Piccoli"
Date: Thu, 27 Jan 2022 16:26:14 +1100
Subject: [PATCH 140/154] panic: allow printing extra panic information on kdump

Currently we have the "panic_print" parameter/sysctl to allow some extra
information to be printed in a panic event. On the other hand, the kdump
mechanism allows us to kexec a new kernel to collect a memory dump for the
running kernel in case of panic.

Right now these options are incompatible: the user either sets up kdump or
makes use of "panic_print". The code path of "panic_print" isn't reached
when kdump is configured.

There are situations though in which this would be interesting: for
example, in systems that are very memory constrained, a handcrafted tiny
kernel/initrd for kdump might be used in order to only collect the dmesg
in the kdump kernel. Even more common, systems with no disk space for the
full (compressed) memory dump might very well rely on this functionality
too, dumping only the dmesg with the additional information provided by
"panic_print".

So, this is what the patch does: it allows both features to coexist; if
"panic_print" is set and the system performs a kdump, the extra
information is printed to dmesg before the kexec.

Some notes about the design choices here:

(a) We could have introduced a sysctl or an extra bit on "panic_print" to
allow enabling the coexistence of kdump and "panic_print", but that seems
to be over-engineering; we have 3 cases, so let's check how this patch
changes things:

- if the user has kdump set and not "panic_print", nothing changes;
- if the user has "panic_print" set and not kdump, nothing changes;
- if both are enabled, now we print the extra information before kdump,
  which is exactly the goal of the patch (and should be the goal of the
  user, since they enabled both options).

(b) We assume that the code path won't return from __crash_kexec() so we
didn't guard against double execution of panic_print_sys_info().

Link: https://lkml.kernel.org/r/20211109202848.610874-4-gpiccoli@igalia.com
Signed-off-by: Guilherme G. Piccoli
Cc: Feng Tang
Cc: Iurii Zaikin
Cc: Kees Cook
Cc: Luis Chamberlain
Cc: Samuel Iglesias Gonsalvez
Cc: Dave Young
Signed-off-by: Andrew Morton
Signed-off-by: Stephen Rothwell
---
 kernel/panic.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/kernel/panic.c b/kernel/panic.c
index c578da84e7fb0..41ecf9ab824a6 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -244,6 +244,13 @@ void panic(const char *fmt, ...)
	 */
	kgdb_panic(buf);
 
+	/*
+	 * If we have a kdump kernel loaded, give a chance to panic_print
+	 * show some extra information on kernel log if it was set...
+	 */
+	if (kexec_crash_loaded())
+		panic_print_sys_info();
+
	/*
	 * If we have crashed and we have a crash kernel loaded let it handle
	 * everything else.

From 3799fd5b93e96ce6920ee4c904745aa6a8c16070 Mon Sep 17 00:00:00 2001
From: Aleksandr Nogikh
Date: Thu, 27 Jan 2022 16:26:14 +1100
Subject: [PATCH 141/154] kcov: split ioctl handling into locked and unlocked parts

Patch series "kcov: improve mmap processing", v3.

Subsequent mmaps of the same kcov descriptor currently do not update the
virtual memory of the task and yet return 0 (success). This is
counter-intuitive and may lead to unexpected memory access errors. Also,
this unnecessarily limits the functionality of kcov to only the simplest
usage scenarios. Kcov instances are effectively forever attached to their
first address spaces and it becomes impossible to e.g. reuse the same
kcov handle in forked child processes without mmapping the memory first.
This is exactly what we tried to do in syzkaller and inadvertently came
upon this behavior.

This patch series addresses the problem described above.

This patch (of 3):

Currently all ioctls are de facto processed under a spinlock in order to
serialise them. This, however, prohibits the use of vmalloc and other
memory management functions in the implementations of those ioctls,
unnecessarily complicating any further changes to the code.

Let all ioctls first be processed inside the kcov_ioctl() function, which
executes the ones that are not compatible with a spinlock and then passes
control to kcov_ioctl_locked() for all the others. KCOV_REMOTE_ENABLE is
processed both in kcov_ioctl() and kcov_ioctl_locked() as the steps are
easily separable.

Although it is still compatible with a spinlock, move KCOV_INIT_TRACE
handling to kcov_ioctl(), so that the changes from the next commit are
easier to follow.
Link: https://lkml.kernel.org/r/20220117153634.150357-1-nogikh@google.com
Link: https://lkml.kernel.org/r/20220117153634.150357-2-nogikh@google.com
Signed-off-by: Aleksandr Nogikh
Reviewed-by: Dmitry Vyukov
Reviewed-by: Andrey Konovalov
Cc: Marco Elver
Cc: Alexander Potapenko
Cc: Taras Madan
Cc: Sebastian Andrzej Siewior
Signed-off-by: Andrew Morton
Signed-off-by: Stephen Rothwell
---
 kernel/kcov.c | 68 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 37 insertions(+), 31 deletions(-)

diff --git a/kernel/kcov.c b/kernel/kcov.c
index 36ca640c4f8e7..e1be7301500bd 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -564,31 +564,12 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
			     unsigned long arg)
 {
	struct task_struct *t;
-	unsigned long size, unused;
+	unsigned long flags, unused;
	int mode, i;
	struct kcov_remote_arg *remote_arg;
	struct kcov_remote *remote;
-	unsigned long flags;
 
	switch (cmd) {
-	case KCOV_INIT_TRACE:
-		/*
-		 * Enable kcov in trace mode and setup buffer size.
-		 * Must happen before anything else.
-		 */
-		if (kcov->mode != KCOV_MODE_DISABLED)
-			return -EBUSY;
-		/*
-		 * Size must be at least 2 to hold current position and one PC.
-		 * Later we allocate size * sizeof(unsigned long) memory,
-		 * that must not overflow.
-		 */
-		size = arg;
-		if (size < 2 || size > INT_MAX / sizeof(unsigned long))
-			return -EINVAL;
-		kcov->size = size;
-		kcov->mode = KCOV_MODE_INIT;
-		return 0;
	case KCOV_ENABLE:
		/*
		 * Enable coverage for the current task.
@@ -692,9 +673,32 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
	struct kcov_remote_arg *remote_arg = NULL;
	unsigned int remote_num_handles;
	unsigned long remote_arg_size;
-	unsigned long flags;
+	unsigned long size, flags;
 
-	if (cmd == KCOV_REMOTE_ENABLE) {
+	kcov = filep->private_data;
+	switch (cmd) {
+	case KCOV_INIT_TRACE:
+		/*
+		 * Enable kcov in trace mode and setup buffer size.
+		 * Must happen before anything else.
+		 *
+		 * First check the size argument - it must be at least 2
+		 * to hold the current position and one PC. Later we allocate
+		 * size * sizeof(unsigned long) memory, that must not overflow.
+		 */
+		size = arg;
+		if (size < 2 || size > INT_MAX / sizeof(unsigned long))
+			return -EINVAL;
+		spin_lock_irqsave(&kcov->lock, flags);
+		if (kcov->mode != KCOV_MODE_DISABLED) {
+			spin_unlock_irqrestore(&kcov->lock, flags);
+			return -EBUSY;
+		}
+		kcov->size = size;
+		kcov->mode = KCOV_MODE_INIT;
+		spin_unlock_irqrestore(&kcov->lock, flags);
+		return 0;
+	case KCOV_REMOTE_ENABLE:
		if (get_user(remote_num_handles, (unsigned __user *)(arg +
				offsetof(struct kcov_remote_arg, num_handles))))
			return -EFAULT;
@@ -710,16 +714,18 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
			return -EINVAL;
		}
		arg = (unsigned long)remote_arg;
+		fallthrough;
+	default:
+		/*
+		 * All other commands can be normally executed under a spin lock, so we
+		 * obtain and release it here in order to simplify kcov_ioctl_locked().
+		 */
+		spin_lock_irqsave(&kcov->lock, flags);
+		res = kcov_ioctl_locked(kcov, cmd, arg);
+		spin_unlock_irqrestore(&kcov->lock, flags);
+		kfree(remote_arg);
+		return res;
	}
-
-	kcov = filep->private_data;
-	spin_lock_irqsave(&kcov->lock, flags);
-	res = kcov_ioctl_locked(kcov, cmd, arg);
-	spin_unlock_irqrestore(&kcov->lock, flags);
-
-	kfree(remote_arg);
-
-	return res;
 }
 
 static const struct file_operations kcov_fops = {

From 1c09b5cac0e7c17d8e920079d0e2018112fb3269 Mon Sep 17 00:00:00 2001
From: Aleksandr Nogikh
Date: Thu, 27 Jan 2022 16:26:14 +1100
Subject: [PATCH 142/154] kcov: properly handle subsequent mmap calls

Allocate the kcov buffer during KCOV_MODE_INIT in order to untie mmapping
of a kcov instance and the actual coverage collection process. Modify
kcov_mmap() so that it can be reliably used any number of times once
KCOV_MODE_INIT has succeeded.

These changes to the user-facing interface of the tool only weaken the
preconditions, so all existing user space code should remain compatible
with the new version.

Link: https://lkml.kernel.org/r/20220117153634.150357-3-nogikh@google.com
Signed-off-by: Aleksandr Nogikh
Reviewed-by: Dmitry Vyukov
Reviewed-by: Andrey Konovalov
Cc: Alexander Potapenko
Cc: Marco Elver
Cc: Sebastian Andrzej Siewior
Cc: Taras Madan
Signed-off-by: Andrew Morton
Signed-off-by: Stephen Rothwell
---
 kernel/kcov.c | 34 +++++++++++++++-------------------
 1 file changed, 15 insertions(+), 19 deletions(-)

diff --git a/kernel/kcov.c b/kernel/kcov.c
index e1be7301500bd..475524bd900ab 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -459,37 +459,28 @@ void kcov_task_exit(struct task_struct *t)
 static int kcov_mmap(struct file *filep, struct vm_area_struct *vma)
 {
	int res = 0;
-	void *area;
	struct kcov *kcov = vma->vm_file->private_data;
	unsigned long size, off;
	struct page *page;
	unsigned long flags;
 
-	area = vmalloc_user(vma->vm_end - vma->vm_start);
-	if (!area)
-		return -ENOMEM;
-
	spin_lock_irqsave(&kcov->lock, flags);
	size = kcov->size * sizeof(unsigned long);
-	if (kcov->mode != KCOV_MODE_INIT || vma->vm_pgoff != 0 ||
+	if (kcov->area == NULL || vma->vm_pgoff != 0 ||
	    vma->vm_end - vma->vm_start != size) {
		res = -EINVAL;
		goto exit;
	}
-	if (!kcov->area) {
-		kcov->area = area;
-		vma->vm_flags |= VM_DONTEXPAND;
-		spin_unlock_irqrestore(&kcov->lock, flags);
-		for (off = 0; off < size; off += PAGE_SIZE) {
-			page = vmalloc_to_page(kcov->area + off);
-			if (vm_insert_page(vma, vma->vm_start + off, page))
-				WARN_ONCE(1, "vm_insert_page() failed");
-		}
-		return 0;
+	spin_unlock_irqrestore(&kcov->lock, flags);
+	vma->vm_flags |= VM_DONTEXPAND;
+	for (off = 0; off < size; off += PAGE_SIZE) {
+		page = vmalloc_to_page(kcov->area + off);
+		if (vm_insert_page(vma, vma->vm_start + off, page))
+			WARN_ONCE(1, "vm_insert_page() failed");
	}
+	return 0;
 exit:
	spin_unlock_irqrestore(&kcov->lock, flags);
-	vfree(area);
	return res;
 }
 
@@ -674,6 +665,7 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
	unsigned int remote_num_handles;
	unsigned long remote_arg_size;
	unsigned long size, flags;
+	void *area;
 
	kcov = filep->private_data;
	switch (cmd) {
@@ -683,17 +675,21 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
		 * Must happen before anything else.
		 *
		 * First check the size argument - it must be at least 2
-		 * to hold the current position and one PC. Later we allocate
-		 * size * sizeof(unsigned long) memory, that must not overflow.
+		 * to hold the current position and one PC.
		 */
		size = arg;
		if (size < 2 || size > INT_MAX / sizeof(unsigned long))
			return -EINVAL;
+		area = vmalloc_user(size * sizeof(unsigned long));
+		if (area == NULL)
+			return -ENOMEM;
		spin_lock_irqsave(&kcov->lock, flags);
		if (kcov->mode != KCOV_MODE_DISABLED) {
			spin_unlock_irqrestore(&kcov->lock, flags);
+			vfree(area);
			return -EBUSY;
		}
+		kcov->area = area;
		kcov->size = size;
		kcov->mode = KCOV_MODE_INIT;
		spin_unlock_irqrestore(&kcov->lock, flags);

From 8c969be7b9c42e86c4ccde8797364de1111fb731 Mon Sep 17 00:00:00 2001
From: Muhammad Usama Anjum
Date: Thu, 27 Jan 2022 16:26:14 +1100
Subject: [PATCH 143/154] selftests: set the BUILD variable to absolute path

Patch series "selftests: Fix separate output directory builds", v2.

The build of several selftests fails if a separate output directory is
specified by one of the following methods:

1) make -C tools/testing/selftests O=
2) export KBUILD_OUTPUT="build_dir"; make -C tools/testing/selftests

The build fails for several reasons:

1) The kernel headers aren't found.
2) The path of the output objects is wrong and hence inaccessible.

These problems can be solved by:

1) including the correct path of the uapi header files;
2) setting the BUILD variable correctly inside the Makefile.

The following build scenarios have been tested after making these changes
to verify that nothing gets broken:

make -C tools/testing/selftests
make -C tools/testing/selftests/futex
make -C tools/testing/selftests/kvm
make -C tools/testing/selftests/landlock
make -C tools/testing/selftests/net
make -C tools/testing/selftests/net/mptcp
make -C tools/testing/selftests/vm
make -C tools/testing/selftests O=build
make -C tools/testing/selftests O=/opt/build
export KBUILD_OUTPUT="/opt/build"; make -C tools/testing/selftests
export KBUILD_OUTPUT="build"; make -C tools/testing/selftests
cd ; make -C /tools/testing/selftests
cd ; make -C /tools/testing/selftests O=build

This patch (of 10):

The build of kselftests fails if a relative path is specified through
KBUILD_OUTPUT or the O= method. The BUILD variable is used to determine
the path of the output objects. When make is run from other directories
with relative paths, the exact path of the build objects is ambiguous and
the build fails:

  make[1]: Entering directory '/home/usama/repos/kernel/linux_mainline2/tools/testing/selftests/alsa'
  gcc mixer-test.c -L/usr/lib/x86_64-linux-gnu -lasound -o build/kselftest/alsa/mixer-test
  /usr/bin/ld: cannot open output file build/kselftest/alsa/mixer-test

Set the BUILD variable to the absolute path of the output directory. Make
the logic readable and easy to follow. Use spaces instead of tabs for
indentation, as an "if" indented with a tab is treated as part of a recipe
by make.

Link: https://lkml.kernel.org/r/20220119101531.2850400-1-usama.anjum@collabora.com
Link: https://lkml.kernel.org/r/20220119101531.2850400-2-usama.anjum@collabora.com
Signed-off-by: Muhammad Usama Anjum
Cc: Shuah Khan
Cc: Thomas Gleixner
Cc: Ingo Molnar
Cc: Peter Zijlstra
Cc: Darren Hart
Cc: Davidlohr Bueso
Cc: André Almeida
Cc: Paolo Bonzini
Cc: Mickaël Salaün
Cc: "David S. Miller"
Cc: Jakub Kicinski
Cc: Mat Martineau
Cc: Matthieu Baerts
Cc: Minghao Chi
Cc: Alistair Popple
Signed-off-by: Andrew Morton
Signed-off-by: Stephen Rothwell
---
 tools/testing/selftests/Makefile | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index d08fe4cfe8115..a7b63860b7bce 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -114,19 +114,27 @@ ifdef building_out_of_srctree
 override LDFLAGS =
 endif
 
-ifneq ($(O),)
-	BUILD := $(O)/kselftest
+top_srcdir ?= ../../..
+
+ifeq ("$(origin O)", "command line")
+  KBUILD_OUTPUT := $(O)
+endif
+
+ifneq ($(KBUILD_OUTPUT),)
+  # Make's built-in functions such as $(abspath ...), $(realpath ...) cannot
+  # expand a shell special character '~'. We use a somewhat tedious way here.
+  abs_objtree := $(shell cd $(top_srcdir) && mkdir -p $(KBUILD_OUTPUT) && cd $(KBUILD_OUTPUT) && pwd)
+  $(if $(abs_objtree),, \
+    $(error failed to create output directory "$(KBUILD_OUTPUT)"))
+  # $(realpath ...) resolves symlinks
+  abs_objtree := $(realpath $(abs_objtree))
+  BUILD := $(abs_objtree)/kselftest
 else
-	ifneq ($(KBUILD_OUTPUT),)
-		BUILD := $(KBUILD_OUTPUT)/kselftest
-	else
-		BUILD := $(shell pwd)
-		DEFAULT_INSTALL_HDR_PATH := 1
-	endif
+  BUILD := $(CURDIR)
+  DEFAULT_INSTALL_HDR_PATH := 1
 endif
 
 # Prepare for headers install
-top_srcdir ?= ../../..
 include $(top_srcdir)/scripts/subarch.include
 ARCH ?= $(SUBARCH)
 export KSFT_KHDR_INSTALL_DONE := 1

From 75ee885b6102c54679141c023df0f80297aaab5a Mon Sep 17 00:00:00 2001
From: Muhammad Usama Anjum
Date: Thu, 27 Jan 2022 16:26:15 +1100
Subject: [PATCH 144/154] selftests: add and export a kernel uapi headers path

Kernel uapi headers can be present at different paths depending upon how
the build was invoked. This makes it impossible for the tests to find the
correct headers directory. Set and export the KHDR_INCLUDES variable to
make it possible for sub-makefiles to include the header files.

Link: https://lkml.kernel.org/r/20220119101531.2850400-3-usama.anjum@collabora.com
Signed-off-by: Muhammad Usama Anjum
Cc: Alistair Popple
Cc: André Almeida
Cc: Darren Hart
Cc: Davidlohr Bueso
Cc: "David S. Miller"
Cc: Ingo Molnar
Cc: Jakub Kicinski
Cc: Mat Martineau
Cc: Matthieu Baerts
Cc: Mickaël Salaün
Cc: Minghao Chi
Cc: Paolo Bonzini
Cc: Peter Zijlstra
Cc: Shuah Khan
Cc: Thomas Gleixner
Signed-off-by: Andrew Morton
Signed-off-by: Stephen Rothwell
---
 tools/testing/selftests/Makefile | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index a7b63860b7bce..21f983dfd047b 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -129,8 +129,11 @@ ifneq ($(KBUILD_OUTPUT),)
  # $(realpath ...) resolves symlinks
  abs_objtree := $(realpath $(abs_objtree))
  BUILD := $(abs_objtree)/kselftest
+  KHDR_INCLUDES := -I${abs_objtree}/usr/include
 else
  BUILD := $(CURDIR)
+  abs_srctree := $(shell cd $(top_srcdir) && pwd)
+  KHDR_INCLUDES := -I${abs_srctree}/usr/include
  DEFAULT_INSTALL_HDR_PATH := 1
 endif
 
@@ -139,6 +142,7 @@ include $(top_srcdir)/scripts/subarch.include
 ARCH ?= $(SUBARCH)
 export KSFT_KHDR_INSTALL_DONE := 1
 export BUILD
+export KHDR_INCLUDES
 
 # set default goal to all, so make without a target runs all, even when
 # all isn't the first target in the file.
From 4508c2b7233243e7e9612d33237aa71c2a169837 Mon Sep 17 00:00:00 2001
From: Muhammad Usama Anjum
Date: Thu, 27 Jan 2022 16:26:15 +1100
Subject: [PATCH 145/154] selftests: correct the headers install path

uapi headers should be installed at the top of the object tree,
"/usr/include". There is no need for the kernel headers to be present in
the kselftest build directory, "/kselftest/usr/include", as well. This
duplication can be avoided by correctly specifying the INSTALL_HDR_PATH.

Link: https://lkml.kernel.org/r/20220119101531.2850400-4-usama.anjum@collabora.com
Signed-off-by: Muhammad Usama Anjum
Cc: Alistair Popple
Cc: André Almeida
Cc: Darren Hart
Cc: Davidlohr Bueso
Cc: "David S. Miller"
Cc: Ingo Molnar
Cc: Jakub Kicinski
Cc: Mat Martineau
Cc: Matthieu Baerts
Cc: Mickaël Salaün
Cc: Minghao Chi
Cc: Paolo Bonzini
Cc: Peter Zijlstra
Cc: Shuah Khan
Cc: Thomas Gleixner
Signed-off-by: Andrew Morton
Signed-off-by: Stephen Rothwell
---
 tools/testing/selftests/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 21f983dfd047b..80e5498eab92a 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -167,7 +167,7 @@ khdr:
 ifeq (1,$(DEFAULT_INSTALL_HDR_PATH))
	$(MAKE) --no-builtin-rules ARCH=$(ARCH) -C $(top_srcdir) headers_install
 else
-	$(MAKE) --no-builtin-rules INSTALL_HDR_PATH=$$BUILD/usr \
+	$(MAKE) --no-builtin-rules INSTALL_HDR_PATH=$(abs_objtree)/usr \
		ARCH=$(ARCH) -C $(top_srcdir) headers_install
 endif

From c71e22072bf0591f5c5d54e7ae15b9566156f4e2 Mon Sep 17 00:00:00 2001
From: Muhammad Usama Anjum
Date: Thu, 27 Jan 2022 16:26:15 +1100
Subject: [PATCH 146/154] selftests: futex: add the uapi headers include variable

An out-of-tree build of this test fails if a relative path to the output
directory is specified. KBUILD_OUTPUT also doesn't point to the correct
directory when a relative path is used, so out-of-tree builds fail.
Remove the unneeded include paths and use KHDR_INCLUDES to correctly
reach the headers.

Link: https://lkml.kernel.org/r/20220119101531.2850400-5-usama.anjum@collabora.com
Signed-off-by: Muhammad Usama Anjum
Cc: Alistair Popple
Cc: André Almeida
Cc: Darren Hart
Cc: Davidlohr Bueso
Cc: "David S. Miller"
Cc: Ingo Molnar
Cc: Jakub Kicinski
Cc: Mat Martineau
Cc: Matthieu Baerts
Cc: Mickaël Salaün
Cc: Minghao Chi
Cc: Paolo Bonzini
Cc: Peter Zijlstra
Cc: Shuah Khan
Cc: Thomas Gleixner
Signed-off-by: Andrew Morton
Signed-off-by: Stephen Rothwell
---
 tools/testing/selftests/futex/functional/Makefile | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/futex/functional/Makefile b/tools/testing/selftests/futex/functional/Makefile
index 5cc38de9d8ea1..2a12b174cb04f 100644
--- a/tools/testing/selftests/futex/functional/Makefile
+++ b/tools/testing/selftests/futex/functional/Makefile
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
-INCLUDES := -I../include -I../../ -I../../../../../usr/include/ \
-	-I$(KBUILD_OUTPUT)/kselftest/usr/include
-CFLAGS := $(CFLAGS) -g -O2 -Wall -D_GNU_SOURCE -pthread $(INCLUDES)
+INCLUDES := -I../include -I../../ -I../../../../../usr/include/
+CFLAGS := $(CFLAGS) -g -O2 -Wall -D_GNU_SOURCE -pthread $(INCLUDES) $(KHDR_INCLUDES)
 LDLIBS := -lpthread -lrt
 
 HEADERS := \

From e9e46cd1237fdea3e618c523ba7d1eaa120dd048 Mon Sep 17 00:00:00 2001
From: Muhammad Usama Anjum
Date: Thu, 27 Jan 2022 16:26:16 +1100
Subject: [PATCH 147/154] selftests: kvm: add the uapi headers include variable

An out-of-tree build of this test fails if a relative path to the output
directory is specified. Add KHDR_INCLUDES to correctly reach the headers.

Link: https://lkml.kernel.org/r/20220119101531.2850400-6-usama.anjum@collabora.com
Signed-off-by: Muhammad Usama Anjum
Cc: Alistair Popple
Cc: André Almeida
Cc: Darren Hart
Cc: Davidlohr Bueso
Cc: "David S. Miller"
Cc: Ingo Molnar
Cc: Jakub Kicinski
Cc: Mat Martineau
Cc: Matthieu Baerts
Cc: Mickaël Salaün
Cc: Minghao Chi
Cc: Paolo Bonzini
Cc: Peter Zijlstra
Cc: Shuah Khan
Cc: Thomas Gleixner
Signed-off-by: Andrew Morton
Signed-off-by: Stephen Rothwell
---
 tools/testing/selftests/kvm/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index 81ebf99d6ff07..97952e1de6555 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -149,7 +149,7 @@ endif
 CFLAGS += -Wall -Wstrict-prototypes -Wuninitialized -O2 -g -std=gnu99 \
	-fno-stack-protector -fno-PIE -I$(LINUX_TOOL_INCLUDE) \
	-I$(LINUX_TOOL_ARCH_INCLUDE) -I$(LINUX_HDR_PATH) -Iinclude \
-	-I$(

From: Muhammad Usama Anjum
Date: Thu, 27 Jan 2022 16:26:16 +1100
Subject: [PATCH 148/154] selftests: landlock: add the uapi headers include variable

An out-of-tree build of this test fails if a relative path to the output
directory is specified. Add the KHDR_INCLUDES to correctly reach the
headers.

Link: https://lkml.kernel.org/r/20220119101531.2850400-7-usama.anjum@collabora.com
Signed-off-by: Muhammad Usama Anjum
Cc: Alistair Popple
Cc: André Almeida
Cc: Darren Hart
Cc: Davidlohr Bueso
Cc: "David S. Miller"
Cc: Ingo Molnar
Cc: Jakub Kicinski
Cc: Mat Martineau
Cc: Matthieu Baerts
Cc: Mickaël Salaün
Cc: Minghao Chi
Cc: Paolo Bonzini
Cc: Peter Zijlstra
Cc: Shuah Khan
Cc: Thomas Gleixner
Signed-off-by: Andrew Morton
Signed-off-by: Stephen Rothwell
---
 tools/testing/selftests/landlock/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/landlock/Makefile b/tools/testing/selftests/landlock/Makefile
index a99596ca9882b..0b0049e133bba 100644
--- a/tools/testing/selftests/landlock/Makefile
+++ b/tools/testing/selftests/landlock/Makefile
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 
-CFLAGS += -Wall -O2
+CFLAGS += -Wall -O2 $(KHDR_INCLUDES)
 
 src_test := $(wildcard *_test.c)

From 0923633fbe1172f91b4f33f10dc0baa6a74f2e3e Mon Sep 17 00:00:00 2001
From: Muhammad Usama Anjum
Date: Thu, 27 Jan 2022 16:26:16 +1100
Subject: [PATCH 149/154] selftests: net: add the uapi headers include variable

An out-of-tree build of this test fails if a relative path to the output
directory is specified. Add the KHDR_INCLUDES to correctly reach the
headers.

Link: https://lkml.kernel.org/r/20220119101531.2850400-8-usama.anjum@collabora.com
Signed-off-by: Muhammad Usama Anjum
Cc: Alistair Popple
Cc: André Almeida
Cc: Darren Hart
Cc: Davidlohr Bueso
Cc: "David S. Miller"
Cc: Ingo Molnar
Cc: Jakub Kicinski
Cc: Mat Martineau
Cc: Matthieu Baerts
Cc: Mickaël Salaün
Cc: Minghao Chi
Cc: Paolo Bonzini
Cc: Peter Zijlstra
Cc: Shuah Khan
Cc: Thomas Gleixner
Signed-off-by: Andrew Morton
Signed-off-by: Stephen Rothwell
---
 tools/testing/selftests/net/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 9897fa9ab9537..0b1488616c551 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -2,7 +2,7 @@
 # Makefile for net selftests
 
 CFLAGS = -Wall -Wl,--no-as-needed -O2 -g
-CFLAGS += -I../../../../usr/include/
+CFLAGS += -I../../../../usr/include/ $(KHDR_INCLUDES)
 
 TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh \
	      rtnetlink.sh xfrm_policy.sh test_blackhole_dev.sh

From 771a2af76e60f6fc70b9aa7c82087085e6a6f018 Mon Sep 17 00:00:00 2001
From: Muhammad Usama Anjum
Date: Thu, 27 Jan 2022 16:26:16 +1100
Subject: [PATCH 150/154] selftests: mptcp: add the uapi headers include variable

An out-of-tree build of this test fails if a relative path to the output
directory is specified. Add the KHDR_INCLUDES to correctly reach the
headers.

Link: https://lkml.kernel.org/r/20220119101531.2850400-9-usama.anjum@collabora.com
Signed-off-by: Muhammad Usama Anjum
Reviewed-by: Matthieu Baerts
Cc: Alistair Popple
Cc: André Almeida
Cc: Darren Hart
Cc: Davidlohr Bueso
Cc: "David S. Miller"
Cc: Ingo Molnar
Cc: Jakub Kicinski
Cc: Mat Martineau
Cc: Mickaël Salaün
Cc: Minghao Chi
Cc: Paolo Bonzini
Cc: Peter Zijlstra
Cc: Shuah Khan
Cc: Thomas Gleixner
Signed-off-by: Andrew Morton
Signed-off-by: Stephen Rothwell
---
 tools/testing/selftests/net/mptcp/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/net/mptcp/Makefile b/tools/testing/selftests/net/mptcp/Makefile
index 0356c4501c990..f905d5358e681 100644
--- a/tools/testing/selftests/net/mptcp/Makefile
+++ b/tools/testing/selftests/net/mptcp/Makefile
@@ -3,7 +3,7 @@
 top_srcdir = ../../../../..
 KSFT_KHDR_INSTALL := 1
 
-CFLAGS = -Wall -Wl,--no-as-needed -O2 -g -I$(top_srcdir)/usr/include
+CFLAGS = -Wall -Wl,--no-as-needed -O2 -g -I$(top_srcdir)/usr/include $(KHDR_INCLUDES)
 
 TEST_PROGS := mptcp_connect.sh pm_netlink.sh mptcp_join.sh diag.sh \
	      simult_flows.sh mptcp_sockopt.sh

From fb8315254fad3a1542d2e39ce50f0d2a4bc73d18 Mon Sep 17 00:00:00 2001
From: Muhammad Usama Anjum
Date: Thu, 27 Jan 2022 16:26:17 +1100
Subject: [PATCH 151/154] selftests: vm: add the uapi headers include variable

An out-of-tree build of this test fails if a relative path to the output
directory is specified. Add the KHDR_INCLUDES to correctly reach the
headers.

Link: https://lkml.kernel.org/r/20220119101531.2850400-10-usama.anjum@collabora.com
Signed-off-by: Muhammad Usama Anjum
Acked-by: Paolo Bonzini
Tested-by: Alistair Popple
Cc: André Almeida
Cc: Darren Hart
Cc: Davidlohr Bueso
Cc: "David S. Miller"
Cc: Ingo Molnar
Cc: Jakub Kicinski
Cc: Mat Martineau
Cc: Matthieu Baerts
Cc: Mickaël Salaün
Cc: Minghao Chi
Cc: Peter Zijlstra
Cc: Shuah Khan
Cc: Thomas Gleixner
Signed-off-by: Andrew Morton
Signed-off-by: Stephen Rothwell
---
 tools/testing/selftests/vm/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index 7d100a7dc4624..96714d2d49dca 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -23,7 +23,7 @@ MACHINE ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/p
 # LDLIBS.
 MAKEFLAGS += --no-builtin-rules
 
-CFLAGS = -Wall -I ../../../../usr/include $(EXTRA_CFLAGS)
+CFLAGS = -Wall -I ../../../../usr/include $(EXTRA_CFLAGS) $(KHDR_INCLUDES)
 LDLIBS = -lrt -lpthread
 
 TEST_GEN_FILES = compaction_test
 TEST_GEN_FILES += gup_test

From 52952d386342bf3ca9ce52c5861a1acfa28db073 Mon Sep 17 00:00:00 2001
From: Muhammad Usama Anjum
Date: Thu, 27 Jan 2022 16:26:17 +1100
Subject: [PATCH 152/154] selftests: vm: remove dependency on internal kernel macros

The definition of swap() is used from a kernel-internal header when this
test is built in the source tree. The build fails when the test is built
out of the source tree, as the definition of swap() isn't found.

Selftests shouldn't depend on the kernel's internal header files; they can
only depend on uapi header files. Add a definition of swap() to fix the
build error:

  gcc -Wall -I/linux_mainline2/build/usr/include -no-pie userfaultfd.c -lrt -lpthread -o /linux_mainline2/build/kselftest/vm/userfaultfd
  userfaultfd.c: In function `userfaultfd_stress':
  userfaultfd.c:1530:3: warning: implicit declaration of function `swap'; did you mean `swab'? [-Wimplicit-function-declaration]
   1530 |   swap(area_src, area_dst);
        |   ^~~~
        |   swab
  /usr/bin/ld: /tmp/cclUUH7V.o: in function `userfaultfd_stress':
  userfaultfd.c:(.text+0x4d64): undefined reference to `swap'
  /usr/bin/ld: userfaultfd.c:(.text+0x4d82): undefined reference to `swap'
  collect2: error: ld returned 1 exit status

Link: https://lkml.kernel.org/r/20220119101531.2850400-11-usama.anjum@collabora.com
Fixes: 2c769ed7137a ("tools/testing/selftests/vm/userfaultfd.c: use swap() to make code cleaner")
Signed-off-by: Muhammad Usama Anjum
Reviewed-by: Alistair Popple
Cc: André Almeida
Cc: Darren Hart
Cc: Davidlohr Bueso
Cc: "David S. Miller"
Cc: Ingo Molnar
Cc: Jakub Kicinski
Cc: Mat Martineau
Cc: Matthieu Baerts
Cc: Mickaël Salaün
Cc: Minghao Chi
Cc: Paolo Bonzini
Cc: Peter Zijlstra
Cc: Shuah Khan
Cc: Thomas Gleixner
Signed-off-by: Andrew Morton
Signed-off-by: Stephen Rothwell
---
 tools/testing/selftests/vm/userfaultfd.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
index d3fd24f9fae8c..d2480ab930372 100644
--- a/tools/testing/selftests/vm/userfaultfd.c
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -119,6 +119,9 @@ struct uffd_stats {
	~(unsigned long)(sizeof(unsigned long long) \
				-  1)))
 
+#define swap(a, b) \
+	do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
+
 const char *examples =
	"# Run anonymous memory test on 100MiB region with 99999 bounces:\n"
	"./userfaultfd anon 100 99999\n\n"

From d2569c50a5341f3d65f2dad19ca0f192481f22b6 Mon Sep 17 00:00:00 2001
From: Waiman Long
Date: Thu, 27 Jan 2022 16:26:17 +1100
Subject: [PATCH 153/154] ipc/mqueue: use get_tree_nodev() in mqueue_get_tree()

When running the stress-ng clone benchmark with multiple testing threads,
it was found that there was significant spinlock contention in sget_fc().
The contended spinlock is sb_lock. It is under heavy contention because
of the following code in the critical section of sget_fc():

	hlist_for_each_entry(old, &fc->fs_type->fs_supers, s_instances) {
		if (test(old, fc))
			goto share_extant_sb;
	}

After testing with added instrumentation code, it was found that the
benchmark could generate thousands of ipc namespaces with the
corresponding number of entries in the mqueue's fs_supers list, where the
namespaces are the key for the search. This leads to excessive time spent
scanning the list for a match.

Looking back at the mqueue calling sequence leading to sget_fc():

	mq_init_ns()
	=> mq_create_mount()
	=> fc_mount()
	=> vfs_get_tree()
	=> mqueue_get_tree()
	=> get_tree_keyed()
	=> vfs_get_super()
	=> sget_fc()

Currently, mq_init_ns() is the only mqueue function that will indirectly
call mqueue_get_tree() with a newly allocated ipc namespace as the key for
searching. As a result, there will never be a match with the existing ipc
namespaces stored in the mqueue's fs_supers list. So using
get_tree_keyed() to do an existing ipc namespace search is just a waste of
time.

Instead, we could use get_tree_nodev() to eliminate the useless search.
By doing so, we can greatly reduce the sb_lock hold time and avoid the
spinlock contention problem in case a large number of ipc namespaces are
present.

Of course, if the code is modified in the future to allow
mqueue_get_tree() to be called with an existing ipc namespace instead of a
new one, we will have to use get_tree_keyed() in this case.

The following stress-ng clone benchmark command was run on a 2-socket
48-core Intel system:

	./stress-ng --clone 32 --verbose --oomable --metrics-brief -t 20

The "bogo ops/s" increased from 5948.45 before the patch to 9137.06 after
it - an increase of 54% in performance.
Link: https://lkml.kernel.org/r/20220121172315.19652-1-longman@redhat.com
Fixes: 935c6912b198 ("ipc: Convert mqueue fs to fs_context")
Signed-off-by: Waiman Long
Cc: Al Viro
Cc: David Howells
Cc: Manfred Spraul
Cc: Davidlohr Bueso
Signed-off-by: Andrew Morton
Signed-off-by: Stephen Rothwell
---
 ipc/mqueue.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 5becca9be867c..089c34d0732cf 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -45,6 +45,7 @@
 
 struct mqueue_fs_context {
	struct ipc_namespace	*ipc_ns;
+	bool			 newns;	/* Set if newly created ipc namespace */
 };
 
 #define MQUEUE_MAGIC	0x19800202
@@ -427,6 +428,14 @@ static int mqueue_get_tree(struct fs_context *fc)
 {
	struct mqueue_fs_context *ctx = fc->fs_private;
 
+	/*
+	 * With a newly created ipc namespace, we don't need to do a search
+	 * for an ipc namespace match, but we still need to set s_fs_info.
+	 */
+	if (ctx->newns) {
+		fc->s_fs_info = ctx->ipc_ns;
+		return get_tree_nodev(fc, mqueue_fill_super);
+	}
	return get_tree_keyed(fc, mqueue_fill_super, ctx->ipc_ns);
 }
 
@@ -454,6 +463,10 @@ static int mqueue_init_fs_context(struct fs_context *fc)
	return 0;
 }
 
+/*
+ * mq_init_ns() is currently the only caller of mq_create_mount().
+ * So the ns parameter is always a newly created ipc namespace.
+ */
 static struct vfsmount *mq_create_mount(struct ipc_namespace *ns)
 {
	struct mqueue_fs_context *ctx;
@@ -465,6 +478,7 @@ static struct vfsmount *mq_create_mount(struct ipc_namespace *ns)
		return ERR_CAST(fc);
 
	ctx = fc->fs_private;
+	ctx->newns = true;
	put_ipc_ns(ctx->ipc_ns);
	ctx->ipc_ns = get_ipc_ns(ns);
	put_user_ns(fc->user_ns);

From 205934f054d267a0b2f3ead6a8ca27900c0307af Mon Sep 17 00:00:00 2001
From: Minghao Chi
Date: Thu, 27 Jan 2022 16:26:17 +1100
Subject: [PATCH 154/154] ipc/sem: do not sleep with a spin lock held

We can't call kvfree() with a spin lock held, so defer it.

Link: https://lkml.kernel.org/r/20211223031207.556189-1-chi.minghao@zte.com.cn
Fixes: fc37a3b8b438 ("[PATCH] ipc sem: use kvmalloc for sem_undo allocation")
Reported-by: Zeal Robot
Signed-off-by: Minghao Chi
Reviewed-by: Shakeel Butt
Reviewed-by: Manfred Spraul
Cc: Arnd Bergmann
Cc: Yang Guang
Cc: Davidlohr Bueso
Cc: Randy Dunlap
Cc: Bhaskar Chowdhury
Cc: Vasily Averin
Signed-off-by: Andrew Morton
Signed-off-by: Stephen Rothwell
---
 ipc/sem.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ipc/sem.c b/ipc/sem.c
index 6693daf4fe112..0dbdb98fdf2d9 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -1964,6 +1964,7 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid)
	 */
	un = lookup_undo(ulp, semid);
	if (un) {
+		spin_unlock(&ulp->lock);
		kvfree(new);
		goto success;
	}
@@ -1976,9 +1977,8 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid)
	ipc_assert_locked_object(&sma->sem_perm);
	list_add(&new->list_id, &sma->list_id);
	un = new;
-
-success:
	spin_unlock(&ulp->lock);
+success:
	sem_unlock(sma, -1);
 out:
	return un;
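The fixed find_alloc_undo() path is reachable from userspace by any
semop() that carries SEM_UNDO. A minimal sketch (not part of the patch)
that exercises it:

    /* semundo_demo.c - a SEM_UNDO operation allocates an undo-list
     * entry via find_alloc_undo(), the function patched above. */
    #include <stdio.h>
    #include <sys/ipc.h>
    #include <sys/sem.h>
    #include <sys/types.h>

    int main(void)
    {
        int id = semget(IPC_PRIVATE, 1, IPC_CREAT | 0600);
        struct sembuf op = { .sem_num = 0, .sem_op = 1,
                             .sem_flg = SEM_UNDO };

        if (id < 0 || semop(id, &op, 1) < 0) {
            perror("sem");
            return 1;
        }
        printf("SEM_UNDO op done; undo entry allocated in this task\n");
        semctl(id, 0, IPC_RMID);  /* undo entries are torn down on exit */
        return 0;
    }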