From f5b23d6704e478b5a97dbba5df9dea96a9cbf847 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Tue, 9 Aug 2022 22:31:02 +0200 Subject: [PATCH 01/46] hfsplus: unmap the page in the "fail_page" label MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "hfsplus: Replace kmap() with kmap_local_page()". kmap() is being deprecated in favor of kmap_local_page(). There are two main problems with kmap(): (1) It comes with an overhead as mapping space is restricted and protected by a global lock for synchronization and (2) it also requires global TLB invalidation when the kmap’s pool wraps and it might block when the mapping space is fully utilized until a slot becomes available. With kmap_local_page() the mappings are per thread, CPU local, can take page faults, and can be called from any context (including interrupts). It is faster than kmap() in kernels with HIGHMEM enabled. Furthermore, the tasks can be preempted and, when they are scheduled to run again, the kernel virtual addresses are restored and still valid. Since its use in fs/hfsplus is safe everywhere, it should be preferred. Therefore, replace kmap() with kmap_local_page() in fs/hfsplus. Where possible, use the suited standard helpers (memzero_page(), memcpy_page()) instead of open coding kmap_local_page() plus memset() or memcpy(). Fix a bug due to a page being not unmapped if the code jumps to the "fail_page" label (1/4). Tested in a QEMU/KVM x86_32 VM, 6GB RAM, booting a kernel with HIGHMEM64GB enabled. This patch (of 4): Several paths within hfs_btree_open() jump to the "fail_page" label where put_page() is called while the page is still mapped. Call kunmap() to unmap the page soon before put_page(). Link: https://lkml.kernel.org/r/20220809203105.26183-1-fmdefrancesco@gmail.com Link: https://lkml.kernel.org/r/20220809203105.26183-2-fmdefrancesco@gmail.com Signed-off-by: Fabio M. De Francesco Reviewed-by: Ira Weiny Reviewed-by: Viacheslav Dubeyko Cc: Matthew Wilcox Cc: Fabio M. De Francesco Cc: Jens Axboe Cc: Bart Van Assche Cc: Kees Cook Cc: Muchun Song Signed-off-by: Andrew Morton --- fs/hfsplus/btree.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index 66774f4cb4fd5..3a917a9a4edd5 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -245,6 +245,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) return tree; fail_page: + kunmap(page); put_page(page); free_inode: tree->inode->i_mapping->a_ops = &hfsplus_aops; From 6c3014a67a44f11dc1020c8b47a1d1d626f007a9 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Tue, 9 Aug 2022 22:31:03 +0200 Subject: [PATCH 02/46] hfsplus: convert kmap() to kmap_local_page() in bnode.c kmap() is being deprecated in favor of kmap_local_page(). Two main problems with kmap(): (1) It comes with an overhead as mapping space is restricted and protected by a global lock for synchronization and (2) it also requires global TLB invalidation when the kmap's pool wraps and it might block when the mapping space is fully utilized until a slot becomes available. With kmap_local_page() the mappings are per thread, CPU local, can take page faults, and can be called from any context (including interrupts). It is faster than kmap() in kernels with HIGHMEM enabled. Furthermore, the tasks can be preempted and, when they are scheduled to run again, the kernel virtual addresses are restored and still valid. Since its use in bnode.c is safe everywhere, it should be preferred. Therefore, replace kmap() with kmap_local_page() in bnode.c. Where possible, use the suited standard helpers (memzero_page(), memcpy_page()) instead of open coding kmap_local_page() plus memset() or memcpy(). Tested in a QEMU/KVM x86_32 VM, 6GB RAM, booting a kernel with HIGHMEM64GB enabled. Link: https://lkml.kernel.org/r/20220809203105.26183-3-fmdefrancesco@gmail.com Signed-off-by: Fabio M. De Francesco Suggested-by: Ira Weiny Reviewed-by: Ira Weiny Reviewed-by: Viacheslav Dubeyko Cc: Bart Van Assche Cc: Jens Axboe Cc: Kees Cook Cc: Matthew Wilcox Cc: Muchun Song Signed-off-by: Andrew Morton --- fs/hfsplus/bnode.c | 105 +++++++++++++++++++++------------------------ 1 file changed, 48 insertions(+), 57 deletions(-) diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index a5ab00e542203..87974d5e67915 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -29,14 +29,12 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len) off &= ~PAGE_MASK; l = min_t(int, len, PAGE_SIZE - off); - memcpy(buf, kmap(*pagep) + off, l); - kunmap(*pagep); + memcpy_from_page(buf, *pagep, off, l); while ((len -= l) != 0) { buf += l; l = min_t(int, len, PAGE_SIZE); - memcpy(buf, kmap(*++pagep), l); - kunmap(*pagep); + memcpy_from_page(buf, *++pagep, 0, l); } } @@ -82,16 +80,14 @@ void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len) off &= ~PAGE_MASK; l = min_t(int, len, PAGE_SIZE - off); - memcpy(kmap(*pagep) + off, buf, l); + memcpy_to_page(*pagep, off, buf, l); set_page_dirty(*pagep); - kunmap(*pagep); while ((len -= l) != 0) { buf += l; l = min_t(int, len, PAGE_SIZE); - memcpy(kmap(*++pagep), buf, l); + memcpy_to_page(*++pagep, 0, buf, l); set_page_dirty(*pagep); - kunmap(*pagep); } } @@ -112,15 +108,13 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len) off &= ~PAGE_MASK; l = min_t(int, len, PAGE_SIZE - off); - memset(kmap(*pagep) + off, 0, l); + memzero_page(*pagep, off, l); set_page_dirty(*pagep); - kunmap(*pagep); while ((len -= l) != 0) { l = min_t(int, len, PAGE_SIZE); - memset(kmap(*++pagep), 0, l); + memzero_page(*++pagep, 0, l); set_page_dirty(*pagep); - kunmap(*pagep); } } @@ -142,24 +136,20 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, if (src == dst) { l = min_t(int, len, PAGE_SIZE - src); - memcpy(kmap(*dst_page) + src, kmap(*src_page) + src, l); - kunmap(*src_page); + memcpy_page(*dst_page, src, *src_page, src, l); set_page_dirty(*dst_page); - kunmap(*dst_page); while ((len -= l) != 0) { l = min_t(int, len, PAGE_SIZE); - memcpy(kmap(*++dst_page), kmap(*++src_page), l); - kunmap(*src_page); + memcpy_page(*++dst_page, 0, *++src_page, 0, l); set_page_dirty(*dst_page); - kunmap(*dst_page); } } else { void *src_ptr, *dst_ptr; do { - src_ptr = kmap(*src_page) + src; - dst_ptr = kmap(*dst_page) + dst; + dst_ptr = kmap_local_page(*dst_page) + dst; + src_ptr = kmap_local_page(*src_page) + src; if (PAGE_SIZE - src < PAGE_SIZE - dst) { l = PAGE_SIZE - src; src = 0; @@ -171,9 +161,9 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, } l = min(len, l); memcpy(dst_ptr, src_ptr, l); - kunmap(*src_page); + kunmap_local(src_ptr); set_page_dirty(*dst_page); - kunmap(*dst_page); + kunmap_local(dst_ptr); if (!dst) dst_page++; else @@ -185,6 +175,7 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) { struct page **src_page, **dst_page; + void *src_ptr, *dst_ptr; int l; hfs_dbg(BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len); @@ -202,27 +193,28 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) if (src == dst) { while (src < len) { - memmove(kmap(*dst_page), kmap(*src_page), src); - kunmap(*src_page); + dst_ptr = kmap_local_page(*dst_page); + src_ptr = kmap_local_page(*src_page); + memmove(dst_ptr, src_ptr, src); + kunmap_local(src_ptr); set_page_dirty(*dst_page); - kunmap(*dst_page); + kunmap_local(dst_ptr); len -= src; src = PAGE_SIZE; src_page--; dst_page--; } src -= len; - memmove(kmap(*dst_page) + src, - kmap(*src_page) + src, len); - kunmap(*src_page); + dst_ptr = kmap_local_page(*dst_page); + src_ptr = kmap_local_page(*src_page); + memmove(dst_ptr + src, src_ptr + src, len); + kunmap_local(src_ptr); set_page_dirty(*dst_page); - kunmap(*dst_page); + kunmap_local(dst_ptr); } else { - void *src_ptr, *dst_ptr; - do { - src_ptr = kmap(*src_page) + src; - dst_ptr = kmap(*dst_page) + dst; + dst_ptr = kmap_local_page(*dst_page) + dst; + src_ptr = kmap_local_page(*src_page) + src; if (src < dst) { l = src; src = PAGE_SIZE; @@ -234,9 +226,9 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) } l = min(len, l); memmove(dst_ptr - l, src_ptr - l, l); - kunmap(*src_page); + kunmap_local(src_ptr); set_page_dirty(*dst_page); - kunmap(*dst_page); + kunmap_local(dst_ptr); if (dst == PAGE_SIZE) dst_page--; else @@ -251,26 +243,27 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) if (src == dst) { l = min_t(int, len, PAGE_SIZE - src); - memmove(kmap(*dst_page) + src, - kmap(*src_page) + src, l); - kunmap(*src_page); + + dst_ptr = kmap_local_page(*dst_page) + src; + src_ptr = kmap_local_page(*src_page) + src; + memmove(dst_ptr, src_ptr, l); + kunmap_local(src_ptr); set_page_dirty(*dst_page); - kunmap(*dst_page); + kunmap_local(dst_ptr); while ((len -= l) != 0) { l = min_t(int, len, PAGE_SIZE); - memmove(kmap(*++dst_page), - kmap(*++src_page), l); - kunmap(*src_page); + dst_ptr = kmap_local_page(*++dst_page); + src_ptr = kmap_local_page(*++src_page); + memmove(dst_ptr, src_ptr, l); + kunmap_local(src_ptr); set_page_dirty(*dst_page); - kunmap(*dst_page); + kunmap_local(dst_ptr); } } else { - void *src_ptr, *dst_ptr; - do { - src_ptr = kmap(*src_page) + src; - dst_ptr = kmap(*dst_page) + dst; + dst_ptr = kmap_local_page(*dst_page) + dst; + src_ptr = kmap_local_page(*src_page) + src; if (PAGE_SIZE - src < PAGE_SIZE - dst) { l = PAGE_SIZE - src; @@ -283,9 +276,9 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) } l = min(len, l); memmove(dst_ptr, src_ptr, l); - kunmap(*src_page); + kunmap_local(src_ptr); set_page_dirty(*dst_page); - kunmap(*dst_page); + kunmap_local(dst_ptr); if (!dst) dst_page++; else @@ -498,14 +491,14 @@ struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num) if (!test_bit(HFS_BNODE_NEW, &node->flags)) return node; - desc = (struct hfs_bnode_desc *)(kmap(node->page[0]) + - node->page_offset); + desc = (struct hfs_bnode_desc *)(kmap_local_page(node->page[0]) + + node->page_offset); node->prev = be32_to_cpu(desc->prev); node->next = be32_to_cpu(desc->next); node->num_recs = be16_to_cpu(desc->num_recs); node->type = desc->type; node->height = desc->height; - kunmap(node->page[0]); + kunmap_local(desc); switch (node->type) { case HFS_NODE_HEADER: @@ -589,14 +582,12 @@ struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num) } pagep = node->page; - memset(kmap(*pagep) + node->page_offset, 0, - min_t(int, PAGE_SIZE, tree->node_size)); + memzero_page(*pagep, node->page_offset, + min_t(int, PAGE_SIZE, tree->node_size)); set_page_dirty(*pagep); - kunmap(*pagep); for (i = 1; i < tree->pages_per_bnode; i++) { - memset(kmap(*++pagep), 0, PAGE_SIZE); + memzero_page(*++pagep, 0, PAGE_SIZE); set_page_dirty(*pagep); - kunmap(*pagep); } clear_bit(HFS_BNODE_NEW, &node->flags); wake_up(&node->lock_wq); From f9ef3b95a305874de0b36b7294f6cc8a0e07951e Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Tue, 9 Aug 2022 22:31:04 +0200 Subject: [PATCH 03/46] hfsplus: convert kmap() to kmap_local_page() in bitmap.c kmap() is being deprecated in favor of kmap_local_page(). There are two main problems with kmap(): (1) It comes with an overhead as mapping space is restricted and protected by a global lock for synchronization and (2) it also requires global TLB invalidation when the kmap's pool wraps and it might block when the mapping space is fully utilized until a slot becomes available. With kmap_local_page() the mappings are per thread, CPU local, can take page faults, and can be called from any context (including interrupts). It is faster than kmap() in kernels with HIGHMEM enabled. Furthermore, the tasks can be preempted and, when they are scheduled to run again, the kernel virtual addresses are restored and are still valid. Since its use in bitmap.c is safe everywhere, it should be preferred. Therefore, replace kmap() with kmap_local_page() in bitmap.c. Tested in a QEMU/KVM x86_32 VM, 6GB RAM, booting a kernel with HIGHMEM64GB enabled. Link: https://lkml.kernel.org/r/20220809203105.26183-4-fmdefrancesco@gmail.com Signed-off-by: Fabio M. De Francesco Suggested-by: Ira Weiny Reviewed-by: Ira Weiny Reviewed-by: Viacheslav Dubeyko Cc: Bart Van Assche Cc: Jens Axboe Cc: Kees Cook Cc: Matthew Wilcox Cc: Muchun Song Signed-off-by: Andrew Morton --- fs/hfsplus/bitmap.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c index cebce0cfe3405..bd8dcea855880 100644 --- a/fs/hfsplus/bitmap.c +++ b/fs/hfsplus/bitmap.c @@ -39,7 +39,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, start = size; goto out; } - pptr = kmap(page); + pptr = kmap_local_page(page); curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32; i = offset % 32; offset &= ~(PAGE_CACHE_BITS - 1); @@ -74,7 +74,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, } curr++; } - kunmap(page); + kunmap_local(pptr); offset += PAGE_CACHE_BITS; if (offset >= size) break; @@ -84,7 +84,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, start = size; goto out; } - curr = pptr = kmap(page); + curr = pptr = kmap_local_page(page); if ((size ^ offset) / PAGE_CACHE_BITS) end = pptr + PAGE_CACHE_BITS / 32; else @@ -127,7 +127,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, len -= 32; } set_page_dirty(page); - kunmap(page); + kunmap_local(pptr); offset += PAGE_CACHE_BITS; page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL); @@ -135,7 +135,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, start = size; goto out; } - pptr = kmap(page); + pptr = kmap_local_page(page); curr = pptr; end = pptr + PAGE_CACHE_BITS / 32; } @@ -151,7 +151,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, done: *curr = cpu_to_be32(n); set_page_dirty(page); - kunmap(page); + kunmap_local(pptr); *max = offset + (curr - pptr) * 32 + i - start; sbi->free_blocks -= *max; hfsplus_mark_mdb_dirty(sb); @@ -185,7 +185,7 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count) page = read_mapping_page(mapping, pnr, NULL); if (IS_ERR(page)) goto kaboom; - pptr = kmap(page); + pptr = kmap_local_page(page); curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32; end = pptr + PAGE_CACHE_BITS / 32; len = count; @@ -215,11 +215,11 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count) if (!count) break; set_page_dirty(page); - kunmap(page); + kunmap_local(pptr); page = read_mapping_page(mapping, ++pnr, NULL); if (IS_ERR(page)) goto kaboom; - pptr = kmap(page); + pptr = kmap_local_page(page); curr = pptr; end = pptr + PAGE_CACHE_BITS / 32; } @@ -231,7 +231,7 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count) } out: set_page_dirty(page); - kunmap(page); + kunmap_local(pptr); sbi->free_blocks += len; hfsplus_mark_mdb_dirty(sb); mutex_unlock(&sbi->alloc_mutex); From 9f25f357c557bfea39a2d6a629a6317d2b3dfc64 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Tue, 9 Aug 2022 22:31:05 +0200 Subject: [PATCH 04/46] hfsplus: convert kmap() to kmap_local_page() in btree.c kmap() is being deprecated in favor of kmap_local_page(). There are two main problems with kmap(): (1) It comes with an overhead as mapping space is restricted and protected by a global lock for synchronization and (2) it also requires global TLB invalidation when the kmap's pool wraps and it might block when the mapping space is fully utilized until a slot becomes available. With kmap_local_page() the mappings are per thread, CPU local, can take page faults, and can be called from any context (including interrupts). It is faster than kmap() in kernels with HIGHMEM enabled. Furthermore, the tasks can be preempted and, when they are scheduled to run again, the kernel virtual addresses are restored and are still valid. Since its use in btree.c is safe everywhere, it should be preferred. Therefore, replace kmap() with kmap_local_page() in btree.c. Tested in a QEMU/KVM x86_32 VM, 6GB RAM, booting a kernel with HIGHMEM64GB enabled. Link: https://lkml.kernel.org/r/20220809203105.26183-5-fmdefrancesco@gmail.com Signed-off-by: Fabio M. De Francesco Suggested-by: Ira Weiny Reviewed-by: Ira Weiny Reviewed-by: Viacheslav Dubeyko Cc: Bart Van Assche Cc: Jens Axboe Cc: Kees Cook Cc: Matthew Wilcox Cc: Muchun Song Signed-off-by: Andrew Morton --- fs/hfsplus/btree.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index 3a917a9a4edd5..9e1732a2b92a8 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -163,7 +163,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) goto free_inode; /* Load the header */ - head = (struct hfs_btree_header_rec *)(kmap(page) + + head = (struct hfs_btree_header_rec *)(kmap_local_page(page) + sizeof(struct hfs_bnode_desc)); tree->root = be32_to_cpu(head->root); tree->leaf_count = be32_to_cpu(head->leaf_count); @@ -240,12 +240,12 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) (tree->node_size + PAGE_SIZE - 1) >> PAGE_SHIFT; - kunmap(page); + kunmap_local(head); put_page(page); return tree; fail_page: - kunmap(page); + kunmap_local(head); put_page(page); free_inode: tree->inode->i_mapping->a_ops = &hfsplus_aops; @@ -292,7 +292,7 @@ int hfs_btree_write(struct hfs_btree *tree) return -EIO; /* Load the header */ page = node->page[0]; - head = (struct hfs_btree_header_rec *)(kmap(page) + + head = (struct hfs_btree_header_rec *)(kmap_local_page(page) + sizeof(struct hfs_bnode_desc)); head->root = cpu_to_be32(tree->root); @@ -304,7 +304,7 @@ int hfs_btree_write(struct hfs_btree *tree) head->attributes = cpu_to_be32(tree->attributes); head->depth = cpu_to_be16(tree->depth); - kunmap(page); + kunmap_local(head); set_page_dirty(page); hfs_bnode_put(node); return 0; @@ -395,7 +395,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) off += node->page_offset; pagep = node->page + (off >> PAGE_SHIFT); - data = kmap(*pagep); + data = kmap_local_page(*pagep); off &= ~PAGE_MASK; idx = 0; @@ -408,7 +408,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) idx += i; data[off] |= m; set_page_dirty(*pagep); - kunmap(*pagep); + kunmap_local(data); tree->free_nodes--; mark_inode_dirty(tree->inode); hfs_bnode_put(node); @@ -418,14 +418,14 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) } } if (++off >= PAGE_SIZE) { - kunmap(*pagep); - data = kmap(*++pagep); + kunmap_local(data); + data = kmap_local_page(*++pagep); off = 0; } idx += 8; len--; } - kunmap(*pagep); + kunmap_local(data); nidx = node->next; if (!nidx) { hfs_dbg(BNODE_MOD, "create new bmap node\n"); @@ -441,7 +441,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) off = off16; off += node->page_offset; pagep = node->page + (off >> PAGE_SHIFT); - data = kmap(*pagep); + data = kmap_local_page(*pagep); off &= ~PAGE_MASK; } } @@ -491,7 +491,7 @@ void hfs_bmap_free(struct hfs_bnode *node) } off += node->page_offset + nidx / 8; page = node->page[off >> PAGE_SHIFT]; - data = kmap(page); + data = kmap_local_page(page); off &= ~PAGE_MASK; m = 1 << (~nidx & 7); byte = data[off]; @@ -499,13 +499,13 @@ void hfs_bmap_free(struct hfs_bnode *node) pr_crit("trying to free free bnode " "%u(%d)\n", node->this, node->type); - kunmap(page); + kunmap_local(data); hfs_bnode_put(node); return; } data[off] = byte & ~m; set_page_dirty(page); - kunmap(page); + kunmap_local(data); hfs_bnode_put(node); tree->free_nodes++; mark_inode_dirty(tree->inode); From 765f2bf04fdaced4e7d7e94cfc3f743048629f31 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 8 Aug 2022 10:59:28 +0200 Subject: [PATCH 05/46] scripts/decodecode: improve faulting line determination There are cases where the IP pointer in a Code: line in an oops doesn't point at the beginning of an instruction: Code: 0f bd c2 e9 a0 cd b5 e4 48 0f bd c2 e9 97 cd b5 e4 0f 1f 80 00 00 00 00 \ e9 8b cd b5 e4 0f 1f 00 66 0f a3 d0 e9 7f cd b5 e4 0f 1f <80> 00 00 00 \ 00 0f a3 d0 e9 70 cd b5 e4 48 0f a3 d0 e9 67 cd b5 e9 7f cd b5 e4 jmp 0xffffffffe4b5cda8 0f 1f 80 00 00 00 00 nopl 0x0(%rax) ^^ and the current way of determining the faulting instruction line doesn't work because disassembled instructions are counted from the IP byte to the end and when that thing points in the middle, the trailing bytes can be interpreted as different insns: Code starting with the faulting instruction =========================================== 0: 80 00 00 addb $0x0,(%rax) 3: 00 00 add %al,(%rax) whereas, this is part of 0f 1f 80 00 00 00 00 nopl 0x0(%rax) 5: 0f a3 d0 bt %edx,%eax ... leading to: 1d: 0f 1f 00 nopl (%rax) 20: 66 0f a3 d0 bt %dx,%ax 24:* e9 7f cd b5 e4 jmp 0xffffffffe4b5cda8 <-- trapping instruction 29: 0f 1f 80 00 00 00 00 nopl 0x0(%rax) 30: 0f a3 d0 bt %edx,%eax which is the wrong faulting instruction. Change the way the faulting line number is determined by matching the opcode bytes from the beginning, leading to correct output: 1d: 0f 1f 00 nopl (%rax) 20: 66 0f a3 d0 bt %dx,%ax 24: e9 7f cd b5 e4 jmp 0xffffffffe4b5cda8 29:* 0f 1f 80 00 00 00 00 nopl 0x0(%rax) <-- trapping instruction 30: 0f a3 d0 bt %edx,%eax While at it, make decodecode use bash as the interpreter - that thing should be present on everything by now. It simplifies the code a lot too. Link: https://lkml.kernel.org/r/20220808085928.29840-1-bp@alien8.de Signed-off-by: Borislav Petkov Cc: Marc Zyngier Cc: Will Deacon Signed-off-by: Andrew Morton --- scripts/decodecode | 120 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 105 insertions(+), 15 deletions(-) diff --git a/scripts/decodecode b/scripts/decodecode index c711a196511c6..b28fd26865617 100755 --- a/scripts/decodecode +++ b/scripts/decodecode @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # SPDX-License-Identifier: GPL-2.0 # Disassemble the Code: line in Linux oopses # usage: decodecode < oops.file @@ -8,6 +8,8 @@ # AFLAGS=--32 decodecode < 386.oops # PC=hex - the PC (program counter) the oops points to +faultlinenum=1 + cleanup() { rm -f $T $T.s $T.o $T.oo $T.aa $T.dis exit 1 @@ -102,28 +104,125 @@ disas() { grep -v "/tmp\|Disassembly\|\.text\|^$" > $t.dis 2>&1 } +# Match the maximum number of opcode bytes from @op_bytes contained within +# @opline +# +# Params: +# @op_bytes: The string of bytes from the Code: line +# @opline: The disassembled line coming from objdump +# +# Returns: +# The max number of opcode bytes from the beginning of @op_bytes which match +# the opcode bytes in the objdump line. +get_substr_opcode_bytes_num() +{ + local op_bytes=$1 + local opline=$2 + + local retval=0 + substr="" + + for opc in $op_bytes; + do + substr+="$opc" + + # return if opcode bytes do not match @opline anymore + if ! echo $opline | grep -q "$substr"; + then + break + fi + + # add trailing space + substr+=" " + retval=$((retval+1)) + done + + return $retval +} + +# Return the line number in objdump output to where the IP marker in the Code: +# line points to +# +# Params: +# @all_code: code in bytes without the marker +# @dis_file: disassembled file +# @ip_byte: The byte to which the IP points to +get_faultlinenum() +{ + local all_code="$1" + local dis_file="$2" + + # num bytes including IP byte + local num_bytes_ip=$(( $3 + 1 * $width )) + + # Add the two header lines (we're counting from 1). + local retval=3 + + # remove marker + all_code=$(echo $all_code | sed -e 's/[<>()]//g') + + while read line + do + get_substr_opcode_bytes_num "$all_code" "$line" + ate_opcodes=$? + + if ! (( $ate_opcodes )); then + continue + fi + + num_bytes_ip=$((num_bytes_ip - ($ate_opcodes * $width) )) + if (( $num_bytes_ip <= 0 )); then + break + fi + + # Delete matched opcode bytes from all_code. For that, compute + # how many chars those opcodes are represented by and include + # trailing space. + # + # a byte is 2 chars, ate_opcodes is also the number of trailing + # spaces + del_chars=$(( ($ate_opcodes * $width * 2) + $ate_opcodes )) + + all_code=$(echo $all_code | sed -e "s!^.\{$del_chars\}!!") + + let "retval+=1" + + done < $dis_file + + return $retval +} + marker=`expr index "$code" "\<"` if [ $marker -eq 0 ]; then marker=`expr index "$code" "\("` fi - touch $T.oo if [ $marker -ne 0 ]; then - # 2 opcode bytes and a single space - pc_sub=$(( $marker / 3 )) + # How many bytes to subtract from the program counter + # in order to get to the beginning virtual address of the + # Code: + pc_sub=$(( (($marker - 1) / (2 * $width + 1)) * $width )) echo All code >> $T.oo echo ======== >> $T.oo beforemark=`echo "$code"` echo -n " .$type 0x" > $T.s + echo $beforemark | sed -e 's/ /,0x/g; s/[<>()]//g' >> $T.s + disas $T $pc_sub + cat $T.dis >> $T.oo - rm -f $T.o $T.s $T.dis -# and fix code at-and-after marker + get_faultlinenum "$code" "$T.dis" $pc_sub + faultlinenum=$? + + # and fix code at-and-after marker code=`echo "$code" | cut -c$((${marker} + 1))-` + + rm -f $T.o $T.s $T.dis fi + echo Code starting with the faulting instruction > $T.aa echo =========================================== >> $T.aa code=`echo $code | sed -e 's/\r//;s/ [<(]/ /;s/[>)] / /;s/ /,0x/g; s/[>)]$//'` @@ -132,15 +231,6 @@ echo $code >> $T.s disas $T 0 cat $T.dis >> $T.aa -# (lines of whole $T.oo) - (lines of $T.aa, i.e. "Code starting") + 3, -# i.e. the title + the "===..=" line (sed is counting from 1, 0 address is -# special) -faultlinenum=$(( $(wc -l $T.oo | cut -d" " -f1) - \ - $(wc -l $T.aa | cut -d" " -f1) + 3)) - -faultline=`cat $T.dis | head -1 | cut -d":" -f2-` -faultline=`echo "$faultline" | sed -e 's/\[/\\\[/g; s/\]/\\\]/g'` - cat $T.oo | sed -e "${faultlinenum}s/^\([^:]*:\)\(.*\)/\1\*\2\t\t<-- trapping instruction/" echo cat $T.aa From 58b5c203360799e181325f3f8ce212de80ebf304 Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Fri, 5 Aug 2022 13:57:33 +0200 Subject: [PATCH 06/46] ipc/util.c: cleanup and improve sysvipc_find_ipc() sysvipc_find_ipc() can be simplified further: - It uses a for() loop to locate the next entry in the idr. This can be replaced with idr_get_next(). - It receives two parameters (pos - which is actually an idr index and not a position, and new_pos, which is really a position). One parameter is sufficient. Link: https://lore.kernel.org/all/20210903052020.3265-3-manfred@colorfullife.com/ Link: https://lkml.kernel.org/r/20220805115733.104763-1-manfred@colorfullife.com Signed-off-by: Manfred Spraul Acked-by: Davidlohr Bueso Acked-by: Waiman Long Cc: "Eric W . Biederman" Cc: <1vier1@web.de> Signed-off-by: Andrew Morton --- ipc/util.c | 53 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/ipc/util.c b/ipc/util.c index a2208d0f26b2d..05cb9de667350 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -782,28 +782,37 @@ struct pid_namespace *ipc_seq_pid_ns(struct seq_file *s) return iter->pid_ns; } -/* - * This routine locks the ipc structure found at least at position pos. +/** + * sysvipc_find_ipc - Find and lock the ipc structure based on seq pos + * @ids: ipc identifier set + * @pos: expected position + * + * The function finds an ipc structure, based on the sequence file + * position @pos. If there is no ipc structure at position @pos, then + * the successor is selected. + * If a structure is found, then it is locked (both rcu_read_lock() and + * ipc_lock_object()) and @pos is set to the position needed to locate + * the found ipc structure. + * If nothing is found (i.e. EOF), @pos is not modified. + * + * The function returns the found ipc structure, or NULL at EOF. */ -static struct kern_ipc_perm *sysvipc_find_ipc(struct ipc_ids *ids, loff_t pos, - loff_t *new_pos) +static struct kern_ipc_perm *sysvipc_find_ipc(struct ipc_ids *ids, loff_t *pos) { - struct kern_ipc_perm *ipc = NULL; - int max_idx = ipc_get_maxidx(ids); + int tmpidx; + struct kern_ipc_perm *ipc; - if (max_idx == -1 || pos > max_idx) - goto out; + /* convert from position to idr index -> "-1" */ + tmpidx = *pos - 1; - for (; pos <= max_idx; pos++) { - ipc = idr_find(&ids->ipcs_idr, pos); - if (ipc != NULL) { - rcu_read_lock(); - ipc_lock_object(ipc); - break; - } + ipc = idr_get_next(&ids->ipcs_idr, &tmpidx); + if (ipc != NULL) { + rcu_read_lock(); + ipc_lock_object(ipc); + + /* convert from idr index to position -> "+1" */ + *pos = tmpidx + 1; } -out: - *new_pos = pos + 1; return ipc; } @@ -817,11 +826,13 @@ static void *sysvipc_proc_next(struct seq_file *s, void *it, loff_t *pos) if (ipc && ipc != SEQ_START_TOKEN) ipc_unlock(ipc); - return sysvipc_find_ipc(&iter->ns->ids[iface->ids], *pos, pos); + /* Next -> search for *pos+1 */ + (*pos)++; + return sysvipc_find_ipc(&iter->ns->ids[iface->ids], pos); } /* - * File positions: pos 0 -> header, pos n -> ipc id = n - 1. + * File positions: pos 0 -> header, pos n -> ipc idx = n - 1. * SeqFile iterator: iterator value locked ipc pointer or SEQ_TOKEN_START. */ static void *sysvipc_proc_start(struct seq_file *s, loff_t *pos) @@ -846,8 +857,8 @@ static void *sysvipc_proc_start(struct seq_file *s, loff_t *pos) if (*pos == 0) return SEQ_START_TOKEN; - /* Find the (pos-1)th ipc */ - return sysvipc_find_ipc(ids, *pos - 1, pos); + /* Otherwise return the correct ipc structure */ + return sysvipc_find_ipc(ids, pos); } static void sysvipc_proc_stop(struct seq_file *s, void *it) From 64367f2e4f11cfdd983c637d219fb364ab85558c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 11 Aug 2022 13:44:34 +0200 Subject: [PATCH 07/46] treewide: defconfig: address renamed CONFIG_DEBUG_INFO=y CONFIG_DEBUG_INFO is now implicitly selected if one picks one of the explicit options that could be DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT, DEBUG_INFO_DWARF4, DEBUG_INFO_DWARF5. This was actually not what I had in mind when I suggested making it a 'choice' statement, but it's too late to change again now, and the Kconfig logic is more sensible in the new form. Change any defconfig file that had CONFIG_DEBUG_INFO enabled but did not pick DWARF4 or DWARF5 explicitly to now pick the toolchain default. Link: https://lkml.kernel.org/r/20220811114609.2097335-1-arnd@kernel.org Fixes: f9b3cd245784 ("Kconfig.debug: make DEBUG_INFO selectable from a choice") Signed-off-by: Arnd Bergmann Reviewed-by: Kees Cook Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Vineet Gupta Cc: Michal Simek Cc: Thomas Bogendoerfer Cc: Dinh Nguyen Cc: Yoshinori Sato Cc: Rich Felker Cc: Richard Weinberger Cc: Anton Ivanov Cc: Johannes Berg Cc: Chris Zankel Cc: Max Filippov Cc: Arnd Bergmann Signed-off-by: Andrew Morton --- arch/alpha/configs/defconfig | 2 +- arch/arc/configs/tb10x_defconfig | 2 +- arch/microblaze/configs/mmu_defconfig | 2 +- arch/mips/configs/bcm47xx_defconfig | 2 +- arch/mips/configs/cavium_octeon_defconfig | 2 +- arch/mips/configs/ci20_defconfig | 2 +- arch/mips/configs/cu1000-neo_defconfig | 2 +- arch/mips/configs/cu1830-neo_defconfig | 2 +- arch/mips/configs/generic_defconfig | 2 +- arch/mips/configs/omega2p_defconfig | 2 +- arch/mips/configs/qi_lb60_defconfig | 2 +- arch/mips/configs/vocore2_defconfig | 2 +- arch/nios2/configs/10m50_defconfig | 2 +- arch/nios2/configs/3c120_defconfig | 2 +- arch/sh/configs/apsh4a3a_defconfig | 2 +- arch/sh/configs/apsh4ad0a_defconfig | 2 +- arch/sh/configs/edosk7760_defconfig | 2 +- arch/sh/configs/magicpanelr2_defconfig | 2 +- arch/sh/configs/polaris_defconfig | 2 +- arch/sh/configs/r7780mp_defconfig | 2 +- arch/sh/configs/r7785rp_defconfig | 2 +- arch/sh/configs/rsk7203_defconfig | 2 +- arch/sh/configs/sdk7780_defconfig | 2 +- arch/sh/configs/se7712_defconfig | 2 +- arch/sh/configs/se7721_defconfig | 2 +- arch/sh/configs/sh2007_defconfig | 2 +- arch/sh/configs/sh7757lcr_defconfig | 2 +- arch/sh/configs/sh7785lcr_32bit_defconfig | 2 +- arch/sh/configs/urquell_defconfig | 2 +- arch/um/configs/i386_defconfig | 2 +- arch/um/configs/x86_64_defconfig | 2 +- arch/xtensa/configs/audio_kc705_defconfig | 2 +- arch/xtensa/configs/cadence_csp_defconfig | 2 +- arch/xtensa/configs/generic_kc705_defconfig | 2 +- arch/xtensa/configs/nommu_kc705_defconfig | 2 +- arch/xtensa/configs/smp_lx200_defconfig | 2 +- arch/xtensa/configs/virt_defconfig | 2 +- arch/xtensa/configs/xip_kc705_defconfig | 2 +- 38 files changed, 38 insertions(+), 38 deletions(-) diff --git a/arch/alpha/configs/defconfig b/arch/alpha/configs/defconfig index 7e93369308800..6a39fe8ce9e5f 100644 --- a/arch/alpha/configs/defconfig +++ b/arch/alpha/configs/defconfig @@ -65,7 +65,7 @@ CONFIG_NFSD=m CONFIG_NLS_CODEPAGE_437=y CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_KERNEL=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_ALPHA_LEGACY_START_ADDRESS=y CONFIG_MATHEMU=y CONFIG_CRYPTO_HMAC=y diff --git a/arch/arc/configs/tb10x_defconfig b/arch/arc/configs/tb10x_defconfig index d93b65008d4af..4a94d1684ed60 100644 --- a/arch/arc/configs/tb10x_defconfig +++ b/arch/arc/configs/tb10x_defconfig @@ -90,7 +90,7 @@ CONFIG_TMPFS=y CONFIG_CONFIGFS_FS=y # CONFIG_MISC_FILESYSTEMS is not set # CONFIG_NETWORK_FILESYSTEMS is not set -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_STRIP_ASM_SYMS=y CONFIG_DEBUG_FS=y CONFIG_HEADERS_INSTALL=y diff --git a/arch/microblaze/configs/mmu_defconfig b/arch/microblaze/configs/mmu_defconfig index 51337fffb9473..8150daf04a76c 100644 --- a/arch/microblaze/configs/mmu_defconfig +++ b/arch/microblaze/configs/mmu_defconfig @@ -83,7 +83,7 @@ CONFIG_CIFS=y CONFIG_CIFS_STATS2=y CONFIG_ENCRYPTED_KEYS=y CONFIG_DMA_CMA=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_KGDB=y CONFIG_KGDB_TESTS=y CONFIG_KGDB_KDB=y diff --git a/arch/mips/configs/bcm47xx_defconfig b/arch/mips/configs/bcm47xx_defconfig index 91ce75edbfb46..22ffde722bb9e 100644 --- a/arch/mips/configs/bcm47xx_defconfig +++ b/arch/mips/configs/bcm47xx_defconfig @@ -72,7 +72,7 @@ CONFIG_LEDS_TRIGGER_TIMER=y CONFIG_LEDS_TRIGGER_DEFAULT_ON=y CONFIG_CRC32_SARWATE=y CONFIG_PRINTK_TIME=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_DEBUG_INFO_REDUCED=y CONFIG_STRIP_ASM_SYMS=y CONFIG_DEBUG_FS=y diff --git a/arch/mips/configs/cavium_octeon_defconfig b/arch/mips/configs/cavium_octeon_defconfig index 97ceaf080c0c7..7f021a327566b 100644 --- a/arch/mips/configs/cavium_octeon_defconfig +++ b/arch/mips/configs/cavium_octeon_defconfig @@ -162,7 +162,7 @@ CONFIG_CRYPTO_SHA1_OCTEON=m CONFIG_CRYPTO_SHA256_OCTEON=m CONFIG_CRYPTO_SHA512_OCTEON=m CONFIG_CRYPTO_DES=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_DEBUG_FS=y CONFIG_MAGIC_SYSRQ=y # CONFIG_SCHED_DEBUG is not set diff --git a/arch/mips/configs/ci20_defconfig b/arch/mips/configs/ci20_defconfig index cc69b215854ea..955b6ac581ab7 100644 --- a/arch/mips/configs/ci20_defconfig +++ b/arch/mips/configs/ci20_defconfig @@ -199,7 +199,7 @@ CONFIG_NLS_UTF8=y CONFIG_DMA_CMA=y CONFIG_CMA_SIZE_MBYTES=32 CONFIG_PRINTK_TIME=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_STRIP_ASM_SYMS=y CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_FS=y diff --git a/arch/mips/configs/cu1000-neo_defconfig b/arch/mips/configs/cu1000-neo_defconfig index 5bd55eb32fe55..1cbc9302e1d14 100644 --- a/arch/mips/configs/cu1000-neo_defconfig +++ b/arch/mips/configs/cu1000-neo_defconfig @@ -113,7 +113,7 @@ CONFIG_PRINTK_TIME=y CONFIG_CONSOLE_LOGLEVEL_DEFAULT=15 CONFIG_CONSOLE_LOGLEVEL_QUIET=15 CONFIG_MESSAGE_LOGLEVEL_DEFAULT=7 -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_STRIP_ASM_SYMS=y CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_FS=y diff --git a/arch/mips/configs/cu1830-neo_defconfig b/arch/mips/configs/cu1830-neo_defconfig index cc69688962e8c..a0f73f3cd6ce7 100644 --- a/arch/mips/configs/cu1830-neo_defconfig +++ b/arch/mips/configs/cu1830-neo_defconfig @@ -116,7 +116,7 @@ CONFIG_PRINTK_TIME=y CONFIG_CONSOLE_LOGLEVEL_DEFAULT=15 CONFIG_CONSOLE_LOGLEVEL_QUIET=15 CONFIG_MESSAGE_LOGLEVEL_DEFAULT=7 -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_STRIP_ASM_SYMS=y CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_FS=y diff --git a/arch/mips/configs/generic_defconfig b/arch/mips/configs/generic_defconfig index 714169e411cf0..bbc0d9b1c3983 100644 --- a/arch/mips/configs/generic_defconfig +++ b/arch/mips/configs/generic_defconfig @@ -83,7 +83,7 @@ CONFIG_ROOT_NFS=y # CONFIG_XZ_DEC_ARMTHUMB is not set # CONFIG_XZ_DEC_SPARC is not set CONFIG_PRINTK_TIME=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_DEBUG_INFO_REDUCED=y CONFIG_DEBUG_FS=y # CONFIG_SCHED_DEBUG is not set diff --git a/arch/mips/configs/omega2p_defconfig b/arch/mips/configs/omega2p_defconfig index fc39ddf610a9b..6a5cb2d6de6b0 100644 --- a/arch/mips/configs/omega2p_defconfig +++ b/arch/mips/configs/omega2p_defconfig @@ -116,7 +116,7 @@ CONFIG_CRYPTO_LZO=y CONFIG_CRC16=y CONFIG_XZ_DEC=y CONFIG_PRINTK_TIME=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_STRIP_ASM_SYMS=y CONFIG_DEBUG_FS=y CONFIG_MAGIC_SYSRQ=y diff --git a/arch/mips/configs/qi_lb60_defconfig b/arch/mips/configs/qi_lb60_defconfig index b4448d0876d57..7e5d9741bd5dd 100644 --- a/arch/mips/configs/qi_lb60_defconfig +++ b/arch/mips/configs/qi_lb60_defconfig @@ -166,7 +166,7 @@ CONFIG_NLS_UTF8=y CONFIG_FONTS=y CONFIG_FONT_SUN8x16=y CONFIG_PRINTK_TIME=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_STRIP_ASM_SYMS=y CONFIG_READABLE_ASM=y CONFIG_KGDB=y diff --git a/arch/mips/configs/vocore2_defconfig b/arch/mips/configs/vocore2_defconfig index a14f8ea5c3867..302cab9bd7bd3 100644 --- a/arch/mips/configs/vocore2_defconfig +++ b/arch/mips/configs/vocore2_defconfig @@ -116,7 +116,7 @@ CONFIG_CRYPTO_LZO=y CONFIG_CRC16=y CONFIG_XZ_DEC=y CONFIG_PRINTK_TIME=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_STRIP_ASM_SYMS=y CONFIG_DEBUG_FS=y CONFIG_MAGIC_SYSRQ=y diff --git a/arch/nios2/configs/10m50_defconfig b/arch/nios2/configs/10m50_defconfig index a7967b4cfb6ed..91c3fce4dc7fe 100644 --- a/arch/nios2/configs/10m50_defconfig +++ b/arch/nios2/configs/10m50_defconfig @@ -74,4 +74,4 @@ CONFIG_NFS_FS=y CONFIG_NFS_V3_ACL=y CONFIG_ROOT_NFS=y CONFIG_SUNRPC_DEBUG=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y diff --git a/arch/nios2/configs/3c120_defconfig b/arch/nios2/configs/3c120_defconfig index 423a0c40a1627..c42ad7e162a36 100644 --- a/arch/nios2/configs/3c120_defconfig +++ b/arch/nios2/configs/3c120_defconfig @@ -71,4 +71,4 @@ CONFIG_NFS_FS=y CONFIG_NFS_V3_ACL=y CONFIG_ROOT_NFS=y CONFIG_SUNRPC_DEBUG=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y diff --git a/arch/sh/configs/apsh4a3a_defconfig b/arch/sh/configs/apsh4a3a_defconfig index 530498f189904..99931a13a74da 100644 --- a/arch/sh/configs/apsh4a3a_defconfig +++ b/arch/sh/configs/apsh4a3a_defconfig @@ -85,7 +85,7 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y # CONFIG_DEBUG_PREEMPT is not set # CONFIG_DEBUG_BUGVERBOSE is not set -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y # CONFIG_FTRACE is not set # CONFIG_CRYPTO_ANSI_CPRNG is not set # CONFIG_CRYPTO_HW is not set diff --git a/arch/sh/configs/apsh4ad0a_defconfig b/arch/sh/configs/apsh4ad0a_defconfig index 6abd9bd701061..d9fb124bf015a 100644 --- a/arch/sh/configs/apsh4ad0a_defconfig +++ b/arch/sh/configs/apsh4ad0a_defconfig @@ -116,7 +116,7 @@ CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_KERNEL=y CONFIG_DEBUG_SHIRQ=y CONFIG_DETECT_HUNG_TASK=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_DEBUG_VM=y CONFIG_DWARF_UNWINDER=y # CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/sh/configs/edosk7760_defconfig b/arch/sh/configs/edosk7760_defconfig index d77f54e906fd0..f427a95bcd21e 100644 --- a/arch/sh/configs/edosk7760_defconfig +++ b/arch/sh/configs/edosk7760_defconfig @@ -107,7 +107,7 @@ CONFIG_DEBUG_SHIRQ=y CONFIG_DETECT_HUNG_TASK=y # CONFIG_SCHED_DEBUG is not set CONFIG_TIMER_STATS=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_CRYPTO=y CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_DES=y diff --git a/arch/sh/configs/magicpanelr2_defconfig b/arch/sh/configs/magicpanelr2_defconfig index 0989ed9295408..ef1d98e35c91f 100644 --- a/arch/sh/configs/magicpanelr2_defconfig +++ b/arch/sh/configs/magicpanelr2_defconfig @@ -84,7 +84,7 @@ CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_KERNEL=y # CONFIG_SCHED_DEBUG is not set CONFIG_DEBUG_KOBJECT=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_FRAME_POINTER=y CONFIG_CRC_CCITT=m CONFIG_CRC16=m diff --git a/arch/sh/configs/polaris_defconfig b/arch/sh/configs/polaris_defconfig index 246408ec7462b..f42e4867ddc1a 100644 --- a/arch/sh/configs/polaris_defconfig +++ b/arch/sh/configs/polaris_defconfig @@ -79,5 +79,5 @@ CONFIG_DETECT_HUNG_TASK=y CONFIG_DEBUG_RT_MUTEXES=y CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_DEBUG_SPINLOCK_SLEEP=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_DEBUG_SG=y diff --git a/arch/sh/configs/r7780mp_defconfig b/arch/sh/configs/r7780mp_defconfig index f823cc6b18f91..e527cd60a1910 100644 --- a/arch/sh/configs/r7780mp_defconfig +++ b/arch/sh/configs/r7780mp_defconfig @@ -101,7 +101,7 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y CONFIG_DETECT_HUNG_TASK=y # CONFIG_DEBUG_PREEMPT is not set -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_CRYPTO_ECB=m CONFIG_CRYPTO_PCBC=m CONFIG_CRYPTO_HMAC=y diff --git a/arch/sh/configs/r7785rp_defconfig b/arch/sh/configs/r7785rp_defconfig index f96bc20d4b1ac..a3f952a83d970 100644 --- a/arch/sh/configs/r7785rp_defconfig +++ b/arch/sh/configs/r7785rp_defconfig @@ -96,7 +96,7 @@ CONFIG_DEBUG_KERNEL=y # CONFIG_DEBUG_PREEMPT is not set CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_DEBUG_LOCKING_API_SELFTESTS=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_SH_STANDARD_BIOS=y CONFIG_DEBUG_STACK_USAGE=y CONFIG_4KSTACKS=y diff --git a/arch/sh/configs/rsk7203_defconfig b/arch/sh/configs/rsk7203_defconfig index 5a54e2b883f0a..d00fafc021e1a 100644 --- a/arch/sh/configs/rsk7203_defconfig +++ b/arch/sh/configs/rsk7203_defconfig @@ -112,7 +112,7 @@ CONFIG_DETECT_HUNG_TASK=y CONFIG_DEBUG_OBJECTS=y CONFIG_DEBUG_MUTEXES=y CONFIG_DEBUG_SPINLOCK_SLEEP=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_DEBUG_VM=y CONFIG_DEBUG_LIST=y CONFIG_DEBUG_SG=y diff --git a/arch/sh/configs/sdk7780_defconfig b/arch/sh/configs/sdk7780_defconfig index 7d6d323598480..41cb588ca99cb 100644 --- a/arch/sh/configs/sdk7780_defconfig +++ b/arch/sh/configs/sdk7780_defconfig @@ -131,7 +131,7 @@ CONFIG_DEBUG_KERNEL=y CONFIG_DETECT_HUNG_TASK=y # CONFIG_SCHED_DEBUG is not set CONFIG_TIMER_STATS=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_SH_STANDARD_BIOS=y CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_DES=y diff --git a/arch/sh/configs/se7712_defconfig b/arch/sh/configs/se7712_defconfig index ee6d28ae08deb..36356223d51c8 100644 --- a/arch/sh/configs/se7712_defconfig +++ b/arch/sh/configs/se7712_defconfig @@ -93,7 +93,7 @@ CONFIG_CRAMFS=y CONFIG_NFS_FS=y CONFIG_ROOT_NFS=y CONFIG_DEBUG_KERNEL=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_FRAME_POINTER=y CONFIG_CRYPTO_ECB=m CONFIG_CRYPTO_PCBC=m diff --git a/arch/sh/configs/se7721_defconfig b/arch/sh/configs/se7721_defconfig index bad921bc10f8f..46c5a263a2392 100644 --- a/arch/sh/configs/se7721_defconfig +++ b/arch/sh/configs/se7721_defconfig @@ -121,7 +121,7 @@ CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_CODEPAGE_932=y CONFIG_NLS_ISO8859_1=y CONFIG_DEBUG_KERNEL=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_FRAME_POINTER=y # CONFIG_CRYPTO_ANSI_CPRNG is not set CONFIG_CRC_CCITT=y diff --git a/arch/sh/configs/sh2007_defconfig b/arch/sh/configs/sh2007_defconfig index 79f02f1c0dc83..259c69e3fa227 100644 --- a/arch/sh/configs/sh2007_defconfig +++ b/arch/sh/configs/sh2007_defconfig @@ -159,7 +159,7 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y # CONFIG_DETECT_SOFTLOCKUP is not set # CONFIG_SCHED_DEBUG is not set -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_FRAME_POINTER=y CONFIG_SH_STANDARD_BIOS=y CONFIG_CRYPTO_NULL=y diff --git a/arch/sh/configs/sh7757lcr_defconfig b/arch/sh/configs/sh7757lcr_defconfig index a2700ab165afb..2579dc4bc0c8f 100644 --- a/arch/sh/configs/sh7757lcr_defconfig +++ b/arch/sh/configs/sh7757lcr_defconfig @@ -80,6 +80,6 @@ CONFIG_NLS_ISO8859_1=y CONFIG_DEBUG_KERNEL=y # CONFIG_SCHED_DEBUG is not set # CONFIG_DEBUG_BUGVERBOSE is not set -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y # CONFIG_FTRACE is not set # CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/sh/configs/sh7785lcr_32bit_defconfig b/arch/sh/configs/sh7785lcr_32bit_defconfig index 7eb3c10f28ad2..781ff13227fc9 100644 --- a/arch/sh/configs/sh7785lcr_32bit_defconfig +++ b/arch/sh/configs/sh7785lcr_32bit_defconfig @@ -141,7 +141,7 @@ CONFIG_DEBUG_KMEMLEAK=y CONFIG_DEBUG_SPINLOCK=y CONFIG_DEBUG_MUTEXES=y CONFIG_DEBUG_SPINLOCK_SLEEP=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_LATENCYTOP=y # CONFIG_FTRACE is not set CONFIG_CRYPTO_HMAC=y diff --git a/arch/sh/configs/urquell_defconfig b/arch/sh/configs/urquell_defconfig index cb2f56468fe02..ea773c764c5a8 100644 --- a/arch/sh/configs/urquell_defconfig +++ b/arch/sh/configs/urquell_defconfig @@ -139,7 +139,7 @@ CONFIG_PRINTK_TIME=y CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y CONFIG_DETECT_HUNG_TASK=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_FRAME_POINTER=y # CONFIG_FTRACE is not set # CONFIG_DUMP_CODE is not set diff --git a/arch/um/configs/i386_defconfig b/arch/um/configs/i386_defconfig index fb51bd206dbed..c0162286d68b7 100644 --- a/arch/um/configs/i386_defconfig +++ b/arch/um/configs/i386_defconfig @@ -69,5 +69,5 @@ CONFIG_JOLIET=y CONFIG_PROC_KCORE=y CONFIG_TMPFS=y CONFIG_NLS=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_DEBUG_KERNEL=y diff --git a/arch/um/configs/x86_64_defconfig b/arch/um/configs/x86_64_defconfig index 477b873174243..bec6e5d956873 100644 --- a/arch/um/configs/x86_64_defconfig +++ b/arch/um/configs/x86_64_defconfig @@ -67,6 +67,6 @@ CONFIG_JOLIET=y CONFIG_PROC_KCORE=y CONFIG_TMPFS=y CONFIG_NLS=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_FRAME_WARN=1024 CONFIG_DEBUG_KERNEL=y diff --git a/arch/xtensa/configs/audio_kc705_defconfig b/arch/xtensa/configs/audio_kc705_defconfig index 3be62da8089b8..ef0ebcfbccf91 100644 --- a/arch/xtensa/configs/audio_kc705_defconfig +++ b/arch/xtensa/configs/audio_kc705_defconfig @@ -120,7 +120,7 @@ CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y CONFIG_PRINTK_TIME=y CONFIG_DYNAMIC_DEBUG=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_MAGIC_SYSRQ=y CONFIG_LOCKUP_DETECTOR=y # CONFIG_SCHED_DEBUG is not set diff --git a/arch/xtensa/configs/cadence_csp_defconfig b/arch/xtensa/configs/cadence_csp_defconfig index fc240737b14de..2665962d247a7 100644 --- a/arch/xtensa/configs/cadence_csp_defconfig +++ b/arch/xtensa/configs/cadence_csp_defconfig @@ -100,7 +100,7 @@ CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y CONFIG_PRINTK_TIME=y CONFIG_DYNAMIC_DEBUG=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_MAGIC_SYSRQ=y CONFIG_LOCKUP_DETECTOR=y # CONFIG_SCHED_DEBUG is not set diff --git a/arch/xtensa/configs/generic_kc705_defconfig b/arch/xtensa/configs/generic_kc705_defconfig index e9d6b6f6eca11..236c7f23cc10a 100644 --- a/arch/xtensa/configs/generic_kc705_defconfig +++ b/arch/xtensa/configs/generic_kc705_defconfig @@ -107,7 +107,7 @@ CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y CONFIG_PRINTK_TIME=y CONFIG_DYNAMIC_DEBUG=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_MAGIC_SYSRQ=y CONFIG_LOCKUP_DETECTOR=y # CONFIG_SCHED_DEBUG is not set diff --git a/arch/xtensa/configs/nommu_kc705_defconfig b/arch/xtensa/configs/nommu_kc705_defconfig index fcb620ef37997..8263da9e078d7 100644 --- a/arch/xtensa/configs/nommu_kc705_defconfig +++ b/arch/xtensa/configs/nommu_kc705_defconfig @@ -105,7 +105,7 @@ CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y CONFIG_PRINTK_TIME=y CONFIG_DYNAMIC_DEBUG=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y # CONFIG_FRAME_POINTER is not set CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_VM=y diff --git a/arch/xtensa/configs/smp_lx200_defconfig b/arch/xtensa/configs/smp_lx200_defconfig index a47c85638ec11..7bdffa3a69c60 100644 --- a/arch/xtensa/configs/smp_lx200_defconfig +++ b/arch/xtensa/configs/smp_lx200_defconfig @@ -111,7 +111,7 @@ CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y CONFIG_PRINTK_TIME=y CONFIG_DYNAMIC_DEBUG=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_VM=y CONFIG_LOCKUP_DETECTOR=y diff --git a/arch/xtensa/configs/virt_defconfig b/arch/xtensa/configs/virt_defconfig index 6d1387dfa96fc..98acb7191cb77 100644 --- a/arch/xtensa/configs/virt_defconfig +++ b/arch/xtensa/configs/virt_defconfig @@ -97,7 +97,7 @@ CONFIG_CRYPTO_DEV_VIRTIO=y CONFIG_FONTS=y CONFIG_PRINTK_TIME=y CONFIG_DYNAMIC_DEBUG=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_MAGIC_SYSRQ=y # CONFIG_SCHED_DEBUG is not set CONFIG_SCHEDSTATS=y diff --git a/arch/xtensa/configs/xip_kc705_defconfig b/arch/xtensa/configs/xip_kc705_defconfig index 062148e171351..1c3cebaaa71ba 100644 --- a/arch/xtensa/configs/xip_kc705_defconfig +++ b/arch/xtensa/configs/xip_kc705_defconfig @@ -102,7 +102,7 @@ CONFIG_CRYPTO_LZO=y CONFIG_CRYPTO_ANSI_CPRNG=y CONFIG_PRINTK_TIME=y CONFIG_DYNAMIC_DEBUG=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_MAGIC_SYSRQ=y CONFIG_DETECT_HUNG_TASK=y # CONFIG_SCHED_DEBUG is not set From 5bb6ce3aeb02497668a4e8971268b041aa61de8d Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Mon, 1 Aug 2022 14:27:09 +0200 Subject: [PATCH 08/46] fs/isofs: replace kmap() with kmap_local_page() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The use of kmap() is being deprecated in favor of kmap_local_page(). There are two main problems with kmap(): (1) It comes with an overhead as mapping space is restricted and protected by a global lock for synchronization and (2) it also requires global TLB invalidation when the kmap's pool wraps and it might block when the mapping space is fully utilized until a slot becomes available. With kmap_local_page() the mappings are per thread, CPU local, can take page faults, and can be called from any context (including interrupts). Tasks can be preempted and, when scheduled to run again, the kernel virtual addresses are restored and still valid. It is faster than kmap() in kernels with HIGHMEM enabled. Since kmap_local_page() can be safely used in compress.c, it should be called everywhere instead of kmap(). Therefore, replace kmap() with kmap_local_page() in compress.c. Where it is needed, use memzero_page() instead of open coding kmap_local_page() plus memset() to fill the pages with zeros. Delete the redundant flush_dcache_page() in the two call sites of memzero_page(). Tested with mkisofs on a QEMU/KVM x86_32 VM, 6GB RAM, booting a kernel with HIGHMEM64GB enabled. Link: https://lkml.kernel.org/r/20220801122709.8164-1-fmdefrancesco@gmail.com Signed-off-by: Fabio M. De Francesco Suggested-by: Ira Weiny Reviewed-by: Jan Kara Cc: Matthew Wilcox (Oracle) Cc: Roman Gushchin Cc: Pali Rohár Cc: Muchun Song Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- fs/isofs/compress.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c index b466172eec25b..09285e82eb08c 100644 --- a/fs/isofs/compress.c +++ b/fs/isofs/compress.c @@ -67,8 +67,7 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, for ( i = 0 ; i < pcount ; i++ ) { if (!pages[i]) continue; - memset(page_address(pages[i]), 0, PAGE_SIZE); - flush_dcache_page(pages[i]); + memzero_page(pages[i], 0, PAGE_SIZE); SetPageUptodate(pages[i]); } return ((loff_t)pcount) << PAGE_SHIFT; @@ -120,7 +119,7 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, zerr != Z_STREAM_END) { if (!stream.avail_out) { if (pages[curpage]) { - stream.next_out = page_address(pages[curpage]) + stream.next_out = kmap_local_page(pages[curpage]) + poffset; stream.avail_out = PAGE_SIZE - poffset; poffset = 0; @@ -176,6 +175,10 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, flush_dcache_page(pages[curpage]); SetPageUptodate(pages[curpage]); } + if (stream.next_out != (unsigned char *)zisofs_sink_page) { + kunmap_local(stream.next_out); + stream.next_out = NULL; + } curpage++; } if (!stream.avail_in) @@ -183,6 +186,8 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, } inflate_out: zlib_inflateEnd(&stream); + if (stream.next_out && stream.next_out != (unsigned char *)zisofs_sink_page) + kunmap_local(stream.next_out); z_eio: mutex_unlock(&zisofs_zlib_lock); @@ -283,9 +288,7 @@ static int zisofs_fill_pages(struct inode *inode, int full_page, int pcount, } if (poffset && *pages) { - memset(page_address(*pages) + poffset, 0, - PAGE_SIZE - poffset); - flush_dcache_page(*pages); + memzero_page(*pages, poffset, PAGE_SIZE - poffset); SetPageUptodate(*pages); } return 0; @@ -343,10 +346,8 @@ static int zisofs_read_folio(struct file *file, struct folio *folio) for (i = 0; i < pcount; i++, index++) { if (i != full_page) pages[i] = grab_cache_page_nowait(mapping, index); - if (pages[i]) { + if (pages[i]) ClearPageError(pages[i]); - kmap(pages[i]); - } } err = zisofs_fill_pages(inode, full_page, pcount, pages); @@ -357,7 +358,6 @@ static int zisofs_read_folio(struct file *file, struct folio *folio) flush_dcache_page(pages[i]); if (i == full_page && err) SetPageError(pages[i]); - kunmap(pages[i]); unlock_page(pages[i]); if (i != full_page) put_page(pages[i]); From defdaff15a84c68521c5f02b157fc8541e0356f3 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Sat, 13 Aug 2022 15:00:34 -0700 Subject: [PATCH 09/46] checkpatch: add kmap and kmap_atomic to the deprecated list kmap() and kmap_atomic() are being deprecated in favor of kmap_local_page(). There are two main problems with kmap(): (1) It comes with an overhead as mapping space is restricted and protected by a global lock for synchronization and (2) it also requires global TLB invalidation when the kmap's pool wraps and it might block when the mapping space is fully utilized until a slot becomes available. kmap_local_page() is safe from any context and is therefore redundant with kmap_atomic() with the exception of any pagefault or preemption disable requirements. However, using kmap_atomic() for these side effects makes the code less clear. So any requirement for pagefault or preemption disable should be made explicitly. With kmap_local_page() the mappings are per thread, CPU local, can take page faults, and can be called from any context (including interrupts). It is faster than kmap() in kernels with HIGHMEM enabled. Furthermore, the tasks can be preempted and, when they are scheduled to run again, the kernel virtual addresses are restored. Link: https://lkml.kernel.org/r/20220813220034.806698-1-ira.weiny@intel.com Signed-off-by: Ira Weiny Suggested-by: Thomas Gleixner Suggested-by: Fabio M. De Francesco Reviewed-by: Chaitanya Kulkarni Cc: Joe Perches Signed-off-by: Andrew Morton --- scripts/checkpatch.pl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 79e759aac543b..9ff219e0a9d56 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -807,6 +807,8 @@ sub find_standard_signature { "rcu_barrier_sched" => "rcu_barrier", "get_state_synchronize_sched" => "get_state_synchronize_rcu", "cond_synchronize_sched" => "cond_synchronize_rcu", + "kmap" => "kmap_local_page", + "kmap_atomic" => "kmap_local_page", ); #Create a search pattern for all these strings to speed up a loop below From 9847f21225c4eb0b843cb2b72ed83b32edb1e6f2 Mon Sep 17 00:00:00 2001 From: Neel Natu Date: Thu, 28 Jul 2022 16:24:34 -0700 Subject: [PATCH 10/46] lib/cmdline: avoid page fault in next_arg An argument list like "arg=val arg2 \"" can trigger a page fault if the page pointed by 'args[0xffffffff]' is not mapped and potential memory corruption otherwise (unlikely but possible if the bogus address is mapped and contents happen to match the ascii value of the quote character). The fix is to ensure that we load 'args[i-1]' only when (i > 0). Prior to this commit the following command would trigger an unhandled page fault in the kernel: root@(none):/linus/fs/fat# insmod ./fat.ko "foo=bar \"" [ 33.870507] BUG: unable to handle page fault for address: ffff888204252608 [ 33.872180] #PF: supervisor read access in kernel mode [ 33.873414] #PF: error_code(0x0000) - not-present page [ 33.874650] PGD 4401067 P4D 4401067 PUD 0 [ 33.875321] Oops: 0000 [#1] SMP DEBUG_PAGEALLOC PTI [ 33.876113] CPU: 16 PID: 399 Comm: insmod Not tainted 5.19.0-dbg-DEV #4 [ 33.877193] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-debian-1.16.0-4 04/01/2014 [ 33.878739] RIP: 0010:next_arg+0xd1/0x110 [ 33.879399] Code: 22 75 1d 41 c6 04 01 00 41 80 f8 22 74 18 eb 35 4c 89 0e 45 31 d2 4c 89 cf 48 c7 02 00 00 00 00 41 80 f8 22 75 1f 41 8d 42 ff <41> 80 3c 01 22 75 14 41 c6 04 01 00 eb 0d 48 c7 02 00 00 00 00 41 [ 33.882338] RSP: 0018:ffffc90001253d08 EFLAGS: 00010246 [ 33.883174] RAX: 00000000ffffffff RBX: ffff888104252608 RCX: 0fc317bba1c1dd00 [ 33.884311] RDX: ffffc90001253d40 RSI: ffffc90001253d48 RDI: ffff888104252609 [ 33.885450] RBP: ffffc90001253d10 R08: 0000000000000022 R09: ffff888104252609 [ 33.886595] R10: 0000000000000000 R11: ffffffff82c7ff20 R12: 0000000000000282 [ 33.887748] R13: 00000000ffff8000 R14: 0000000000000000 R15: 0000000000007fff [ 33.888887] FS: 00007f04ec7432c0(0000) GS:ffff88813d300000(0000) knlGS:0000000000000000 [ 33.890183] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 33.891111] CR2: ffff888204252608 CR3: 0000000100f36005 CR4: 0000000000170ee0 [ 33.892241] Call Trace: [ 33.892641] [ 33.892989] parse_args+0x8f/0x220 [ 33.893538] load_module+0x138b/0x15a0 [ 33.894149] ? prepare_coming_module+0x50/0x50 [ 33.894879] ? kernel_read_file_from_fd+0x5f/0x90 [ 33.895639] __se_sys_finit_module+0xce/0x130 [ 33.896342] __x64_sys_finit_module+0x1d/0x20 [ 33.897042] do_syscall_64+0x44/0xa0 [ 33.897622] entry_SYSCALL_64_after_hwframe+0x63/0xcd [ 33.898434] RIP: 0033:0x7f04ec85ef79 [ 33.899009] Code: 48 8d 3d da db 0d 00 0f 05 eb a5 66 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d c7 9e 0d 00 f7 d8 64 89 01 48 [ 33.901912] RSP: 002b:00007fffae81bfe8 EFLAGS: 00000246 ORIG_RAX: 0000000000000139 [ 33.903081] RAX: ffffffffffffffda RBX: 0000559c5f1d2640 RCX: 00007f04ec85ef79 [ 33.904191] RDX: 0000000000000000 RSI: 0000559c5f1d12a0 RDI: 0000000000000003 [ 33.905304] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 [ 33.906421] R10: 0000000000000003 R11: 0000000000000246 R12: 0000559c5f1d12a0 [ 33.907526] R13: 0000000000000000 R14: 0000559c5f1d25f0 R15: 0000559c5f1d12a0 [ 33.908631] [ 33.908986] Modules linked in: fat(+) [last unloaded: fat] [ 33.909843] CR2: ffff888204252608 [ 33.910375] ---[ end trace 0000000000000000 ]--- [ 33.911172] RIP: 0010:next_arg+0xd1/0x110 [ 33.911796] Code: 22 75 1d 41 c6 04 01 00 41 80 f8 22 74 18 eb 35 4c 89 0e 45 31 d2 4c 89 cf 48 c7 02 00 00 00 00 41 80 f8 22 75 1f 41 8d 42 ff <41> 80 3c 01 22 75 14 41 c6 04 01 00 eb 0d 48 c7 02 00 00 00 00 41 [ 33.914643] RSP: 0018:ffffc90001253d08 EFLAGS: 00010246 [ 33.915446] RAX: 00000000ffffffff RBX: ffff888104252608 RCX: 0fc317bba1c1dd00 [ 33.916544] RDX: ffffc90001253d40 RSI: ffffc90001253d48 RDI: ffff888104252609 [ 33.917636] RBP: ffffc90001253d10 R08: 0000000000000022 R09: ffff888104252609 [ 33.918727] R10: 0000000000000000 R11: ffffffff82c7ff20 R12: 0000000000000282 [ 33.919821] R13: 00000000ffff8000 R14: 0000000000000000 R15: 0000000000007fff [ 33.920908] FS: 00007f04ec7432c0(0000) GS:ffff88813d300000(0000) knlGS:0000000000000000 [ 33.922125] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 33.923017] CR2: ffff888204252608 CR3: 0000000100f36005 CR4: 0000000000170ee0 [ 33.924098] Kernel panic - not syncing: Fatal exception [ 33.925776] Kernel Offset: disabled [ 33.926347] Rebooting in 10 seconds.. Link: https://lkml.kernel.org/r/20220728232434.1666488-1-neelnatu@google.com Signed-off-by: Neel Natu Reviewed-by: Eric Dumazet Signed-off-by: Andrew Morton --- lib/cmdline.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/cmdline.c b/lib/cmdline.c index 5546bf5887806..90ed997d95701 100644 --- a/lib/cmdline.c +++ b/lib/cmdline.c @@ -260,7 +260,7 @@ char *next_arg(char *args, char **param, char **val) args[i-1] = '\0'; } } - if (quoted && args[i-1] == '"') + if (quoted && i > 0 && args[i-1] == '"') args[i-1] = '\0'; if (args[i]) { From 7bb5da0d490b2d836c5218f5186ee588d2145310 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Thu, 30 Jun 2022 23:32:57 +0100 Subject: [PATCH 11/46] kexec: turn all kexec_mutex acquisitions into trylocks Patch series "kexec, panic: Making crash_kexec() NMI safe", v4. This patch (of 2): Most acquistions of kexec_mutex are done via mutex_trylock() - those were a direct "translation" from: 8c5a1cf0ad3a ("kexec: use a mutex for locking rather than xchg()") there have however been two additions since then that use mutex_lock(): crash_get_memory_size() and crash_shrink_memory(). A later commit will replace said mutex with an atomic variable, and locking operations will become atomic_cmpxchg(). Rather than having those mutex_lock() become while (atomic_cmpxchg(&lock, 0, 1)), turn them into trylocks that can return -EBUSY on acquisition failure. This does halve the printable size of the crash kernel, but that's still neighbouring 2G for 32bit kernels which should be ample enough. Link: https://lkml.kernel.org/r/20220630223258.4144112-1-vschneid@redhat.com Link: https://lkml.kernel.org/r/20220630223258.4144112-2-vschneid@redhat.com Signed-off-by: Valentin Schneider Cc: Arnd Bergmann Cc: "Eric W . Biederman" Cc: Juri Lelli Cc: Luis Claudio R. Goncalves Cc: Miaohe Lin Cc: Petr Mladek Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Cc: Baoquan He Signed-off-by: Andrew Morton --- include/linux/kexec.h | 2 +- kernel/kexec_core.c | 12 ++++++++---- kernel/ksysfs.c | 7 ++++++- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 13e6c4b58f07d..41a686996aaa3 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -427,7 +427,7 @@ extern int kexec_load_disabled; extern bool kexec_in_progress; int crash_shrink_memory(unsigned long new_size); -size_t crash_get_memory_size(void); +ssize_t crash_get_memory_size(void); #ifndef arch_kexec_protect_crashkres /* diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index acd029b307e42..8fa4eeb95b305 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1004,13 +1004,16 @@ void crash_kexec(struct pt_regs *regs) } } -size_t crash_get_memory_size(void) +ssize_t crash_get_memory_size(void) { - size_t size = 0; + ssize_t size = 0; + + if (!mutex_trylock(&kexec_mutex)) + return -EBUSY; - mutex_lock(&kexec_mutex); if (crashk_res.end != crashk_res.start) size = resource_size(&crashk_res); + mutex_unlock(&kexec_mutex); return size; } @@ -1022,7 +1025,8 @@ int crash_shrink_memory(unsigned long new_size) unsigned long old_size; struct resource *ram_res; - mutex_lock(&kexec_mutex); + if (!mutex_trylock(&kexec_mutex)) + return -EBUSY; if (kexec_crash_image) { ret = -ENOENT; diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index b1292a57c2a53..65dba9076f312 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -105,7 +105,12 @@ KERNEL_ATTR_RO(kexec_crash_loaded); static ssize_t kexec_crash_size_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%zu\n", crash_get_memory_size()); + ssize_t size = crash_get_memory_size(); + + if (size < 0) + return size; + + return sprintf(buf, "%zd\n", size); } static ssize_t kexec_crash_size_store(struct kobject *kobj, struct kobj_attribute *attr, From 05c6257433b7212f07a7e53479a8ab038fc1666a Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Thu, 30 Jun 2022 23:32:58 +0100 Subject: [PATCH 12/46] panic, kexec: make __crash_kexec() NMI safe Attempting to get a crash dump out of a debug PREEMPT_RT kernel via an NMI panic() doesn't work. The cause of that lies in the PREEMPT_RT definition of mutex_trylock(): if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task())) return 0; This prevents an nmi_panic() from executing the main body of __crash_kexec() which does the actual kexec into the kdump kernel. The warning and return are explained by: 6ce47fd961fa ("rtmutex: Warn if trylock is called from hard/softirq context") [...] The reasons for this are: 1) There is a potential deadlock in the slowpath 2) Another cpu which blocks on the rtmutex will boost the task which allegedly locked the rtmutex, but that cannot work because the hard/softirq context borrows the task context. Furthermore, grabbing the lock isn't NMI safe, so do away with kexec_mutex and replace it with an atomic variable. This is somewhat overzealous as *some* callsites could keep using a mutex (e.g. the sysfs-facing ones like crash_shrink_memory()), but this has the benefit of involving a single unified lock and preventing any future NMI-related surprises. Tested by triggering NMI panics via: $ echo 1 > /proc/sys/kernel/panic_on_unrecovered_nmi $ echo 1 > /proc/sys/kernel/unknown_nmi_panic $ echo 1 > /proc/sys/kernel/panic $ ipmitool power diag Link: https://lkml.kernel.org/r/20220630223258.4144112-3-vschneid@redhat.com Fixes: 6ce47fd961fa ("rtmutex: Warn if trylock is called from hard/softirq context") Signed-off-by: Valentin Schneider Cc: Arnd Bergmann Cc: Baoquan He Cc: "Eric W . Biederman" Cc: Juri Lelli Cc: Luis Claudio R. Goncalves Cc: Miaohe Lin Cc: Petr Mladek Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- kernel/kexec.c | 11 ++++------- kernel/kexec_core.c | 20 ++++++++++---------- kernel/kexec_file.c | 4 ++-- kernel/kexec_internal.h | 15 ++++++++++++++- 4 files changed, 30 insertions(+), 20 deletions(-) diff --git a/kernel/kexec.c b/kernel/kexec.c index b5e40f0697681..cb8e6e6f983c7 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -93,13 +93,10 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, /* * Because we write directly to the reserved memory region when loading - * crash kernels we need a mutex here to prevent multiple crash kernels - * from attempting to load simultaneously, and to prevent a crash kernel - * from loading over the top of a in use crash kernel. - * - * KISS: always take the mutex. + * crash kernels we need a serialization here to prevent multiple crash + * kernels from attempting to load simultaneously. */ - if (!mutex_trylock(&kexec_mutex)) + if (!kexec_trylock()) return -EBUSY; if (flags & KEXEC_ON_CRASH) { @@ -165,7 +162,7 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, kimage_free(image); out_unlock: - mutex_unlock(&kexec_mutex); + kexec_unlock(); return ret; } diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 8fa4eeb95b305..5ca4d40c9ec13 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -46,7 +46,7 @@ #include #include "kexec_internal.h" -DEFINE_MUTEX(kexec_mutex); +atomic_t __kexec_lock = ATOMIC_INIT(0); /* Per cpu memory for storing cpu states in case of system crash. */ note_buf_t __percpu *crash_notes; @@ -959,7 +959,7 @@ late_initcall(kexec_core_sysctl_init); */ void __noclone __crash_kexec(struct pt_regs *regs) { - /* Take the kexec_mutex here to prevent sys_kexec_load + /* Take the kexec_lock here to prevent sys_kexec_load * running on one cpu from replacing the crash kernel * we are using after a panic on a different cpu. * @@ -967,7 +967,7 @@ void __noclone __crash_kexec(struct pt_regs *regs) * of memory the xchg(&kexec_crash_image) would be * sufficient. But since I reuse the memory... */ - if (mutex_trylock(&kexec_mutex)) { + if (kexec_trylock()) { if (kexec_crash_image) { struct pt_regs fixed_regs; @@ -976,7 +976,7 @@ void __noclone __crash_kexec(struct pt_regs *regs) machine_crash_shutdown(&fixed_regs); machine_kexec(kexec_crash_image); } - mutex_unlock(&kexec_mutex); + kexec_unlock(); } } STACK_FRAME_NON_STANDARD(__crash_kexec); @@ -1008,13 +1008,13 @@ ssize_t crash_get_memory_size(void) { ssize_t size = 0; - if (!mutex_trylock(&kexec_mutex)) + if (!kexec_trylock()) return -EBUSY; if (crashk_res.end != crashk_res.start) size = resource_size(&crashk_res); - mutex_unlock(&kexec_mutex); + kexec_unlock(); return size; } @@ -1025,7 +1025,7 @@ int crash_shrink_memory(unsigned long new_size) unsigned long old_size; struct resource *ram_res; - if (!mutex_trylock(&kexec_mutex)) + if (!kexec_trylock()) return -EBUSY; if (kexec_crash_image) { @@ -1064,7 +1064,7 @@ int crash_shrink_memory(unsigned long new_size) insert_resource(&iomem_resource, ram_res); unlock: - mutex_unlock(&kexec_mutex); + kexec_unlock(); return ret; } @@ -1136,7 +1136,7 @@ int kernel_kexec(void) { int error = 0; - if (!mutex_trylock(&kexec_mutex)) + if (!kexec_trylock()) return -EBUSY; if (!kexec_image) { error = -EINVAL; @@ -1212,6 +1212,6 @@ int kernel_kexec(void) #endif Unlock: - mutex_unlock(&kexec_mutex); + kexec_unlock(); return error; } diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 1d546dc97c502..45637511e0de6 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -339,7 +339,7 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, image = NULL; - if (!mutex_trylock(&kexec_mutex)) + if (!kexec_trylock()) return -EBUSY; dest_image = &kexec_image; @@ -411,7 +411,7 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, if ((flags & KEXEC_FILE_ON_CRASH) && kexec_crash_image) arch_kexec_protect_crashkres(); - mutex_unlock(&kexec_mutex); + kexec_unlock(); kimage_free(image); return ret; } diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index 48aaf2ac0d0d1..74da1409cd14b 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -13,7 +13,20 @@ void kimage_terminate(struct kimage *image); int kimage_is_destination_range(struct kimage *image, unsigned long start, unsigned long end); -extern struct mutex kexec_mutex; +/* + * Whatever is used to serialize accesses to the kexec_crash_image needs to be + * NMI safe, as __crash_kexec() can happen during nmi_panic(), so here we use a + * "simple" atomic variable that is acquired with a cmpxchg(). + */ +extern atomic_t __kexec_lock; +static inline bool kexec_trylock(void) +{ + return atomic_cmpxchg_acquire(&__kexec_lock, 0, 1) == 0; +} +static inline void kexec_unlock(void) +{ + atomic_set_release(&__kexec_lock, 0); +} #ifdef CONFIG_KEXEC_FILE #include From 4f1d2a030db09af4d21695983674a18633cd0ffb Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Tue, 12 Jul 2022 16:49:17 +0200 Subject: [PATCH 13/46] llist: use try_cmpxchg in llist_add_batch and llist_del_first Use try_cmpxchg instead of cmpxchg (*ptr, old, new) == old in llist_add_batch and llist_del_first. x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg. Also, try_cmpxchg implicitly assigns old *ptr value to "old" when cmpxchg fails, enabling further code simplifications. No functional change intended. Link: https://lkml.kernel.org/r/20220712144917.4497-1-ubizjak@gmail.com Signed-off-by: Uros Bizjak Signed-off-by: Andrew Morton --- lib/llist.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/lib/llist.c b/lib/llist.c index 611ce4881a875..7d78b736e8afd 100644 --- a/lib/llist.c +++ b/lib/llist.c @@ -30,7 +30,7 @@ bool llist_add_batch(struct llist_node *new_first, struct llist_node *new_last, do { new_last->next = first = READ_ONCE(head->first); - } while (cmpxchg(&head->first, first, new_first) != first); + } while (!try_cmpxchg(&head->first, &first, new_first)); return !first; } @@ -52,18 +52,14 @@ EXPORT_SYMBOL_GPL(llist_add_batch); */ struct llist_node *llist_del_first(struct llist_head *head) { - struct llist_node *entry, *old_entry, *next; + struct llist_node *entry, *next; entry = smp_load_acquire(&head->first); - for (;;) { + do { if (entry == NULL) return NULL; - old_entry = entry; next = READ_ONCE(entry->next); - entry = cmpxchg(&head->first, old_entry, next); - if (entry == old_entry) - break; - } + } while (!try_cmpxchg(&head->first, &entry, next)); return entry; } From f4068af3a6383da3487c07de85db7732de851734 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 15 Aug 2022 12:50:04 +0300 Subject: [PATCH 14/46] proc: save LOC in vsyscall test Do one fork in vsyscall detection code and let SIGSEGV handler exit and carry information to the parent saving LOC. [adobriyan@gmail.com: redo original patch, delete unnecessary variables, minimise code changes] Link: https://lkml.kernel.org/r/YvoWzAn5dlhF75xa@localhost.localdomain Signed-off-by: Alexey Dobriyan Signed-off-by: Brian Foster Reviewed-by: Brian Foster Tested-by: Brian Foster Signed-off-by: Andrew Morton --- tools/testing/selftests/proc/proc-pid-vm.c | 56 +++++++--------------- 1 file changed, 16 insertions(+), 40 deletions(-) diff --git a/tools/testing/selftests/proc/proc-pid-vm.c b/tools/testing/selftests/proc/proc-pid-vm.c index e5962f4794f56..69551bfa215c4 100644 --- a/tools/testing/selftests/proc/proc-pid-vm.c +++ b/tools/testing/selftests/proc/proc-pid-vm.c @@ -213,22 +213,22 @@ static int make_exe(const uint8_t *payload, size_t len) /* * 0: vsyscall VMA doesn't exist vsyscall=none - * 1: vsyscall VMA is r-xp vsyscall=emulate - * 2: vsyscall VMA is --xp vsyscall=xonly + * 1: vsyscall VMA is --xp vsyscall=xonly + * 2: vsyscall VMA is r-xp vsyscall=emulate */ -static int g_vsyscall; +static volatile int g_vsyscall; static const char *str_vsyscall; static const char str_vsyscall_0[] = ""; static const char str_vsyscall_1[] = -"ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]\n"; -static const char str_vsyscall_2[] = "ffffffffff600000-ffffffffff601000 --xp 00000000 00:00 0 [vsyscall]\n"; +static const char str_vsyscall_2[] = +"ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]\n"; #ifdef __x86_64__ static void sigaction_SIGSEGV(int _, siginfo_t *__, void *___) { - _exit(1); + _exit(g_vsyscall); } /* @@ -255,6 +255,7 @@ static void vsyscall(void) act.sa_sigaction = sigaction_SIGSEGV; (void)sigaction(SIGSEGV, &act, NULL); + g_vsyscall = 0; /* gettimeofday(NULL, NULL); */ asm volatile ( "call %P0" @@ -262,45 +263,20 @@ static void vsyscall(void) : "i" (0xffffffffff600000), "D" (NULL), "S" (NULL) : "rax", "rcx", "r11" ); - exit(0); - } - waitpid(pid, &wstatus, 0); - if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) == 0) { - /* vsyscall page exists and is executable. */ - } else { - /* vsyscall page doesn't exist. */ - g_vsyscall = 0; - return; - } - - pid = fork(); - if (pid < 0) { - fprintf(stderr, "fork, errno %d\n", errno); - exit(1); - } - if (pid == 0) { - struct rlimit rlim = {0, 0}; - (void)setrlimit(RLIMIT_CORE, &rlim); - - /* Hide "segfault at ffffffffff600000" messages. */ - struct sigaction act; - memset(&act, 0, sizeof(struct sigaction)); - act.sa_flags = SA_SIGINFO; - act.sa_sigaction = sigaction_SIGSEGV; - (void)sigaction(SIGSEGV, &act, NULL); + g_vsyscall = 1; *(volatile int *)0xffffffffff600000UL; - exit(0); + + g_vsyscall = 2; + exit(g_vsyscall); } waitpid(pid, &wstatus, 0); - if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) == 0) { - /* vsyscall page is readable and executable. */ - g_vsyscall = 1; - return; + if (WIFEXITED(wstatus)) { + g_vsyscall = WEXITSTATUS(wstatus); + } else { + fprintf(stderr, "error: wstatus %08x\n", wstatus); + exit(1); } - - /* vsyscall page is executable but unreadable. */ - g_vsyscall = 2; } int main(void) From 2be9880dc87342dc7ae459c9ea5c9ee2a45b33d8 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 19 Aug 2022 09:44:06 +0800 Subject: [PATCH 15/46] kernel: exit: cleanup release_thread() Only x86 has own release_thread(), introduce a new weak release_thread() function to clean empty definitions in other ARCHs. Link: https://lkml.kernel.org/r/20220819014406.32266-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Guo Ren [csky] Acked-by: Russell King (Oracle) Acked-by: Geert Uytterhoeven Acked-by: Brian Cain Acked-by: Michael Ellerman [powerpc] Acked-by: Stafford Horne [openrisc] Acked-by: Catalin Marinas [arm64] Acked-by: Huacai Chen [LoongArch] Cc: Alexander Gordeev Cc: Anton Ivanov Cc: Borislav Petkov Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Chris Zankel Cc: Dave Hansen Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Guo Ren [csky] Cc: Heiko Carstens Cc: Helge Deller Cc: Ingo Molnar Cc: Ivan Kokshaysky Cc: James Bottomley Cc: Johannes Berg Cc: Jonas Bonn Cc: Matt Turner Cc: Max Filippov Cc: Michal Simek Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Henderson Cc: Richard Weinberger Cc: Rich Felker Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Xuerui Wang Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/alpha/include/asm/processor.h | 2 -- arch/alpha/kernel/process.c | 5 ----- arch/arc/include/asm/processor.h | 3 --- arch/arm/include/asm/processor.h | 3 --- arch/arm/kernel/process.c | 4 ---- arch/arm64/include/asm/processor.h | 3 --- arch/arm64/kernel/process.c | 4 ---- arch/csky/include/asm/processor.h | 5 ----- arch/hexagon/include/asm/processor.h | 4 ---- arch/hexagon/kernel/process.c | 7 ------- arch/ia64/include/asm/processor.h | 7 ------- arch/loongarch/include/asm/processor.h | 3 --- arch/m68k/include/asm/processor.h | 5 ----- arch/microblaze/include/asm/processor.h | 5 ----- arch/mips/include/asm/processor.h | 3 --- arch/nios2/include/asm/processor.h | 5 ----- arch/openrisc/include/asm/processor.h | 1 - arch/openrisc/kernel/process.c | 4 ---- arch/parisc/include/asm/processor.h | 3 --- arch/parisc/kernel/process.c | 4 ---- arch/powerpc/include/asm/processor.h | 1 - arch/powerpc/kernel/process.c | 5 ----- arch/riscv/include/asm/processor.h | 5 ----- arch/s390/include/asm/processor.h | 3 --- arch/sh/include/asm/processor_32.h | 3 --- arch/sh/kernel/process_32.c | 5 ----- arch/sparc/include/asm/processor_32.h | 3 --- arch/sparc/include/asm/processor_64.h | 3 --- arch/um/include/asm/processor-generic.h | 4 ---- arch/x86/include/asm/processor.h | 3 --- arch/xtensa/include/asm/processor.h | 3 --- include/linux/sched/task.h | 3 +++ kernel/exit.c | 4 ++++ 33 files changed, 7 insertions(+), 118 deletions(-) diff --git a/arch/alpha/include/asm/processor.h b/arch/alpha/include/asm/processor.h index 43e234c518b1c..714abe494e5fd 100644 --- a/arch/alpha/include/asm/processor.h +++ b/arch/alpha/include/asm/processor.h @@ -36,8 +36,6 @@ extern void start_thread(struct pt_regs *, unsigned long, unsigned long); /* Free all resources held by a thread. */ struct task_struct; -extern void release_thread(struct task_struct *); - unsigned long __get_wchan(struct task_struct *p); #define KSTK_EIP(tsk) (task_pt_regs(tsk)->pc) diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index e2e25f8b5e76c..dbf1bc5e2ad25 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -225,11 +225,6 @@ flush_thread(void) current_thread_info()->pcb.unique = 0; } -void -release_thread(struct task_struct *dead_task) -{ -} - /* * Copy architecture-specific thread state */ diff --git a/arch/arc/include/asm/processor.h b/arch/arc/include/asm/processor.h index 54db9d7bb562d..fb844fce1ab67 100644 --- a/arch/arc/include/asm/processor.h +++ b/arch/arc/include/asm/processor.h @@ -43,9 +43,6 @@ struct task_struct; #define task_pt_regs(p) \ ((struct pt_regs *)(THREAD_SIZE + (void *)task_stack_page(p)) - 1) -/* Free all resources held by a thread */ -#define release_thread(thread) do { } while (0) - /* * A lot of busy-wait loops in SMP are based off of non-volatile data otherwise * get optimised away by gcc diff --git a/arch/arm/include/asm/processor.h b/arch/arm/include/asm/processor.h index bdc35c0e8dfb9..326864f79d18f 100644 --- a/arch/arm/include/asm/processor.h +++ b/arch/arm/include/asm/processor.h @@ -81,9 +81,6 @@ static inline void arch_thread_struct_whitelist(unsigned long *offset, /* Forward declaration, a strange C thing */ struct task_struct; -/* Free all resources held by a thread. */ -extern void release_thread(struct task_struct *); - unsigned long __get_wchan(struct task_struct *p); #define task_pt_regs(p) \ diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 3d9cace638840..712d3e6d9be90 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -232,10 +232,6 @@ void flush_thread(void) thread_notify(THREAD_NOTIFY_FLUSH, thread); } -void release_thread(struct task_struct *dead_task) -{ -} - asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 86eb0bfe3b380..4cfb4cd1d4759 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -323,9 +323,6 @@ static inline bool is_ttbr1_addr(unsigned long addr) /* Forward declaration, a strange C thing */ struct task_struct; -/* Free all resources held by a thread. */ -extern void release_thread(struct task_struct *); - unsigned long __get_wchan(struct task_struct *p); void update_sctlr_el1(u64 sctlr); diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 92bcc1768f0b9..9015f49c206ef 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -279,10 +279,6 @@ void flush_thread(void) flush_tagged_addr_state(); } -void release_thread(struct task_struct *dead_task) -{ -} - void arch_release_task_struct(struct task_struct *tsk) { fpsimd_release_task(tsk); diff --git a/arch/csky/include/asm/processor.h b/arch/csky/include/asm/processor.h index 9638206bc44f7..63ad71fab30d7 100644 --- a/arch/csky/include/asm/processor.h +++ b/arch/csky/include/asm/processor.h @@ -69,11 +69,6 @@ do { \ /* Forward declaration, a strange C thing */ struct task_struct; -/* Free all resources held by a thread. */ -static inline void release_thread(struct task_struct *dead_task) -{ -} - /* Prepare to copy thread state - unlazy all lazy status */ #define prepare_to_copy(tsk) do { } while (0) diff --git a/arch/hexagon/include/asm/processor.h b/arch/hexagon/include/asm/processor.h index 615f7e49968e6..0cd39c2cdf8f7 100644 --- a/arch/hexagon/include/asm/processor.h +++ b/arch/hexagon/include/asm/processor.h @@ -60,10 +60,6 @@ struct thread_struct { #define KSTK_EIP(tsk) (pt_elr(task_pt_regs(tsk))) #define KSTK_ESP(tsk) (pt_psp(task_pt_regs(tsk))) -/* Free all resources held by a thread; defined in process.c */ -extern void release_thread(struct task_struct *dead_task); - -/* Get wait channel for task P. */ extern unsigned long __get_wchan(struct task_struct *p); /* The following stuff is pretty HEXAGON specific. */ diff --git a/arch/hexagon/kernel/process.c b/arch/hexagon/kernel/process.c index f0552f98a7bae..e15eeaebd7853 100644 --- a/arch/hexagon/kernel/process.c +++ b/arch/hexagon/kernel/process.c @@ -112,13 +112,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) return 0; } -/* - * Release any architecture-specific resources locked by thread - */ -void release_thread(struct task_struct *dead_task) -{ -} - /* * Some archs flush debug and FPU info here */ diff --git a/arch/ia64/include/asm/processor.h b/arch/ia64/include/asm/processor.h index 757c2f6d8d4b8..d1978e0040548 100644 --- a/arch/ia64/include/asm/processor.h +++ b/arch/ia64/include/asm/processor.h @@ -318,13 +318,6 @@ struct thread_struct { struct mm_struct; struct task_struct; -/* - * Free all resources held by a thread. This is called after the - * parent of DEAD_TASK has collected the exit status of the task via - * wait(). - */ -#define release_thread(dead_task) - /* Get wait channel for task P. */ extern unsigned long __get_wchan (struct task_struct *p); diff --git a/arch/loongarch/include/asm/processor.h b/arch/loongarch/include/asm/processor.h index 1c4b4308378d4..6954dc5d24e9d 100644 --- a/arch/loongarch/include/asm/processor.h +++ b/arch/loongarch/include/asm/processor.h @@ -176,9 +176,6 @@ struct thread_struct { struct task_struct; -/* Free all resources held by a thread. */ -#define release_thread(thread) do { } while (0) - enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_HALT, IDLE_NOMWAIT, IDLE_POLL}; extern unsigned long boot_option_idle_override; diff --git a/arch/m68k/include/asm/processor.h b/arch/m68k/include/asm/processor.h index d86b4009880b4..7a2da780830b8 100644 --- a/arch/m68k/include/asm/processor.h +++ b/arch/m68k/include/asm/processor.h @@ -145,11 +145,6 @@ static inline void start_thread(struct pt_regs * regs, unsigned long pc, /* Forward declaration, a strange C thing */ struct task_struct; -/* Free all resources held by a thread. */ -static inline void release_thread(struct task_struct *dead_task) -{ -} - unsigned long __get_wchan(struct task_struct *p); void show_registers(struct pt_regs *regs); diff --git a/arch/microblaze/include/asm/processor.h b/arch/microblaze/include/asm/processor.h index 7e9e92670df33..4e193c7550dfa 100644 --- a/arch/microblaze/include/asm/processor.h +++ b/arch/microblaze/include/asm/processor.h @@ -63,11 +63,6 @@ struct thread_struct { .pgdir = swapper_pg_dir, \ } -/* Free all resources held by a thread. */ -static inline void release_thread(struct task_struct *dead_task) -{ -} - unsigned long __get_wchan(struct task_struct *p); /* The size allocated for kernel stacks. This _must_ be a power of two! */ diff --git a/arch/mips/include/asm/processor.h b/arch/mips/include/asm/processor.h index 4bb24579d12e4..3fde1ff72bd16 100644 --- a/arch/mips/include/asm/processor.h +++ b/arch/mips/include/asm/processor.h @@ -344,9 +344,6 @@ struct thread_struct { struct task_struct; -/* Free all resources held by a thread. */ -#define release_thread(thread) do { } while(0) - /* * Do necessary setup to start up a newly executed thread. */ diff --git a/arch/nios2/include/asm/processor.h b/arch/nios2/include/asm/processor.h index b8125dfbcad2d..8916d93d5c2d0 100644 --- a/arch/nios2/include/asm/processor.h +++ b/arch/nios2/include/asm/processor.h @@ -64,11 +64,6 @@ extern void start_thread(struct pt_regs *regs, unsigned long pc, struct task_struct; -/* Free all resources held by a thread. */ -static inline void release_thread(struct task_struct *dead_task) -{ -} - extern unsigned long __get_wchan(struct task_struct *p); #define task_pt_regs(p) \ diff --git a/arch/openrisc/include/asm/processor.h b/arch/openrisc/include/asm/processor.h index aa1699c18add8..ed9efb430afa1 100644 --- a/arch/openrisc/include/asm/processor.h +++ b/arch/openrisc/include/asm/processor.h @@ -72,7 +72,6 @@ struct thread_struct { void start_thread(struct pt_regs *regs, unsigned long nip, unsigned long sp); -void release_thread(struct task_struct *); unsigned long __get_wchan(struct task_struct *p); #define cpu_relax() barrier() diff --git a/arch/openrisc/kernel/process.c b/arch/openrisc/kernel/process.c index 52dc983ddeba3..f94b5ec06786e 100644 --- a/arch/openrisc/kernel/process.c +++ b/arch/openrisc/kernel/process.c @@ -125,10 +125,6 @@ void show_regs(struct pt_regs *regs) show_registers(regs); } -void release_thread(struct task_struct *dead_task) -{ -} - /* * Copy the thread-specific (arch specific) info from the current * process to the new one p diff --git a/arch/parisc/include/asm/processor.h b/arch/parisc/include/asm/processor.h index 4621ceb513147..a608970b249af 100644 --- a/arch/parisc/include/asm/processor.h +++ b/arch/parisc/include/asm/processor.h @@ -266,9 +266,6 @@ on downward growing arches, it looks like this: struct mm_struct; -/* Free all resources held by a thread. */ -extern void release_thread(struct task_struct *); - extern unsigned long __get_wchan(struct task_struct *p); #define KSTK_EIP(tsk) ((tsk)->thread.regs.iaoq[0]) diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index 7c37e09c92da6..3db0e97e6c066 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -146,10 +146,6 @@ void flush_thread(void) */ } -void release_thread(struct task_struct *dead_task) -{ -} - /* * Idle thread support * diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index fdfaae194ddd5..92e332415d02c 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -75,7 +75,6 @@ extern int _chrp_type; struct task_struct; void start_thread(struct pt_regs *regs, unsigned long fdptr, unsigned long sp); -void release_thread(struct task_struct *); #define TS_FPR(i) fp_state.fpr[i][TS_FPROFFSET] #define TS_CKFPR(i) ckfp_state.fpr[i][TS_FPROFFSET] diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 0fbda89cd1bb5..991cda25b9a9e 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1655,11 +1655,6 @@ EXPORT_SYMBOL_GPL(set_thread_tidr); #endif /* CONFIG_PPC64 */ -void -release_thread(struct task_struct *t) -{ -} - /* * this gets called so that we can store coprocessor state into memory and * copy the current task into the new thread. diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h index 19eedd4af4cde..94a0590c69710 100644 --- a/arch/riscv/include/asm/processor.h +++ b/arch/riscv/include/asm/processor.h @@ -65,11 +65,6 @@ static inline void arch_thread_struct_whitelist(unsigned long *offset, extern void start_thread(struct pt_regs *regs, unsigned long pc, unsigned long sp); -/* Free all resources held by a thread. */ -static inline void release_thread(struct task_struct *dead_task) -{ -} - extern unsigned long __get_wchan(struct task_struct *p); diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index bd66f8e349492..c52fe651eebab 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -186,9 +186,6 @@ struct pt_regs; void show_registers(struct pt_regs *regs); void show_cacheinfo(struct seq_file *m); -/* Free all resources held by a thread. */ -static inline void release_thread(struct task_struct *tsk) { } - /* Free guarded storage control block */ void guarded_storage_release(struct task_struct *tsk); void gs_load_bc_cb(struct pt_regs *regs); diff --git a/arch/sh/include/asm/processor_32.h b/arch/sh/include/asm/processor_32.h index 45240ec6b85a4..27aebf1e75a20 100644 --- a/arch/sh/include/asm/processor_32.h +++ b/arch/sh/include/asm/processor_32.h @@ -127,9 +127,6 @@ struct task_struct; extern void start_thread(struct pt_regs *regs, unsigned long new_pc, unsigned long new_sp); -/* Free all resources held by a thread. */ -extern void release_thread(struct task_struct *); - /* * FPU lazy state save handling. */ diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c index a808843375e71..92b6649d49295 100644 --- a/arch/sh/kernel/process_32.c +++ b/arch/sh/kernel/process_32.c @@ -84,11 +84,6 @@ void flush_thread(void) #endif } -void release_thread(struct task_struct *dead_task) -{ - /* do nothing */ -} - asmlinkage void ret_from_fork(void); asmlinkage void ret_from_kernel_thread(void); diff --git a/arch/sparc/include/asm/processor_32.h b/arch/sparc/include/asm/processor_32.h index b26c35336b51d..ba8b70ffec085 100644 --- a/arch/sparc/include/asm/processor_32.h +++ b/arch/sparc/include/asm/processor_32.h @@ -80,9 +80,6 @@ static inline void start_thread(struct pt_regs * regs, unsigned long pc, : "memory"); } -/* Free all resources held by a thread. */ -#define release_thread(tsk) do { } while(0) - unsigned long __get_wchan(struct task_struct *); #define task_pt_regs(tsk) ((tsk)->thread.kregs) diff --git a/arch/sparc/include/asm/processor_64.h b/arch/sparc/include/asm/processor_64.h index 89850dff6b033..2667f35d5ea56 100644 --- a/arch/sparc/include/asm/processor_64.h +++ b/arch/sparc/include/asm/processor_64.h @@ -176,9 +176,6 @@ do { \ regs->tstate &= ~TSTATE_PEF; \ } while (0) -/* Free all resources held by a thread. */ -#define release_thread(tsk) do { } while (0) - unsigned long __get_wchan(struct task_struct *task); #define task_pt_regs(tsk) (task_thread_info(tsk)->kregs) diff --git a/arch/um/include/asm/processor-generic.h b/arch/um/include/asm/processor-generic.h index d0fc1862da957..bb5f06480da95 100644 --- a/arch/um/include/asm/processor-generic.h +++ b/arch/um/include/asm/processor-generic.h @@ -55,10 +55,6 @@ struct thread_struct { .request = { 0 } \ } -static inline void release_thread(struct task_struct *task) -{ -} - /* * User space process size: 3GB (default). */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 356308c739514..67c9d73b31faa 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -587,9 +587,6 @@ static inline void load_sp0(unsigned long sp0) #endif /* CONFIG_PARAVIRT_XXL */ -/* Free all resources held by a thread. */ -extern void release_thread(struct task_struct *); - unsigned long __get_wchan(struct task_struct *p); /* diff --git a/arch/xtensa/include/asm/processor.h b/arch/xtensa/include/asm/processor.h index 76bc63127c66e..5abde43c570cd 100644 --- a/arch/xtensa/include/asm/processor.h +++ b/arch/xtensa/include/asm/processor.h @@ -221,9 +221,6 @@ struct thread_struct { struct task_struct; struct mm_struct; -/* Free all resources held by a thread. */ -#define release_thread(thread) do { } while(0) - extern unsigned long __get_wchan(struct task_struct *p); #define KSTK_EIP(tsk) (task_pt_regs(tsk)->pc) diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 81cab4b01edcb..d6c48163c6def 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -127,6 +127,9 @@ static inline void put_task_struct_many(struct task_struct *t, int nr) void put_task_struct_rcu_user(struct task_struct *task); +/* Free all architecture-specific resources held by a thread. */ +void release_thread(struct task_struct *dead_task); + #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT extern int arch_task_struct_size __read_mostly; #else diff --git a/kernel/exit.c b/kernel/exit.c index 84021b24f79e3..f4b7b058f4e69 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -183,6 +183,10 @@ void put_task_struct_rcu_user(struct task_struct *task) call_rcu(&task->rcu, delayed_put_task_struct); } +void __weak release_thread(struct task_struct *dead_task) +{ +} + void release_task(struct task_struct *p) { struct task_struct *leader; From cba7543e1515e79a85d37df85a0eb2cf0f07d115 Mon Sep 17 00:00:00 2001 From: Minghao Chi Date: Fri, 19 Aug 2022 08:18:19 +0000 Subject: [PATCH 16/46] fs/qnx6: delete unnecessary checks before brelse() brelse() tests whether its argument is NULL and then returns immediately. Thus remove the tests which are not needed around the shown calls. Link: https://lkml.kernel.org/r/20220819081819.96347-1-chi.minghao@zte.com.cn Signed-off-by: Minghao Chi Reported-by: Zeal Robot Cc: CGEL ZTE Cc: Matthew Wilcox Cc: Minghao Chi Cc: Muchun Song Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- fs/qnx6/inode.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index b9895afca9d11..85b2fa3b211c9 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -470,10 +470,8 @@ static int qnx6_fill_super(struct super_block *s, void *data, int silent) out1: iput(sbi->inodes); out: - if (bh1) - brelse(bh1); - if (bh2) - brelse(bh2); + brelse(bh1); + brelse(bh2); outnobh: kfree(qs); s->s_fs_info = NULL; From aa06a9bd853306c239f759018fb227d7e8f4e203 Mon Sep 17 00:00:00 2001 From: Sergei Trofimovich Date: Sat, 20 Aug 2022 19:18:13 +0100 Subject: [PATCH 17/46] ia64: fix clock_getres(CLOCK_MONOTONIC) to report ITC frequency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit clock_gettime(CLOCK_MONOTONIC, &tp) is very precise on ia64 as it uses ITC (similar to rdtsc on x86). It's not quite a hrtimer as it is a few times slower than 1ns. Usually 2-3ns. clock_getres(CLOCK_MONOTONIC, &res) never reflected that fact and reported 0.04s precision (1/HZ value). In https://bugs.gentoo.org/596382 gstreamer's test suite failed loudly when it noticed precision discrepancy. Before the change: clock_getres(CLOCK_MONOTONIC, &res) reported 250Hz precision. After the change: clock_getres(CLOCK_MONOTONIC, &res) reports ITC (400Mhz) precision. The patch is based on matoro's fix. I added a bit of explanation why we need to special-case arch-specific clock_getres(). [akpm@linux-foundation.org: coding-style cleanups] Link: https://lkml.kernel.org/r/20220820181813.2275195-1-slyich@gmail.com Signed-off-by: Sergei Trofimovich Cc: matoro Cc: Émeric Maschino Signed-off-by: Andrew Morton --- arch/ia64/kernel/sys_ia64.c | 26 ++++++++++++++++++++++++++ arch/ia64/kernel/syscalls/syscall.tbl | 2 +- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/arch/ia64/kernel/sys_ia64.c b/arch/ia64/kernel/sys_ia64.c index e14db25146c22..215bf3f8cb204 100644 --- a/arch/ia64/kernel/sys_ia64.c +++ b/arch/ia64/kernel/sys_ia64.c @@ -166,3 +166,29 @@ ia64_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, u force_successful_syscall_return(); return addr; } + +asmlinkage long +ia64_clock_getres(const clockid_t which_clock, struct __kernel_timespec __user *tp) +{ + /* + * ia64's clock_gettime() syscall is implemented as a vdso call + * fsys_clock_gettime(). Currently it handles only + * CLOCK_REALTIME and CLOCK_MONOTONIC. Both are based on + * 'ar.itc' counter which gets incremented at a constant + * frequency. It's usually 400MHz, ~2.5x times slower than CPU + * clock frequency. Which is almost a 1ns hrtimer, but not quite. + * + * Let's special-case these timers to report correct precision + * based on ITC frequency and not HZ frequency for supported + * clocks. + */ + switch (which_clock) { + case CLOCK_REALTIME: + case CLOCK_MONOTONIC: + s64 tick_ns = DIV_ROUND_UP(NSEC_PER_SEC, local_cpu_data->itc_freq); + struct timespec64 rtn_tp = ns_to_timespec64(tick_ns); + return put_timespec64(&rtn_tp, tp); + } + + return sys_clock_getres(which_clock, tp); +} diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index 78b1d03e86e1d..72c929d9902b9 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -240,7 +240,7 @@ 228 common timer_delete sys_timer_delete 229 common clock_settime sys_clock_settime 230 common clock_gettime sys_clock_gettime -231 common clock_getres sys_clock_getres +231 common clock_getres ia64_clock_getres 232 common clock_nanosleep sys_clock_nanosleep 233 common fstatfs64 sys_fstatfs64 234 common statfs64 sys_statfs64 From 693fc06e98514c2d5951ead4aca40cf8b21100b1 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 14 Jul 2022 19:32:55 +0200 Subject: [PATCH 18/46] epoll: use try_cmpxchg in list_add_tail_lockless Use try_cmpxchg instead of cmpxchg (*ptr, old, new) == old in list_add_tail_lockless. x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg (and related move instruction in front of cmpxchg). No functional change intended. Link: https://lkml.kernel.org/r/20220714173255.12987-1-ubizjak@gmail.com Signed-off-by: Uros Bizjak Cc: Alexander Viro Signed-off-by: Andrew Morton --- fs/eventpoll.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 8b56b94e2f56f..52954d4637b54 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1065,7 +1065,7 @@ static inline bool list_add_tail_lockless(struct list_head *new, * added to the list from another CPU: the winner observes * new->next == new. */ - if (cmpxchg(&new->next, new, head) != new) + if (!try_cmpxchg(&new->next, &new, head)) return false; /* From b0192296b45232872720d969366449c001ab1f4a Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 14 Jul 2022 19:16:53 +0200 Subject: [PATCH 19/46] buffer: use try_cmpxchg in discard_buffer Use try_cmpxchg instead of cmpxchg (*ptr, old, new) == old in discard_buffer. x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg (and related move instruction in front of cmpxchg). Also, try_cmpxchg implicitly assigns old *ptr value to "old" when cmpxchg fails, enabling further code simplifications. Note that the value from *ptr should be read using READ_ONCE to prevent the compiler from merging, refetching or reordering the read. No functional change intended. Link: https://lkml.kernel.org/r/20220714171653.12128-1-ubizjak@gmail.com Signed-off-by: Uros Bizjak Cc: Alexander Viro Signed-off-by: Andrew Morton --- fs/buffer.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 55e762a58eb65..2fd98bb7d74ee 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1464,19 +1464,15 @@ EXPORT_SYMBOL(set_bh_page); static void discard_buffer(struct buffer_head * bh) { - unsigned long b_state, b_state_old; + unsigned long b_state; lock_buffer(bh); clear_buffer_dirty(bh); bh->b_bdev = NULL; - b_state = bh->b_state; - for (;;) { - b_state_old = cmpxchg(&bh->b_state, b_state, - (b_state & ~BUFFER_FLAGS_DISCARD)); - if (b_state_old == b_state) - break; - b_state = b_state_old; - } + b_state = READ_ONCE(bh->b_state); + do { + } while (!try_cmpxchg(&bh->b_state, &b_state, + b_state & ~BUFFER_FLAGS_DISCARD)); unlock_buffer(bh); } From 38ace0d513d9d2556beca4d07102d25e9a73c53c Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 14 Jul 2022 18:48:51 +0200 Subject: [PATCH 20/46] aio: use atomic_try_cmpxchg in __get_reqs_available Use atomic_try_cmpxchg instead of atomic_cmpxchg (*ptr, old, new) == old in __get_reqs_available. x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg (and related move instruction in front of cmpxchg). Also, atomic_try_cmpxchg implicitly assigns old *ptr value to "old" when cmpxchg fails, enabling further code simplifications. No functional change intended. Link: https://lkml.kernel.org/r/20220714164851.3055-1-ubizjak@gmail.com Signed-off-by: Uros Bizjak Cc: Benjamin LaHaise Cc: Alexander Viro Signed-off-by: Andrew Morton --- fs/aio.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index 606613e9d1f4f..5b2ff20ad3229 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -951,16 +951,13 @@ static bool __get_reqs_available(struct kioctx *ctx) local_irq_save(flags); kcpu = this_cpu_ptr(ctx->cpu); if (!kcpu->reqs_available) { - int old, avail = atomic_read(&ctx->reqs_available); + int avail = atomic_read(&ctx->reqs_available); do { if (avail < ctx->req_batch) goto out; - - old = avail; - avail = atomic_cmpxchg(&ctx->reqs_available, - avail, avail - ctx->req_batch); - } while (avail != old); + } while (!atomic_try_cmpxchg(&ctx->reqs_available, + &avail, avail - ctx->req_batch)); kcpu->reqs_available += ctx->req_batch; } From da3f52ba359590d8eb465ae0d8b6e11d6fd9432f Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Sun, 21 Aug 2022 21:30:11 +0200 Subject: [PATCH 21/46] iversion: use atomic64_try_cmpxchg) Use atomic64_try_cmpxchg instead of atomic64_cmpxchg (*ptr, old, new) == old in inode_set_max_iversion_raw, inode_maybe_inc_version and inode_query_iversion. x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg (and related move instruction in front of cmpxchg). Also, try_cmpxchg implicitly assigns old *ptr value to "old" when cmpxchg fails, enabling further code simplifications. The loop in inode_maybe_inc_iversion improves from: 5563: 48 89 ca mov %rcx,%rdx 5566: 48 89 c8 mov %rcx,%rax 5569: 48 83 e2 fe and $0xfffffffffffffffe,%rdx 556d: 48 83 c2 02 add $0x2,%rdx 5571: f0 48 0f b1 16 lock cmpxchg %rdx,(%rsi) 5576: 48 39 c1 cmp %rax,%rcx 5579: 0f 84 85 fc ff ff je 5204 <...> 557f: 48 89 c1 mov %rax,%rcx 5582: eb df jmp 5563 <...> to: 5563: 48 89 c2 mov %rax,%rdx 5566: 48 83 e2 fe and $0xfffffffffffffffe,%rdx 556a: 48 83 c2 02 add $0x2,%rdx 556e: f0 48 0f b1 11 lock cmpxchg %rdx,(%rcx) 5573: 0f 84 8b fc ff ff je 5204 <...> 5579: eb e8 jmp 5563 <...> Link: https://lkml.kernel.org/r/20220821193011.88208-1-ubizjak@gmail.com Signed-off-by: Uros Bizjak Signed-off-by: Andrew Morton --- include/linux/iversion.h | 32 +++++++++----------------------- 1 file changed, 9 insertions(+), 23 deletions(-) diff --git a/include/linux/iversion.h b/include/linux/iversion.h index 3bfebde5a1a6d..eb5a158101693 100644 --- a/include/linux/iversion.h +++ b/include/linux/iversion.h @@ -123,17 +123,12 @@ inode_peek_iversion_raw(const struct inode *inode) static inline void inode_set_max_iversion_raw(struct inode *inode, u64 val) { - u64 cur, old; + u64 cur = inode_peek_iversion_raw(inode); - cur = inode_peek_iversion_raw(inode); - for (;;) { + do { if (cur > val) break; - old = atomic64_cmpxchg(&inode->i_version, cur, val); - if (likely(old == cur)) - break; - cur = old; - } + } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, val)); } /** @@ -197,7 +192,7 @@ inode_set_iversion_queried(struct inode *inode, u64 val) static inline bool inode_maybe_inc_iversion(struct inode *inode, bool force) { - u64 cur, old, new; + u64 cur, new; /* * The i_version field is not strictly ordered with any other inode @@ -211,19 +206,14 @@ inode_maybe_inc_iversion(struct inode *inode, bool force) */ smp_mb(); cur = inode_peek_iversion_raw(inode); - for (;;) { + do { /* If flag is clear then we needn't do anything */ if (!force && !(cur & I_VERSION_QUERIED)) return false; /* Since lowest bit is flag, add 2 to avoid it */ new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT; - - old = atomic64_cmpxchg(&inode->i_version, cur, new); - if (likely(old == cur)) - break; - cur = old; - } + } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); return true; } @@ -304,10 +294,10 @@ inode_peek_iversion(const struct inode *inode) static inline u64 inode_query_iversion(struct inode *inode) { - u64 cur, old, new; + u64 cur, new; cur = inode_peek_iversion_raw(inode); - for (;;) { + do { /* If flag is already set, then no need to swap */ if (cur & I_VERSION_QUERIED) { /* @@ -320,11 +310,7 @@ inode_query_iversion(struct inode *inode) } new = cur | I_VERSION_QUERIED; - old = atomic64_cmpxchg(&inode->i_version, cur, new); - if (likely(old == cur)) - break; - cur = old; - } + } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); return cur >> I_VERSION_QUERIED_SHIFT; } From 948084f0f6959f602f89f679522b706a72da0285 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Sun, 21 Aug 2022 20:25:19 +0200 Subject: [PATCH 22/46] kexec: replace kmap() with kmap_local_page() kmap() is being deprecated in favor of kmap_local_page(). There are two main problems with kmap(): (1) It comes with an overhead as mapping space is restricted and protected by a global lock for synchronization and (2) it also requires global TLB invalidation when the kmap's pool wraps and it might block when the mapping space is fully utilized until a slot becomes available. With kmap_local_page() the mappings are per thread, CPU local, can take page faults, and can be called from any context (including interrupts). It is faster than kmap() in kernels with HIGHMEM enabled. Furthermore, the tasks can be preempted and, when they are scheduled to run again, the kernel virtual addresses are restored and are still valid. Since its use in kexec_core.c is safe everywhere, it should be preferred. Therefore, replace kmap() with kmap_local_page() in kexec_core.c. Tested on a QEMU/KVM x86_32 VM, 6GB RAM, booting a kernel with HIGHMEM64GB enabled. Link: https://lkml.kernel.org/r/20220821182519.9483-1-fmdefrancesco@gmail.com Signed-off-by: Fabio M. De Francesco Suggested-by: Ira Weiny Reviewed-by: Ira Weiny Acked-by: Baoquan He Signed-off-by: Andrew Morton --- kernel/kexec_core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 5ca4d40c9ec13..ca2743f9c634e 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -809,7 +809,7 @@ static int kimage_load_normal_segment(struct kimage *image, if (result < 0) goto out; - ptr = kmap(page); + ptr = kmap_local_page(page); /* Start with a clear page */ clear_page(ptr); ptr += maddr & ~PAGE_MASK; @@ -822,7 +822,7 @@ static int kimage_load_normal_segment(struct kimage *image, memcpy(ptr, kbuf, uchunk); else result = copy_from_user(ptr, buf, uchunk); - kunmap(page); + kunmap_local(ptr); if (result) { result = -EFAULT; goto out; @@ -873,7 +873,7 @@ static int kimage_load_crash_segment(struct kimage *image, goto out; } arch_kexec_post_alloc_pages(page_address(page), 1, 0); - ptr = kmap(page); + ptr = kmap_local_page(page); ptr += maddr & ~PAGE_MASK; mchunk = min_t(size_t, mbytes, PAGE_SIZE - (maddr & ~PAGE_MASK)); @@ -889,7 +889,7 @@ static int kimage_load_crash_segment(struct kimage *image, else result = copy_from_user(ptr, buf, uchunk); kexec_flush_icache_page(page); - kunmap(page); + kunmap_local(ptr); arch_kexec_pre_free_pages(page_address(page), 1); if (result) { result = -EFAULT; From d75e9a4bccf4e19928534dec797935e650f85b09 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Sun, 21 Aug 2022 20:03:58 +0200 Subject: [PATCH 23/46] hfs: unmap the page in the "fail_page" label Patch series "hfs: Replace kmap() with kmap_local_page()". kmap() is being deprecated in favor of kmap_local_page(). There are two main problems with kmap(): (1) It comes with an overhead as mapping space is restricted and protected by a global lock for synchronization and (2) it also requires global TLB invalidation when the kmaps pool wraps and it might block when the mapping space is fully utilized until a slot becomes available. With kmap_local_page() the mappings are per thread, CPU local, can take page faults, and can be called from any context (including interrupts). It is faster than kmap() in kernels with HIGHMEM enabled. Furthermore, the tasks can be preempted and, when they are scheduled to run again, the kernel virtual addresses are restored and still valid. Since its use in fs/hfs is safe everywhere, it should be preferred. Therefore, replace kmap() with kmap_local_page() in fs/hfs. Where possible, use the suited standard helpers (memzero_page(), memcpy_page()) instead of open coding kmap_local_page() plus memset() or memcpy(). Fix a bug due to a page being not unmapped if the code jumps to the "fail_page" label (1/3). Tested in a QEMU/KVM x86_32 VM, 6GB RAM, booting a kernel with HIGHMEM64GB enabled. This patch (of 3): Several paths within hfs_btree_open() jump to the "fail_page" label where put_page() is called while the page is still mapped. Call kunmap() to unmap the page soon before put_page(). Link: https://lkml.kernel.org/r/20220821180400.8198-1-fmdefrancesco@gmail.com Link: https://lkml.kernel.org/r/20220821180400.8198-2-fmdefrancesco@gmail.com Signed-off-by: Fabio M. De Francesco Reviewed-by: Ira Weiny Reviewed-by: Viacheslav Dubeyko Cc: Arnd Bergmann Cc: Chaitanya Kulkarni Cc: Christian Brauner (Microsoft) Cc: Damien Le Moal Cc: Matthew Wilcox ] Cc: Jeff Layton Cc: Jens Axboe Cc: Kees Cook Cc: Martin K. Petersen Cc: Muchun Song Cc: Roman Gushchin Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- fs/hfs/btree.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index 19017d2961734..56c6782436e9c 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -124,6 +124,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke return tree; fail_page: + kunmap(page); put_page(page); free_inode: tree->inode->i_mapping->a_ops = &hfs_aops; From ca0ac8dfd35b218b3c95d3b38c695fbff35d94ca Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Sun, 21 Aug 2022 20:03:59 +0200 Subject: [PATCH 24/46] hfs: replace kmap() with kmap_local_page() in bnode.c kmap() is being deprecated in favor of kmap_local_page(). Two main problems with kmap(): (1) It comes with an overhead as mapping space is restricted and protected by a global lock for synchronization and (2) it also requires global TLB invalidation when the kmap's pool wraps and it might block when the mapping space is fully utilized until a slot becomes available. With kmap_local_page() the mappings are per thread, CPU local, can take page faults, and can be called from any context (including interrupts). It is faster than kmap() in kernels with HIGHMEM enabled. Furthermore, the tasks can be preempted and, when they are scheduled to run again, the kernel virtual addresses are restored and still valid. Since its use in bnode.c is safe everywhere, it should be preferred. Therefore, replace kmap() with kmap_local_page() in bnode.c. Where possible, use the suited standard helpers (memzero_page(), memcpy_page()) instead of open coding kmap_local_page() plus memset() or memcpy(). Tested in a QEMU/KVM x86_32 VM, 6GB RAM, booting a kernel with HIGHMEM64GB enabled. Link: https://lkml.kernel.org/r/20220821180400.8198-3-fmdefrancesco@gmail.com Signed-off-by: Fabio M. De Francesco Suggested-by: Ira Weiny Reviewed-by: Viacheslav Dubeyko Cc: Arnd Bergmann Cc: Chaitanya Kulkarni Cc: Christian Brauner (Microsoft) Cc: Damien Le Moal Cc: Jeff Layton Cc: Jens Axboe Cc: Kees Cook Cc: Martin K. Petersen Cc: Matthew Wilcox Cc: Muchun Song Cc: Roman Gushchin Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- fs/hfs/bnode.c | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c index c83fd0e8404d3..2015e42e752a6 100644 --- a/fs/hfs/bnode.c +++ b/fs/hfs/bnode.c @@ -21,7 +21,6 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len) int pagenum; int bytes_read; int bytes_to_read; - void *vaddr; off += node->page_offset; pagenum = off >> PAGE_SHIFT; @@ -33,9 +32,7 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len) page = node->page[pagenum]; bytes_to_read = min_t(int, len - bytes_read, PAGE_SIZE - off); - vaddr = kmap_atomic(page); - memcpy(buf + bytes_read, vaddr + off, bytes_to_read); - kunmap_atomic(vaddr); + memcpy_from_page(buf + bytes_read, page, off, bytes_to_read); pagenum++; off = 0; /* page offset only applies to the first page */ @@ -80,8 +77,7 @@ void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len) off += node->page_offset; page = node->page[0]; - memcpy(kmap(page) + off, buf, len); - kunmap(page); + memcpy_to_page(page, off, buf, len); set_page_dirty(page); } @@ -105,8 +101,7 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len) off += node->page_offset; page = node->page[0]; - memset(kmap(page) + off, 0, len); - kunmap(page); + memzero_page(page, off, len); set_page_dirty(page); } @@ -123,9 +118,7 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, src_page = src_node->page[0]; dst_page = dst_node->page[0]; - memcpy(kmap(dst_page) + dst, kmap(src_page) + src, len); - kunmap(src_page); - kunmap(dst_page); + memcpy_page(dst_page, dst, src_page, src, len); set_page_dirty(dst_page); } @@ -140,9 +133,9 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) src += node->page_offset; dst += node->page_offset; page = node->page[0]; - ptr = kmap(page); + ptr = kmap_local_page(page); memmove(ptr + dst, ptr + src, len); - kunmap(page); + kunmap_local(ptr); set_page_dirty(page); } @@ -346,13 +339,14 @@ struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num) if (!test_bit(HFS_BNODE_NEW, &node->flags)) return node; - desc = (struct hfs_bnode_desc *)(kmap(node->page[0]) + node->page_offset); + desc = (struct hfs_bnode_desc *)(kmap_local_page(node->page[0]) + + node->page_offset); node->prev = be32_to_cpu(desc->prev); node->next = be32_to_cpu(desc->next); node->num_recs = be16_to_cpu(desc->num_recs); node->type = desc->type; node->height = desc->height; - kunmap(node->page[0]); + kunmap_local(desc); switch (node->type) { case HFS_NODE_HEADER: @@ -436,14 +430,12 @@ struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num) } pagep = node->page; - memset(kmap(*pagep) + node->page_offset, 0, - min((int)PAGE_SIZE, (int)tree->node_size)); + memzero_page(*pagep, node->page_offset, + min((int)PAGE_SIZE, (int)tree->node_size)); set_page_dirty(*pagep); - kunmap(*pagep); for (i = 1; i < tree->pages_per_bnode; i++) { - memset(kmap(*++pagep), 0, PAGE_SIZE); + memzero_page(*++pagep, 0, PAGE_SIZE); set_page_dirty(*pagep); - kunmap(*pagep); } clear_bit(HFS_BNODE_NEW, &node->flags); wake_up(&node->lock_wq); From 21490eff121555b123f7088b935e40ee43d2c642 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Sun, 21 Aug 2022 20:04:00 +0200 Subject: [PATCH 25/46] hfs: replace kmap() with kmap_local_page() in btree.c kmap() is being deprecated in favor of kmap_local_page(). Two main problems with kmap(): (1) It comes with an overhead as mapping space is restricted and protected by a global lock for synchronization and (2) it also requires global TLB invalidation when the kmap's pool wraps and it might block when the mapping space is fully utilized until a slot becomes available. With kmap_local_page() the mappings are per thread, CPU local, can take page faults, and can be called from any context (including interrupts). It is faster than kmap() in kernels with HIGHMEM enabled. Furthermore, the tasks can be preempted and, when they are scheduled to run again, the kernel virtual addresses are restored and still valid. Since its use in btree.c is safe everywhere, it should be preferred. Therefore, replace kmap() with kmap_local_page() in btree.c. Where possible, use the suited standard helpers (memzero_page(), memcpy_page()) instead of open coding kmap_local_page() plus memset() or memcpy(). Tested in a QEMU/KVM x86_32 VM, 6GB RAM, booting a kernel with HIGHMEM64GB enabled. Link: https://lkml.kernel.org/r/20220821180400.8198-4-fmdefrancesco@gmail.com Signed-off-by: Fabio M. De Francesco Suggested-by: Ira Weiny Reviewed-by: Viacheslav Dubeyko Cc: Arnd Bergmann Cc: Chaitanya Kulkarni Cc: Christian Brauner (Microsoft) Cc: Damien Le Moal Cc: Jeff Layton Cc: Jens Axboe Cc: Kees Cook Cc: Martin K. Petersen Cc: Matthew Wilcox Cc: Muchun Song Cc: Roman Gushchin Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- fs/hfs/btree.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index 56c6782436e9c..2fa4b1f8cc7fb 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -80,7 +80,8 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke goto free_inode; /* Load the header */ - head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc)); + head = (struct hfs_btree_header_rec *)(kmap_local_page(page) + + sizeof(struct hfs_bnode_desc)); tree->root = be32_to_cpu(head->root); tree->leaf_count = be32_to_cpu(head->leaf_count); tree->leaf_head = be32_to_cpu(head->leaf_head); @@ -119,12 +120,12 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke tree->node_size_shift = ffs(size) - 1; tree->pages_per_bnode = (tree->node_size + PAGE_SIZE - 1) >> PAGE_SHIFT; - kunmap(page); + kunmap_local(head); put_page(page); return tree; fail_page: - kunmap(page); + kunmap_local(head); put_page(page); free_inode: tree->inode->i_mapping->a_ops = &hfs_aops; @@ -170,7 +171,8 @@ void hfs_btree_write(struct hfs_btree *tree) return; /* Load the header */ page = node->page[0]; - head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc)); + head = (struct hfs_btree_header_rec *)(kmap_local_page(page) + + sizeof(struct hfs_bnode_desc)); head->root = cpu_to_be32(tree->root); head->leaf_count = cpu_to_be32(tree->leaf_count); @@ -181,7 +183,7 @@ void hfs_btree_write(struct hfs_btree *tree) head->attributes = cpu_to_be32(tree->attributes); head->depth = cpu_to_be16(tree->depth); - kunmap(page); + kunmap_local(head); set_page_dirty(page); hfs_bnode_put(node); } @@ -269,7 +271,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) off += node->page_offset; pagep = node->page + (off >> PAGE_SHIFT); - data = kmap(*pagep); + data = kmap_local_page(*pagep); off &= ~PAGE_MASK; idx = 0; @@ -282,7 +284,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) idx += i; data[off] |= m; set_page_dirty(*pagep); - kunmap(*pagep); + kunmap_local(data); tree->free_nodes--; mark_inode_dirty(tree->inode); hfs_bnode_put(node); @@ -291,14 +293,14 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) } } if (++off >= PAGE_SIZE) { - kunmap(*pagep); - data = kmap(*++pagep); + kunmap_local(data); + data = kmap_local_page(*++pagep); off = 0; } idx += 8; len--; } - kunmap(*pagep); + kunmap_local(data); nidx = node->next; if (!nidx) { printk(KERN_DEBUG "create new bmap node...\n"); @@ -314,7 +316,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) off = off16; off += node->page_offset; pagep = node->page + (off >> PAGE_SHIFT); - data = kmap(*pagep); + data = kmap_local_page(*pagep); off &= ~PAGE_MASK; } } @@ -361,20 +363,20 @@ void hfs_bmap_free(struct hfs_bnode *node) } off += node->page_offset + nidx / 8; page = node->page[off >> PAGE_SHIFT]; - data = kmap(page); + data = kmap_local_page(page); off &= ~PAGE_MASK; m = 1 << (~nidx & 7); byte = data[off]; if (!(byte & m)) { pr_crit("trying to free free bnode %u(%d)\n", node->this, node->type); - kunmap(page); + kunmap_local(data); hfs_bnode_put(node); return; } data[off] = byte & ~m; set_page_dirty(page); - kunmap(page); + kunmap_local(data); hfs_bnode_put(node); tree->free_nodes++; mark_inode_dirty(tree->inode); From e1d7c7609ae0933b59840390c1b207ac0a925c8b Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 22 Aug 2022 16:38:51 +0200 Subject: [PATCH 26/46] bitops: use try_cmpxchg in set_mask_bits and bit_clear_unless Use try_cmpxchg instead of cmpxchg (*ptr, old, new) == old in set_mask_bits and bit_clear_unless. x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg (and related move instruction in front of cmpxchg). Also, try_cmpxchg implicitly assigns old *ptr value to "old" when cmpxchg fails, enabling further code simplifications. Link: https://lkml.kernel.org/r/20220822143851.3290-1-ubizjak@gmail.com Signed-off-by: Uros Bizjak Signed-off-by: Andrew Morton --- include/linux/bitops.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 3b89c64bcfd8f..fb24a513d9172 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -328,10 +328,10 @@ static __always_inline void __assign_bit(long nr, volatile unsigned long *addr, const typeof(*(ptr)) mask__ = (mask), bits__ = (bits); \ typeof(*(ptr)) old__, new__; \ \ + old__ = READ_ONCE(*(ptr)); \ do { \ - old__ = READ_ONCE(*(ptr)); \ new__ = (old__ & ~mask__) | bits__; \ - } while (cmpxchg(ptr, old__, new__) != old__); \ + } while (!try_cmpxchg(ptr, &old__, new__)); \ \ old__; \ }) @@ -343,11 +343,12 @@ static __always_inline void __assign_bit(long nr, volatile unsigned long *addr, const typeof(*(ptr)) clear__ = (clear), test__ = (test);\ typeof(*(ptr)) old__, new__; \ \ + old__ = READ_ONCE(*(ptr)); \ do { \ - old__ = READ_ONCE(*(ptr)); \ + if (old__ & test__) \ + break; \ new__ = old__ & ~clear__; \ - } while (!(old__ & test__) && \ - cmpxchg(ptr, old__, new__) != old__); \ + } while (!try_cmpxchg(ptr, &old__, new__)); \ \ !(old__ & test__); \ }) From 88040e67b9533003cfd1a2d61ebd17593435322c Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 18 Aug 2022 22:59:36 +0200 Subject: [PATCH 27/46] alpha: move from strlcpy with unused retval to strscpy Follow the advice of the below link and prefer 'strscpy' in this subsystem. Conversion is 1:1 because the return value is not used. Generated by a coccinelle script. Link: https://lore.kernel.org/r/CAHk-=wgfRnXz0W3D37d01q3JFkr_i_uTL=V6A6G1oUZcprmknw@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220818205936.6144-1-wsa+renesas@sang-engineering.com Signed-off-by: Wolfram Sang Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Richard Henderson Signed-off-by: Andrew Morton --- arch/alpha/kernel/setup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c index b4fbbba30aa2b..33bf3a6270027 100644 --- a/arch/alpha/kernel/setup.c +++ b/arch/alpha/kernel/setup.c @@ -491,9 +491,9 @@ setup_arch(char **cmdline_p) boot flags depending on the boot mode, we need some shorthand. This should do for installation. */ if (strcmp(COMMAND_LINE, "INSTALL") == 0) { - strlcpy(command_line, "root=/dev/fd0 load_ramdisk=1", sizeof command_line); + strscpy(command_line, "root=/dev/fd0 load_ramdisk=1", sizeof(command_line)); } else { - strlcpy(command_line, COMMAND_LINE, sizeof command_line); + strscpy(command_line, COMMAND_LINE, sizeof(command_line)); } strcpy(boot_command_line, command_line); *cmdline_p = command_line; From 216e71f13c13a7b3df352742554445907011a3a5 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 18 Aug 2022 22:59:39 +0200 Subject: [PATCH 28/46] ia64: move from strlcpy with unused retval to strscpy Follow the advice of the below link and prefer 'strscpy' in this subsystem. Conversion is 1:1 because the return value is not used. Generated by a coccinelle script. Link: https://lore.kernel.org/r/CAHk-=wgfRnXz0W3D37d01q3JFkr_i_uTL=V6A6G1oUZcprmknw@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220818205940.6216-1-wsa+renesas@sang-engineering.com Signed-off-by: Wolfram Sang Signed-off-by: Andrew Morton --- arch/ia64/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index fd6301eafa9d5..c057280442727 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -552,7 +552,7 @@ setup_arch (char **cmdline_p) ia64_patch_vtop((u64) __start___vtop_patchlist, (u64) __end___vtop_patchlist); *cmdline_p = __va(ia64_boot_param->command_line); - strlcpy(boot_command_line, *cmdline_p, COMMAND_LINE_SIZE); + strscpy(boot_command_line, *cmdline_p, COMMAND_LINE_SIZE); efi_init(); io_port_init(); From c97e21fe91ed1d59eb36cac1728bcb9a82167c7a Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 18 Aug 2022 23:01:13 +0200 Subject: [PATCH 29/46] ocfs2: move from strlcpy with unused retval to strscpy Follow the advice of the below link and prefer 'strscpy' in this subsystem. Conversion is 1:1 because the return value is not used. Generated by a coccinelle script. Link: https://lore.kernel.org/r/CAHk-=wgfRnXz0W3D37d01q3JFkr_i_uTL=V6A6G1oUZcprmknw@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220818210123.7637-4-wsa+renesas@sang-engineering.com Signed-off-by: Wolfram Sang Acked-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/stackglue.c | 4 ++-- fs/ocfs2/super.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index dd77b7aaabf5c..317126261523b 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -334,10 +334,10 @@ int ocfs2_cluster_connect(const char *stack_name, goto out; } - strlcpy(new_conn->cc_name, group, GROUP_NAME_MAX + 1); + strscpy(new_conn->cc_name, group, GROUP_NAME_MAX + 1); new_conn->cc_namelen = grouplen; if (cluster_name_len) - strlcpy(new_conn->cc_cluster_name, cluster_name, + strscpy(new_conn->cc_cluster_name, cluster_name, CLUSTER_NAME_MAX + 1); new_conn->cc_cluster_name_len = cluster_name_len; new_conn->cc_recovery_handler = recovery_handler; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index e2cc9eec287c9..660bc1795848a 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2221,7 +2221,7 @@ static int ocfs2_initialize_super(struct super_block *sb, goto out_journal; } - strlcpy(osb->vol_label, di->id2.i_super.s_label, + strscpy(osb->vol_label, di->id2.i_super.s_label, OCFS2_MAX_VOL_LABEL_LEN); osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno); osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno); From 512cb7e4c110133e49da9f69885df3ed41aa284f Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 18 Aug 2022 23:01:53 +0200 Subject: [PATCH 30/46] reiserfs: move from strlcpy with unused retval to strscpy Follow the advice of the below link and prefer 'strscpy' in this subsystem. Conversion is 1:1 because the return value is not used. Generated by a coccinelle script. Link: https://lore.kernel.org/r/CAHk-=wgfRnXz0W3D37d01q3JFkr_i_uTL=V6A6G1oUZcprmknw@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220818210153.8095-1-wsa+renesas@sang-engineering.com Signed-off-by: Wolfram Sang Cc: Jan Kara Signed-off-by: Andrew Morton --- fs/reiserfs/procfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index 4a7cb16e9345c..3dba8acf4e832 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -411,7 +411,7 @@ int reiserfs_proc_info_init(struct super_block *sb) char *s; /* Some block devices use /'s */ - strlcpy(b, sb->s_id, BDEVNAME_SIZE); + strscpy(b, sb->s_id, BDEVNAME_SIZE); s = strchr(b, '/'); if (s) *s = '!'; @@ -441,7 +441,7 @@ int reiserfs_proc_info_done(struct super_block *sb) char *s; /* Some block devices use /'s */ - strlcpy(b, sb->s_id, BDEVNAME_SIZE); + strscpy(b, sb->s_id, BDEVNAME_SIZE); s = strchr(b, '/'); if (s) *s = '!'; From a1d3a6d9f243797d1bcaa0ca14c03396bc302ca6 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 18 Aug 2022 23:01:59 +0200 Subject: [PATCH 31/46] init: move from strlcpy with unused retval to strscpy Follow the advice of the below link and prefer 'strscpy' in this subsystem. Conversion is 1:1 because the return value is not used. Generated by a coccinelle script. Link: https://lore.kernel.org/r/CAHk-=wgfRnXz0W3D37d01q3JFkr_i_uTL=V6A6G1oUZcprmknw@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220818210200.8203-1-wsa+renesas@sang-engineering.com Signed-off-by: Wolfram Sang Signed-off-by: Andrew Morton --- init/do_mounts.c | 4 ++-- init/main.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/init/do_mounts.c b/init/do_mounts.c index 7058e14ad5f70..811e94daf0a84 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -296,7 +296,7 @@ EXPORT_SYMBOL_GPL(name_to_dev_t); static int __init root_dev_setup(char *line) { - strlcpy(saved_root_name, line, sizeof(saved_root_name)); + strscpy(saved_root_name, line, sizeof(saved_root_name)); return 1; } @@ -343,7 +343,7 @@ static int __init split_fs_names(char *page, size_t size, char *names) int count = 1; char *p = page; - strlcpy(p, root_fs_names, size); + strscpy(p, root_fs_names, size); while (*p++) { if (p[-1] == ',') { p[-1] = '\0'; diff --git a/init/main.c b/init/main.c index 1fe7942f5d4a8..a45f9eca40af0 100644 --- a/init/main.c +++ b/init/main.c @@ -422,7 +422,7 @@ static void __init setup_boot_config(void) if (!data) data = xbc_get_embedded_bootconfig(&size); - strlcpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE); + strscpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE); err = parse_args("bootconfig", tmp_cmdline, NULL, 0, 0, 0, NULL, bootconfig_params); @@ -762,7 +762,7 @@ void __init parse_early_param(void) return; /* All fall through to do_early_param. */ - strlcpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE); + strscpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE); parse_early_options(tmp_cmdline); done = 1; } From 977bbf4385fc64986f22b024858071a35c481a8a Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 18 Aug 2022 23:02:03 +0200 Subject: [PATCH 32/46] lib: move from strlcpy with unused retval to strscpy Follow the advice of the below link and prefer 'strscpy' in this subsystem. Conversion is 1:1 because the return value is not used. Generated by a coccinelle script. Link: https://lore.kernel.org/r/CAHk-=wgfRnXz0W3D37d01q3JFkr_i_uTL=V6A6G1oUZcprmknw@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220818210203.8251-1-wsa+renesas@sang-engineering.com Signed-off-by: Wolfram Sang Signed-off-by: Andrew Morton --- lib/earlycpio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/earlycpio.c b/lib/earlycpio.c index 7921193f04243..d2c37d64fd0c3 100644 --- a/lib/earlycpio.c +++ b/lib/earlycpio.c @@ -126,7 +126,7 @@ struct cpio_data find_cpio_data(const char *path, void *data, "File %s exceeding MAX_CPIO_FILE_NAME [%d]\n", p, MAX_CPIO_FILE_NAME); } - strlcpy(cd.name, p + mypathsize, MAX_CPIO_FILE_NAME); + strscpy(cd.name, p + mypathsize, MAX_CPIO_FILE_NAME); cd.data = (void *)dptr; cd.size = ch[C_FILESIZE]; From 5fdfa161b2043001f82cbce49e87e8e9f581d510 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Tue, 23 Aug 2022 17:26:32 +0200 Subject: [PATCH 33/46] task_work: use try_cmpxchg in task_work_add, task_work_cancel_match and task_work_run Use try_cmpxchg instead of cmpxchg (*ptr, old, new) == old in task_work_add, task_work_cancel_match and task_work_run. x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg (and related move instruction in front of cmpxchg). Also, atomic_try_cmpxchg implicitly assigns old *ptr value to "old" when cmpxchg fails, enabling further code simplifications. The patch avoids extra memory read in case cmpxchg fails. Link: https://lkml.kernel.org/r/20220823152632.4517-1-ubizjak@gmail.com Signed-off-by: Uros Bizjak Signed-off-by: Andrew Morton --- kernel/task_work.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/kernel/task_work.c b/kernel/task_work.c index dff75bcde1514..065e1ef8fc8d7 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -47,12 +47,12 @@ int task_work_add(struct task_struct *task, struct callback_head *work, /* record the work call stack in order to print it in KASAN reports */ kasan_record_aux_stack(work); + head = READ_ONCE(task->task_works); do { - head = READ_ONCE(task->task_works); if (unlikely(head == &work_exited)) return -ESRCH; work->next = head; - } while (cmpxchg(&task->task_works, head, work) != head); + } while (!try_cmpxchg(&task->task_works, &head, work)); switch (notify) { case TWA_NONE: @@ -100,10 +100,12 @@ task_work_cancel_match(struct task_struct *task, * we raced with task_work_run(), *pprev == NULL/exited. */ raw_spin_lock_irqsave(&task->pi_lock, flags); - while ((work = READ_ONCE(*pprev))) { - if (!match(work, data)) + work = READ_ONCE(*pprev); + while (work) { + if (!match(work, data)) { pprev = &work->next; - else if (cmpxchg(pprev, work, work->next) == work) + work = READ_ONCE(*pprev); + } else if (try_cmpxchg(pprev, &work, work->next)) break; } raw_spin_unlock_irqrestore(&task->pi_lock, flags); @@ -151,16 +153,16 @@ void task_work_run(void) * work->func() can do task_work_add(), do not set * work_exited unless the list is empty. */ + work = READ_ONCE(task->task_works); do { head = NULL; - work = READ_ONCE(task->task_works); if (!work) { if (task->flags & PF_EXITING) head = &work_exited; else break; } - } while (cmpxchg(&task->task_works, work, head) != work); + } while (!try_cmpxchg(&task->task_works, &work, head)); if (!work) break; From 9a15193e23b780d1da77e3db18698beb0637897d Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 25 Aug 2022 16:56:03 +0200 Subject: [PATCH 34/46] smpboot: use atomic_try_cmpxchg in cpu_wait_death and cpu_report_death Use atomic_try_cmpxchg instead of atomic_cmpxchg (*ptr, old, new) == old in cpu_wait_death and cpu_report_death. x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg (and related move instruction in front of cmpxchg). Also, atomic_try_cmpxchg implicitly assigns old *ptr value to "old" when cmpxchg fails, enabling further code simplifications. No functional change intended. Link: https://lkml.kernel.org/r/20220825145603.5811-1-ubizjak@gmail.com Signed-off-by: Uros Bizjak Signed-off-by: Andrew Morton --- kernel/smpboot.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/kernel/smpboot.c b/kernel/smpboot.c index b9f54544e7499..2c7396da470c5 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -433,7 +433,7 @@ bool cpu_wait_death(unsigned int cpu, int seconds) /* The outgoing CPU will normally get done quite quickly. */ if (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) == CPU_DEAD) - goto update_state; + goto update_state_early; udelay(5); /* But if the outgoing CPU dawdles, wait increasingly long times. */ @@ -444,16 +444,17 @@ bool cpu_wait_death(unsigned int cpu, int seconds) break; sleep_jf = DIV_ROUND_UP(sleep_jf * 11, 10); } -update_state: +update_state_early: oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu)); +update_state: if (oldstate == CPU_DEAD) { /* Outgoing CPU died normally, update state. */ smp_mb(); /* atomic_read() before update. */ atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_POST_DEAD); } else { /* Outgoing CPU still hasn't died, set state accordingly. */ - if (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu), - oldstate, CPU_BROKEN) != oldstate) + if (!atomic_try_cmpxchg(&per_cpu(cpu_hotplug_state, cpu), + &oldstate, CPU_BROKEN)) goto update_state; ret = false; } @@ -475,14 +476,14 @@ bool cpu_report_death(void) int newstate; int cpu = smp_processor_id(); + oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu)); do { - oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu)); if (oldstate != CPU_BROKEN) newstate = CPU_DEAD; else newstate = CPU_DEAD_FROZEN; - } while (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu), - oldstate, newstate) != oldstate); + } while (!atomic_try_cmpxchg(&per_cpu(cpu_hotplug_state, cpu), + &oldstate, newstate)); return newstate == CPU_DEAD; } From f81259c6dbcefb255fa473090cd975f3827bca89 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Fri, 26 Aug 2022 15:33:35 +0800 Subject: [PATCH 35/46] fail_function: switch to memdup_user_nul() helper Use memdup_user_nul() helper instead of open-coding to simplify the code. Link: https://lkml.kernel.org/r/20220826073337.2085798-1-yangyingliang@huawei.com Signed-off-by: Yang Yingliang Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton --- kernel/fail_function.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/kernel/fail_function.c b/kernel/fail_function.c index 60dc825ecc2b3..03643e33e4c33 100644 --- a/kernel/fail_function.c +++ b/kernel/fail_function.c @@ -247,15 +247,11 @@ static ssize_t fei_write(struct file *file, const char __user *buffer, /* cut off if it is too long */ if (count > KSYM_NAME_LEN) count = KSYM_NAME_LEN; - buf = kmalloc(count + 1, GFP_KERNEL); - if (!buf) - return -ENOMEM; - if (copy_from_user(buf, buffer, count)) { - ret = -EFAULT; - goto out_free; - } - buf[count] = '\0'; + buf = memdup_user_nul(buffer, count); + if (IS_ERR(buf)) + return PTR_ERR(buf); + sym = strstrip(buf); mutex_lock(&fei_lock); @@ -308,7 +304,6 @@ static ssize_t fei_write(struct file *file, const char __user *buffer, } out: mutex_unlock(&fei_lock); -out_free: kfree(buf); return ret; } From cef9f5f866ad45a2dd64fed6e6b657043c2c6f17 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Fri, 26 Aug 2022 15:33:36 +0800 Subject: [PATCH 36/46] fail_function: refactor code of checking return value of register_kprobe() Refactor the error handling of register_kprobe() to improve readability. No functional change. Link: https://lkml.kernel.org/r/20220826073337.2085798-2-yangyingliang@huawei.com Signed-off-by: Yang Yingliang Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton --- kernel/fail_function.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/kernel/fail_function.c b/kernel/fail_function.c index 03643e33e4c33..893e8f9a91189 100644 --- a/kernel/fail_function.c +++ b/kernel/fail_function.c @@ -294,14 +294,13 @@ static ssize_t fei_write(struct file *file, const char __user *buffer, } ret = register_kprobe(&attr->kp); - if (!ret) - fei_debugfs_add_attr(attr); - if (ret < 0) + if (ret) { fei_attr_remove(attr); - else { - list_add_tail(&attr->list, &fei_attr_list); - ret = count; + goto out; } + fei_debugfs_add_attr(attr); + list_add_tail(&attr->list, &fei_attr_list); + ret = count; out: mutex_unlock(&fei_lock); kfree(buf); From d2e85432a2e0a6f31bd9489800f443228f020ed6 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Fri, 26 Aug 2022 15:33:37 +0800 Subject: [PATCH 37/46] fail_function: fix wrong use of fei_attr_remove() If register_kprobe() fails, the new attr is not added to the list yet, so it should call fei_attr_free() intstead. Link: https://lkml.kernel.org/r/20220826073337.2085798-3-yangyingliang@huawei.com Fixes: 4b1a29a7f542 ("error-injection: Support fault injection framework") Signed-off-by: Yang Yingliang Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton --- kernel/fail_function.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/fail_function.c b/kernel/fail_function.c index 893e8f9a91189..a7ccd2930c5f4 100644 --- a/kernel/fail_function.c +++ b/kernel/fail_function.c @@ -295,7 +295,7 @@ static ssize_t fei_write(struct file *file, const char __user *buffer, ret = register_kprobe(&attr->kp); if (ret) { - fei_attr_remove(attr); + fei_attr_free(attr); goto out; } fei_debugfs_add_attr(attr); From 199cda13534f4c676d7e4601665e971f4f0582c4 Mon Sep 17 00:00:00 2001 From: wuchi Date: Sat, 27 Aug 2022 15:11:16 +0800 Subject: [PATCH 38/46] initramfs: mark my_inptr as __initdata As my_inptr is only used in __init function unpack_to_rootfs(), mark it as __initdata to allow it be freed after boot. Link: https://lkml.kernel.org/r/20220827071116.83078-1-wuchi.zero@gmail.com Signed-off-by: wuchi Reviewed-by: David Disseldorp Cc: Alexander Viro Cc: Martin Wilck Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- init/initramfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/init/initramfs.c b/init/initramfs.c index 18229cfe8906b..2f5bfb7d76521 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -482,7 +482,7 @@ static long __init flush_buffer(void *bufv, unsigned long len) return origLen; } -static unsigned long my_inptr; /* index of next byte to be processed in inbuf */ +static unsigned long my_inptr __initdata; /* index of next byte to be processed in inbuf */ #include From d85a1bec8e8d552ab13163ca1874dcd82f3d1550 Mon Sep 17 00:00:00 2001 From: Hawkins Jiawei Date: Thu, 1 Sep 2022 00:09:34 +0800 Subject: [PATCH 39/46] ntfs: fix use-after-free in ntfs_attr_find() Patch series "ntfs: fix bugs about Attribute", v2. This patchset fixes three bugs relative to Attribute in record: Patch 1 adds a sanity check to ensure that, attrs_offset field in first mft record loading from disk is within bounds. Patch 2 moves the ATTR_RECORD's bounds checking earlier, to avoid dereferencing ATTR_RECORD before checking this ATTR_RECORD is within bounds. Patch 3 adds an overflow checking to avoid possible forever loop in ntfs_attr_find(). Without patch 1 and patch 2, the kernel triggersa KASAN use-after-free detection as reported by Syzkaller. Although one of patch 1 or patch 2 can fix this, we still need both of them. Because patch 1 fixes the root cause, and patch 2 not only fixes the direct cause, but also fixes the potential out-of-bounds bug. This patch (of 3): Syzkaller reported use-after-free read as follows: ================================================================== BUG: KASAN: use-after-free in ntfs_attr_find+0xc02/0xce0 fs/ntfs/attrib.c:597 Read of size 2 at addr ffff88807e352009 by task syz-executor153/3607 [...] Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 print_address_description mm/kasan/report.c:317 [inline] print_report.cold+0x2ba/0x719 mm/kasan/report.c:433 kasan_report+0xb1/0x1e0 mm/kasan/report.c:495 ntfs_attr_find+0xc02/0xce0 fs/ntfs/attrib.c:597 ntfs_attr_lookup+0x1056/0x2070 fs/ntfs/attrib.c:1193 ntfs_read_inode_mount+0x89a/0x2580 fs/ntfs/inode.c:1845 ntfs_fill_super+0x1799/0x9320 fs/ntfs/super.c:2854 mount_bdev+0x34d/0x410 fs/super.c:1400 legacy_get_tree+0x105/0x220 fs/fs_context.c:610 vfs_get_tree+0x89/0x2f0 fs/super.c:1530 do_new_mount fs/namespace.c:3040 [inline] path_mount+0x1326/0x1e20 fs/namespace.c:3370 do_mount fs/namespace.c:3383 [inline] __do_sys_mount fs/namespace.c:3591 [inline] __se_sys_mount fs/namespace.c:3568 [inline] __x64_sys_mount+0x27f/0x300 fs/namespace.c:3568 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd [...] The buggy address belongs to the physical page: page:ffffea0001f8d400 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x7e350 head:ffffea0001f8d400 order:3 compound_mapcount:0 compound_pincount:0 flags: 0xfff00000010200(slab|head|node=0|zone=1|lastcpupid=0x7ff) raw: 00fff00000010200 0000000000000000 dead000000000122 ffff888011842140 raw: 0000000000000000 0000000000040004 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff88807e351f00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff88807e351f80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc >ffff88807e352000: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff88807e352080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88807e352100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ================================================================== Kernel will loads $MFT/$DATA's first mft record in ntfs_read_inode_mount(). Yet the problem is that after loading, kernel doesn't check whether attrs_offset field is a valid value. To be more specific, if attrs_offset field is larger than bytes_allocated field, then it may trigger the out-of-bounds read bug(reported as use-after-free bug) in ntfs_attr_find(), when kernel tries to access the corresponding mft record's attribute. This patch solves it by adding the sanity check between attrs_offset field and bytes_allocated field, after loading the first mft record. Link: https://lkml.kernel.org/r/20220831160935.3409-1-yin31149@gmail.com Link: https://lkml.kernel.org/r/20220831160935.3409-2-yin31149@gmail.com Signed-off-by: Hawkins Jiawei Cc: Anton Altaparmakov Cc: ChenXiaoSong Cc: syzkaller-bugs Cc: Dan Carpenter Signed-off-by: Andrew Morton --- fs/ntfs/inode.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index db0f1995aedd1..08c659332e26b 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -1829,6 +1829,13 @@ int ntfs_read_inode_mount(struct inode *vi) goto err_out; } + /* Sanity check offset to the first attribute */ + if (le16_to_cpu(m->attrs_offset) >= le32_to_cpu(m->bytes_allocated)) { + ntfs_error(sb, "Incorrect mft offset to the first attribute %u in superblock.", + le16_to_cpu(m->attrs_offset)); + goto err_out; + } + /* Need this to sanity check attribute list references to $MFT. */ vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number); From 36a4d82dddbbd421d2b8e79e1cab68c8126d5075 Mon Sep 17 00:00:00 2001 From: Hawkins Jiawei Date: Thu, 1 Sep 2022 00:09:36 +0800 Subject: [PATCH 40/46] ntfs: fix out-of-bounds read in ntfs_attr_find() Kernel iterates over ATTR_RECORDs in mft record in ntfs_attr_find(). To ensure access on these ATTR_RECORDs are within bounds, kernel will do some checking during iteration. The problem is that during checking whether ATTR_RECORD's name is within bounds, kernel will dereferences the ATTR_RECORD name_offset field, before checking this ATTR_RECORD strcture is within bounds. This problem may result out-of-bounds read in ntfs_attr_find(), reported by Syzkaller: ================================================================== BUG: KASAN: use-after-free in ntfs_attr_find+0xc02/0xce0 fs/ntfs/attrib.c:597 Read of size 2 at addr ffff88807e352009 by task syz-executor153/3607 [...] Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 print_address_description mm/kasan/report.c:317 [inline] print_report.cold+0x2ba/0x719 mm/kasan/report.c:433 kasan_report+0xb1/0x1e0 mm/kasan/report.c:495 ntfs_attr_find+0xc02/0xce0 fs/ntfs/attrib.c:597 ntfs_attr_lookup+0x1056/0x2070 fs/ntfs/attrib.c:1193 ntfs_read_inode_mount+0x89a/0x2580 fs/ntfs/inode.c:1845 ntfs_fill_super+0x1799/0x9320 fs/ntfs/super.c:2854 mount_bdev+0x34d/0x410 fs/super.c:1400 legacy_get_tree+0x105/0x220 fs/fs_context.c:610 vfs_get_tree+0x89/0x2f0 fs/super.c:1530 do_new_mount fs/namespace.c:3040 [inline] path_mount+0x1326/0x1e20 fs/namespace.c:3370 do_mount fs/namespace.c:3383 [inline] __do_sys_mount fs/namespace.c:3591 [inline] __se_sys_mount fs/namespace.c:3568 [inline] __x64_sys_mount+0x27f/0x300 fs/namespace.c:3568 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd [...] The buggy address belongs to the physical page: page:ffffea0001f8d400 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x7e350 head:ffffea0001f8d400 order:3 compound_mapcount:0 compound_pincount:0 flags: 0xfff00000010200(slab|head|node=0|zone=1|lastcpupid=0x7ff) raw: 00fff00000010200 0000000000000000 dead000000000122 ffff888011842140 raw: 0000000000000000 0000000000040004 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff88807e351f00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff88807e351f80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc >ffff88807e352000: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff88807e352080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88807e352100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ================================================================== This patch solves it by moving the ATTR_RECORD strcture's bounds checking earlier, then checking whether ATTR_RECORD's name is within bounds. What's more, this patch also add some comments to improve its maintainability. Link: https://lkml.kernel.org/r/20220831160935.3409-3-yin31149@gmail.com Link: https://lore.kernel.org/all/1636796c-c85e-7f47-e96f-e074fee3c7d3@huawei.com/ Link: https://groups.google.com/g/syzkaller-bugs/c/t_XdeKPGTR4/m/LECAuIGcBgAJ Signed-off-by: chenxiaosong (A) Signed-off-by: Dan Carpenter Signed-off-by: Hawkins Jiawei Reported-by: syzbot+5f8dcabe4a3b2c51c607@syzkaller.appspotmail.com Tested-by: syzbot+5f8dcabe4a3b2c51c607@syzkaller.appspotmail.com Cc: Anton Altaparmakov Cc: syzkaller-bugs Signed-off-by: Andrew Morton --- fs/ntfs/attrib.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c index 52615e6090e1c..cec4be2a2d239 100644 --- a/fs/ntfs/attrib.c +++ b/fs/ntfs/attrib.c @@ -594,11 +594,23 @@ static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name, for (;; a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))) { u8 *mrec_end = (u8 *)ctx->mrec + le32_to_cpu(ctx->mrec->bytes_allocated); - u8 *name_end = (u8 *)a + le16_to_cpu(a->name_offset) + - a->name_length * sizeof(ntfschar); - if ((u8*)a < (u8*)ctx->mrec || (u8*)a > mrec_end || - name_end > mrec_end) + u8 *name_end; + + /* check whether ATTR_RECORD wrap */ + if ((u8 *)a < (u8 *)ctx->mrec) + break; + + /* check whether Attribute Record Header is within bounds */ + if ((u8 *)a > mrec_end || + (u8 *)a + sizeof(ATTR_RECORD) > mrec_end) break; + + /* check whether ATTR_RECORD's name is within bounds */ + name_end = (u8 *)a + le16_to_cpu(a->name_offset) + + a->name_length * sizeof(ntfschar); + if (name_end > mrec_end) + break; + ctx->attr = a; if (unlikely(le32_to_cpu(a->type) > le32_to_cpu(type) || a->type == AT_END)) From 63095f4f3af59322bea984a6ae44337439348fe0 Mon Sep 17 00:00:00 2001 From: Hawkins Jiawei Date: Thu, 1 Sep 2022 00:09:38 +0800 Subject: [PATCH 41/46] ntfs: check overflow when iterating ATTR_RECORDs Kernel iterates over ATTR_RECORDs in mft record in ntfs_attr_find(). Because the ATTR_RECORDs are next to each other, kernel can get the next ATTR_RECORD from end address of current ATTR_RECORD, through current ATTR_RECORD length field. The problem is that during iteration, when kernel calculates the end address of current ATTR_RECORD, kernel may trigger an integer overflow bug in executing `a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))`. This may wrap, leading to a forever iteration on 32bit systems. This patch solves it by adding some checks on calculating end address of current ATTR_RECORD during iteration. Link: https://lkml.kernel.org/r/20220831160935.3409-4-yin31149@gmail.com Link: https://lore.kernel.org/all/20220827105842.GM2030@kadam/ Signed-off-by: Hawkins Jiawei Suggested-by: Dan Carpenter Cc: Anton Altaparmakov Cc: chenxiaosong (A) Cc: syzkaller-bugs Signed-off-by: Andrew Morton --- fs/ntfs/attrib.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c index cec4be2a2d239..a3865bc4a0c65 100644 --- a/fs/ntfs/attrib.c +++ b/fs/ntfs/attrib.c @@ -617,6 +617,14 @@ static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name, return -ENOENT; if (unlikely(!a->length)) break; + + /* check whether ATTR_RECORD's length wrap */ + if ((u8 *)a + le32_to_cpu(a->length) < (u8 *)a) + break; + /* check whether ATTR_RECORD's length is within bounds */ + if ((u8 *)a + le32_to_cpu(a->length) > mrec_end) + break; + if (a->type != type) continue; /* From 35783ccbe519b33f6652b2d7aafcfc82f10b1a1b Mon Sep 17 00:00:00 2001 From: wuchi Date: Thu, 1 Sep 2022 08:31:21 +0800 Subject: [PATCH 42/46] kernel/profile.c: simplify duplicated code in profile_setup() The code to parse option string "schedule/sleep/kvm" of cmdline in function profile_setup is redundant, so simplify that. Link: https://lkml.kernel.org/r/20220901003121.53597-1-wuchi.zero@gmail.com Signed-off-by: wuchi Reviewed-by: Andrew Morton Cc: Christoph Hellwig Signed-off-by: Andrew Morton --- kernel/profile.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/kernel/profile.c b/kernel/profile.c index 7ea01ba30e757..8a77769bc4b4c 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -59,43 +59,39 @@ int profile_setup(char *str) static const char schedstr[] = "schedule"; static const char sleepstr[] = "sleep"; static const char kvmstr[] = "kvm"; + const char *select = NULL; int par; if (!strncmp(str, sleepstr, strlen(sleepstr))) { #ifdef CONFIG_SCHEDSTATS force_schedstat_enabled(); prof_on = SLEEP_PROFILING; - if (str[strlen(sleepstr)] == ',') - str += strlen(sleepstr) + 1; - if (get_option(&str, &par)) - prof_shift = clamp(par, 0, BITS_PER_LONG - 1); - pr_info("kernel sleep profiling enabled (shift: %u)\n", - prof_shift); + select = sleepstr; #else pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n"); #endif /* CONFIG_SCHEDSTATS */ } else if (!strncmp(str, schedstr, strlen(schedstr))) { prof_on = SCHED_PROFILING; - if (str[strlen(schedstr)] == ',') - str += strlen(schedstr) + 1; - if (get_option(&str, &par)) - prof_shift = clamp(par, 0, BITS_PER_LONG - 1); - pr_info("kernel schedule profiling enabled (shift: %u)\n", - prof_shift); + select = schedstr; } else if (!strncmp(str, kvmstr, strlen(kvmstr))) { prof_on = KVM_PROFILING; - if (str[strlen(kvmstr)] == ',') - str += strlen(kvmstr) + 1; - if (get_option(&str, &par)) - prof_shift = clamp(par, 0, BITS_PER_LONG - 1); - pr_info("kernel KVM profiling enabled (shift: %u)\n", - prof_shift); + select = kvmstr; } else if (get_option(&str, &par)) { prof_shift = clamp(par, 0, BITS_PER_LONG - 1); prof_on = CPU_PROFILING; pr_info("kernel profiling enabled (shift: %u)\n", prof_shift); } + + if (select) { + if (str[strlen(select)] == ',') + str += strlen(select) + 1; + if (get_option(&str, &par)) + prof_shift = clamp(par, 0, BITS_PER_LONG - 1); + pr_info("kernel %s profiling enabled (shift: %u)\n", + select, prof_shift); + } + return 1; } __setup("profile=", profile_setup); From 7b9e664beb237d90bc600f117668227af5ce53ae Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 30 Aug 2022 20:27:13 +0300 Subject: [PATCH 43/46] asm-generic: make parameter types consistent in _unaligned_be48() There is a convention to use internal kernel types, so replace __u8 by u8. Link: https://lkml.kernel.org/r/20220830172713.43686-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Reviewed-by: Keith Busch Cc: Arnd Bergmann Signed-off-by: Andrew Morton --- include/asm-generic/unaligned.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-generic/unaligned.h b/include/asm-generic/unaligned.h index df30f11b4a460..699650f819706 100644 --- a/include/asm-generic/unaligned.h +++ b/include/asm-generic/unaligned.h @@ -126,7 +126,7 @@ static inline void put_unaligned_le24(const u32 val, void *p) __put_unaligned_le24(val, p); } -static inline void __put_unaligned_be48(const u64 val, __u8 *p) +static inline void __put_unaligned_be48(const u64 val, u8 *p) { *p++ = val >> 40; *p++ = val >> 32; From 8ea0114eda0c1c85f8f01922ac8fc1e489a61129 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Fri, 2 Sep 2022 13:19:23 +0200 Subject: [PATCH 44/46] checkpatch: handle FILE pointer type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When using a "FILE *" type, checkpatch considers this an error: ERROR: need consistent spacing around '*' (ctx:WxV) #32: FILE: f.c:8: +static void a(FILE *const b) ^ Fix this by explicitly defining "FILE" as a common type. This is useful for user space patches. With this patch, we now get: <_>WS( ) <_>IDENT(static) <_>WS( ) <_>DECLARE(void ) <_>FUNC(a) PAREN('(') <_>DECLARE(FILE *const ) <_>IDENT(b) <_>PAREN(')') -> V <_>WS( ) 32 > . static void a(FILE *const b) 32 > EEVVVVVVVTTTTTVNTTTTTTTTTTTTVVV 32 > ______________________________ Link: https://lkml.kernel.org/r/20220902111923.1488671-1-mic@digikod.net Link: https://lore.kernel.org/r/20220902111923.1488671-1-mic@digikod.net Signed-off-by: Mickaël Salaün Acked-by: Joe Perches Cc: Andy Whitcroft Cc: Dwaipayan Ray Cc: Lukas Bulwahn Signed-off-by: Andrew Morton --- scripts/checkpatch.pl | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 9ff219e0a9d56..18effbe1fe908 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -576,10 +576,14 @@ sub hash_show_words { (?:__)?(?:u|s|be|le)(?:8|16|32|64)| atomic_t )}; +our $typeStdioTypedefs = qr{(?x: + FILE +)}; our $typeTypedefs = qr{(?x: $typeC99Typedefs\b| $typeOtherOSTypedefs\b| - $typeKernelTypedefs\b + $typeKernelTypedefs\b| + $typeStdioTypedefs\b )}; our $zero_initializer = qr{(?:(?:0[xX])?0+$Int_type?|NULL|false)\b}; From bfca3dd3d0680fc2fc7f659a152234afbac26e4d Mon Sep 17 00:00:00 2001 From: Petr Vorel Date: Thu, 1 Sep 2022 21:44:03 +0200 Subject: [PATCH 45/46] kernel/utsname_sysctl.c: print kernel arch Print the machine hardware name (UTS_MACHINE) in /proc/sys/kernel/arch. This helps people who debug kernel with initramfs with minimal environment (i.e. without coreutils or even busybox) or allow to open sysfs file instead of run 'uname -m' in high level languages. Link: https://lkml.kernel.org/r/20220901194403.3819-1-pvorel@suse.cz Signed-off-by: Petr Vorel Acked-by: Greg Kroah-Hartman Cc: David Sterba Cc: "Eric W . Biederman" Cc: Rafael J. Wysocki Signed-off-by: Andrew Morton --- Documentation/admin-guide/sysctl/kernel.rst | 5 +++++ kernel/utsname_sysctl.c | 7 +++++++ 2 files changed, 12 insertions(+) diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index ee6572b1edada..bbaa851946956 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -65,6 +65,11 @@ combining the following values: 4 s3_beep = ======= +arch +==== + +The machine hardware name, the same output as ``uname -m`` +(e.g. ``x86_64`` or ``aarch64``). auto_msgmni =========== diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 4ca61d49885b6..7ffdd2cd5ff9b 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -73,6 +73,13 @@ static DEFINE_CTL_TABLE_POLL(hostname_poll); static DEFINE_CTL_TABLE_POLL(domainname_poll); static struct ctl_table uts_kern_table[] = { + { + .procname = "arch", + .data = init_uts_ns.name.machine, + .maxlen = sizeof(init_uts_ns.name.machine), + .mode = 0444, + .proc_handler = proc_do_uts_string, + }, { .procname = "ostype", .data = init_uts_ns.name.sysname, From b814751175470b00969a317bf3192260750f9455 Mon Sep 17 00:00:00 2001 From: wuchi Date: Sat, 3 Sep 2022 21:52:33 +0800 Subject: [PATCH 46/46] latencytop: use the last element of latency_record of system In account_global_scheduler_latency(), when we don't find the matching latency_record we try to select one which is unused in latency_record[MAXLR], but the condition will skip the last one. if (i >= MAXLR-1) Fix that. Link: https://lkml.kernel.org/r/20220903135233.5225-1-wuchi.zero@gmail.com Signed-off-by: wuchi Reviewed-by: Andrew Morton Cc: Alexander Viro Cc: Luis Chamberlain Signed-off-by: Andrew Morton --- kernel/latencytop.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 76166df011a4d..781249098cb6e 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -112,7 +112,7 @@ static void __sched account_global_scheduler_latency(struct task_struct *tsk, struct latency_record *lat) { - int firstnonnull = MAXLR + 1; + int firstnonnull = MAXLR; int i; /* skip kernel threads for now */ @@ -150,7 +150,7 @@ account_global_scheduler_latency(struct task_struct *tsk, } i = firstnonnull; - if (i >= MAXLR - 1) + if (i >= MAXLR) return; /* Allocted a new one: */