From f5b23d6704e478b5a97dbba5df9dea96a9cbf847 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Tue, 9 Aug 2022 22:31:02 +0200 Subject: [PATCH 01/81] hfsplus: unmap the page in the "fail_page" label MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "hfsplus: Replace kmap() with kmap_local_page()". kmap() is being deprecated in favor of kmap_local_page(). There are two main problems with kmap(): (1) It comes with an overhead as mapping space is restricted and protected by a global lock for synchronization and (2) it also requires global TLB invalidation when the kmap’s pool wraps and it might block when the mapping space is fully utilized until a slot becomes available. With kmap_local_page() the mappings are per thread, CPU local, can take page faults, and can be called from any context (including interrupts). It is faster than kmap() in kernels with HIGHMEM enabled. Furthermore, the tasks can be preempted and, when they are scheduled to run again, the kernel virtual addresses are restored and still valid. Since its use in fs/hfsplus is safe everywhere, it should be preferred. Therefore, replace kmap() with kmap_local_page() in fs/hfsplus. Where possible, use the suited standard helpers (memzero_page(), memcpy_page()) instead of open coding kmap_local_page() plus memset() or memcpy(). Fix a bug due to a page being not unmapped if the code jumps to the "fail_page" label (1/4). Tested in a QEMU/KVM x86_32 VM, 6GB RAM, booting a kernel with HIGHMEM64GB enabled. This patch (of 4): Several paths within hfs_btree_open() jump to the "fail_page" label where put_page() is called while the page is still mapped. Call kunmap() to unmap the page soon before put_page(). Link: https://lkml.kernel.org/r/20220809203105.26183-1-fmdefrancesco@gmail.com Link: https://lkml.kernel.org/r/20220809203105.26183-2-fmdefrancesco@gmail.com Signed-off-by: Fabio M. De Francesco Reviewed-by: Ira Weiny Reviewed-by: Viacheslav Dubeyko Cc: Matthew Wilcox Cc: Fabio M. De Francesco Cc: Jens Axboe Cc: Bart Van Assche Cc: Kees Cook Cc: Muchun Song Signed-off-by: Andrew Morton --- fs/hfsplus/btree.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index 66774f4cb4fd5..3a917a9a4edd5 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -245,6 +245,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) return tree; fail_page: + kunmap(page); put_page(page); free_inode: tree->inode->i_mapping->a_ops = &hfsplus_aops; From 6c3014a67a44f11dc1020c8b47a1d1d626f007a9 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Tue, 9 Aug 2022 22:31:03 +0200 Subject: [PATCH 02/81] hfsplus: convert kmap() to kmap_local_page() in bnode.c kmap() is being deprecated in favor of kmap_local_page(). Two main problems with kmap(): (1) It comes with an overhead as mapping space is restricted and protected by a global lock for synchronization and (2) it also requires global TLB invalidation when the kmap's pool wraps and it might block when the mapping space is fully utilized until a slot becomes available. With kmap_local_page() the mappings are per thread, CPU local, can take page faults, and can be called from any context (including interrupts). It is faster than kmap() in kernels with HIGHMEM enabled. Furthermore, the tasks can be preempted and, when they are scheduled to run again, the kernel virtual addresses are restored and still valid. Since its use in bnode.c is safe everywhere, it should be preferred. Therefore, replace kmap() with kmap_local_page() in bnode.c. Where possible, use the suited standard helpers (memzero_page(), memcpy_page()) instead of open coding kmap_local_page() plus memset() or memcpy(). Tested in a QEMU/KVM x86_32 VM, 6GB RAM, booting a kernel with HIGHMEM64GB enabled. Link: https://lkml.kernel.org/r/20220809203105.26183-3-fmdefrancesco@gmail.com Signed-off-by: Fabio M. De Francesco Suggested-by: Ira Weiny Reviewed-by: Ira Weiny Reviewed-by: Viacheslav Dubeyko Cc: Bart Van Assche Cc: Jens Axboe Cc: Kees Cook Cc: Matthew Wilcox Cc: Muchun Song Signed-off-by: Andrew Morton --- fs/hfsplus/bnode.c | 105 +++++++++++++++++++++------------------------ 1 file changed, 48 insertions(+), 57 deletions(-) diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index a5ab00e542203..87974d5e67915 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -29,14 +29,12 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len) off &= ~PAGE_MASK; l = min_t(int, len, PAGE_SIZE - off); - memcpy(buf, kmap(*pagep) + off, l); - kunmap(*pagep); + memcpy_from_page(buf, *pagep, off, l); while ((len -= l) != 0) { buf += l; l = min_t(int, len, PAGE_SIZE); - memcpy(buf, kmap(*++pagep), l); - kunmap(*pagep); + memcpy_from_page(buf, *++pagep, 0, l); } } @@ -82,16 +80,14 @@ void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len) off &= ~PAGE_MASK; l = min_t(int, len, PAGE_SIZE - off); - memcpy(kmap(*pagep) + off, buf, l); + memcpy_to_page(*pagep, off, buf, l); set_page_dirty(*pagep); - kunmap(*pagep); while ((len -= l) != 0) { buf += l; l = min_t(int, len, PAGE_SIZE); - memcpy(kmap(*++pagep), buf, l); + memcpy_to_page(*++pagep, 0, buf, l); set_page_dirty(*pagep); - kunmap(*pagep); } } @@ -112,15 +108,13 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len) off &= ~PAGE_MASK; l = min_t(int, len, PAGE_SIZE - off); - memset(kmap(*pagep) + off, 0, l); + memzero_page(*pagep, off, l); set_page_dirty(*pagep); - kunmap(*pagep); while ((len -= l) != 0) { l = min_t(int, len, PAGE_SIZE); - memset(kmap(*++pagep), 0, l); + memzero_page(*++pagep, 0, l); set_page_dirty(*pagep); - kunmap(*pagep); } } @@ -142,24 +136,20 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, if (src == dst) { l = min_t(int, len, PAGE_SIZE - src); - memcpy(kmap(*dst_page) + src, kmap(*src_page) + src, l); - kunmap(*src_page); + memcpy_page(*dst_page, src, *src_page, src, l); set_page_dirty(*dst_page); - kunmap(*dst_page); while ((len -= l) != 0) { l = min_t(int, len, PAGE_SIZE); - memcpy(kmap(*++dst_page), kmap(*++src_page), l); - kunmap(*src_page); + memcpy_page(*++dst_page, 0, *++src_page, 0, l); set_page_dirty(*dst_page); - kunmap(*dst_page); } } else { void *src_ptr, *dst_ptr; do { - src_ptr = kmap(*src_page) + src; - dst_ptr = kmap(*dst_page) + dst; + dst_ptr = kmap_local_page(*dst_page) + dst; + src_ptr = kmap_local_page(*src_page) + src; if (PAGE_SIZE - src < PAGE_SIZE - dst) { l = PAGE_SIZE - src; src = 0; @@ -171,9 +161,9 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, } l = min(len, l); memcpy(dst_ptr, src_ptr, l); - kunmap(*src_page); + kunmap_local(src_ptr); set_page_dirty(*dst_page); - kunmap(*dst_page); + kunmap_local(dst_ptr); if (!dst) dst_page++; else @@ -185,6 +175,7 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) { struct page **src_page, **dst_page; + void *src_ptr, *dst_ptr; int l; hfs_dbg(BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len); @@ -202,27 +193,28 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) if (src == dst) { while (src < len) { - memmove(kmap(*dst_page), kmap(*src_page), src); - kunmap(*src_page); + dst_ptr = kmap_local_page(*dst_page); + src_ptr = kmap_local_page(*src_page); + memmove(dst_ptr, src_ptr, src); + kunmap_local(src_ptr); set_page_dirty(*dst_page); - kunmap(*dst_page); + kunmap_local(dst_ptr); len -= src; src = PAGE_SIZE; src_page--; dst_page--; } src -= len; - memmove(kmap(*dst_page) + src, - kmap(*src_page) + src, len); - kunmap(*src_page); + dst_ptr = kmap_local_page(*dst_page); + src_ptr = kmap_local_page(*src_page); + memmove(dst_ptr + src, src_ptr + src, len); + kunmap_local(src_ptr); set_page_dirty(*dst_page); - kunmap(*dst_page); + kunmap_local(dst_ptr); } else { - void *src_ptr, *dst_ptr; - do { - src_ptr = kmap(*src_page) + src; - dst_ptr = kmap(*dst_page) + dst; + dst_ptr = kmap_local_page(*dst_page) + dst; + src_ptr = kmap_local_page(*src_page) + src; if (src < dst) { l = src; src = PAGE_SIZE; @@ -234,9 +226,9 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) } l = min(len, l); memmove(dst_ptr - l, src_ptr - l, l); - kunmap(*src_page); + kunmap_local(src_ptr); set_page_dirty(*dst_page); - kunmap(*dst_page); + kunmap_local(dst_ptr); if (dst == PAGE_SIZE) dst_page--; else @@ -251,26 +243,27 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) if (src == dst) { l = min_t(int, len, PAGE_SIZE - src); - memmove(kmap(*dst_page) + src, - kmap(*src_page) + src, l); - kunmap(*src_page); + + dst_ptr = kmap_local_page(*dst_page) + src; + src_ptr = kmap_local_page(*src_page) + src; + memmove(dst_ptr, src_ptr, l); + kunmap_local(src_ptr); set_page_dirty(*dst_page); - kunmap(*dst_page); + kunmap_local(dst_ptr); while ((len -= l) != 0) { l = min_t(int, len, PAGE_SIZE); - memmove(kmap(*++dst_page), - kmap(*++src_page), l); - kunmap(*src_page); + dst_ptr = kmap_local_page(*++dst_page); + src_ptr = kmap_local_page(*++src_page); + memmove(dst_ptr, src_ptr, l); + kunmap_local(src_ptr); set_page_dirty(*dst_page); - kunmap(*dst_page); + kunmap_local(dst_ptr); } } else { - void *src_ptr, *dst_ptr; - do { - src_ptr = kmap(*src_page) + src; - dst_ptr = kmap(*dst_page) + dst; + dst_ptr = kmap_local_page(*dst_page) + dst; + src_ptr = kmap_local_page(*src_page) + src; if (PAGE_SIZE - src < PAGE_SIZE - dst) { l = PAGE_SIZE - src; @@ -283,9 +276,9 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) } l = min(len, l); memmove(dst_ptr, src_ptr, l); - kunmap(*src_page); + kunmap_local(src_ptr); set_page_dirty(*dst_page); - kunmap(*dst_page); + kunmap_local(dst_ptr); if (!dst) dst_page++; else @@ -498,14 +491,14 @@ struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num) if (!test_bit(HFS_BNODE_NEW, &node->flags)) return node; - desc = (struct hfs_bnode_desc *)(kmap(node->page[0]) + - node->page_offset); + desc = (struct hfs_bnode_desc *)(kmap_local_page(node->page[0]) + + node->page_offset); node->prev = be32_to_cpu(desc->prev); node->next = be32_to_cpu(desc->next); node->num_recs = be16_to_cpu(desc->num_recs); node->type = desc->type; node->height = desc->height; - kunmap(node->page[0]); + kunmap_local(desc); switch (node->type) { case HFS_NODE_HEADER: @@ -589,14 +582,12 @@ struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num) } pagep = node->page; - memset(kmap(*pagep) + node->page_offset, 0, - min_t(int, PAGE_SIZE, tree->node_size)); + memzero_page(*pagep, node->page_offset, + min_t(int, PAGE_SIZE, tree->node_size)); set_page_dirty(*pagep); - kunmap(*pagep); for (i = 1; i < tree->pages_per_bnode; i++) { - memset(kmap(*++pagep), 0, PAGE_SIZE); + memzero_page(*++pagep, 0, PAGE_SIZE); set_page_dirty(*pagep); - kunmap(*pagep); } clear_bit(HFS_BNODE_NEW, &node->flags); wake_up(&node->lock_wq); From f9ef3b95a305874de0b36b7294f6cc8a0e07951e Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Tue, 9 Aug 2022 22:31:04 +0200 Subject: [PATCH 03/81] hfsplus: convert kmap() to kmap_local_page() in bitmap.c kmap() is being deprecated in favor of kmap_local_page(). There are two main problems with kmap(): (1) It comes with an overhead as mapping space is restricted and protected by a global lock for synchronization and (2) it also requires global TLB invalidation when the kmap's pool wraps and it might block when the mapping space is fully utilized until a slot becomes available. With kmap_local_page() the mappings are per thread, CPU local, can take page faults, and can be called from any context (including interrupts). It is faster than kmap() in kernels with HIGHMEM enabled. Furthermore, the tasks can be preempted and, when they are scheduled to run again, the kernel virtual addresses are restored and are still valid. Since its use in bitmap.c is safe everywhere, it should be preferred. Therefore, replace kmap() with kmap_local_page() in bitmap.c. Tested in a QEMU/KVM x86_32 VM, 6GB RAM, booting a kernel with HIGHMEM64GB enabled. Link: https://lkml.kernel.org/r/20220809203105.26183-4-fmdefrancesco@gmail.com Signed-off-by: Fabio M. De Francesco Suggested-by: Ira Weiny Reviewed-by: Ira Weiny Reviewed-by: Viacheslav Dubeyko Cc: Bart Van Assche Cc: Jens Axboe Cc: Kees Cook Cc: Matthew Wilcox Cc: Muchun Song Signed-off-by: Andrew Morton --- fs/hfsplus/bitmap.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c index cebce0cfe3405..bd8dcea855880 100644 --- a/fs/hfsplus/bitmap.c +++ b/fs/hfsplus/bitmap.c @@ -39,7 +39,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, start = size; goto out; } - pptr = kmap(page); + pptr = kmap_local_page(page); curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32; i = offset % 32; offset &= ~(PAGE_CACHE_BITS - 1); @@ -74,7 +74,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, } curr++; } - kunmap(page); + kunmap_local(pptr); offset += PAGE_CACHE_BITS; if (offset >= size) break; @@ -84,7 +84,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, start = size; goto out; } - curr = pptr = kmap(page); + curr = pptr = kmap_local_page(page); if ((size ^ offset) / PAGE_CACHE_BITS) end = pptr + PAGE_CACHE_BITS / 32; else @@ -127,7 +127,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, len -= 32; } set_page_dirty(page); - kunmap(page); + kunmap_local(pptr); offset += PAGE_CACHE_BITS; page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL); @@ -135,7 +135,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, start = size; goto out; } - pptr = kmap(page); + pptr = kmap_local_page(page); curr = pptr; end = pptr + PAGE_CACHE_BITS / 32; } @@ -151,7 +151,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, done: *curr = cpu_to_be32(n); set_page_dirty(page); - kunmap(page); + kunmap_local(pptr); *max = offset + (curr - pptr) * 32 + i - start; sbi->free_blocks -= *max; hfsplus_mark_mdb_dirty(sb); @@ -185,7 +185,7 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count) page = read_mapping_page(mapping, pnr, NULL); if (IS_ERR(page)) goto kaboom; - pptr = kmap(page); + pptr = kmap_local_page(page); curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32; end = pptr + PAGE_CACHE_BITS / 32; len = count; @@ -215,11 +215,11 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count) if (!count) break; set_page_dirty(page); - kunmap(page); + kunmap_local(pptr); page = read_mapping_page(mapping, ++pnr, NULL); if (IS_ERR(page)) goto kaboom; - pptr = kmap(page); + pptr = kmap_local_page(page); curr = pptr; end = pptr + PAGE_CACHE_BITS / 32; } @@ -231,7 +231,7 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count) } out: set_page_dirty(page); - kunmap(page); + kunmap_local(pptr); sbi->free_blocks += len; hfsplus_mark_mdb_dirty(sb); mutex_unlock(&sbi->alloc_mutex); From 9f25f357c557bfea39a2d6a629a6317d2b3dfc64 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Tue, 9 Aug 2022 22:31:05 +0200 Subject: [PATCH 04/81] hfsplus: convert kmap() to kmap_local_page() in btree.c kmap() is being deprecated in favor of kmap_local_page(). There are two main problems with kmap(): (1) It comes with an overhead as mapping space is restricted and protected by a global lock for synchronization and (2) it also requires global TLB invalidation when the kmap's pool wraps and it might block when the mapping space is fully utilized until a slot becomes available. With kmap_local_page() the mappings are per thread, CPU local, can take page faults, and can be called from any context (including interrupts). It is faster than kmap() in kernels with HIGHMEM enabled. Furthermore, the tasks can be preempted and, when they are scheduled to run again, the kernel virtual addresses are restored and are still valid. Since its use in btree.c is safe everywhere, it should be preferred. Therefore, replace kmap() with kmap_local_page() in btree.c. Tested in a QEMU/KVM x86_32 VM, 6GB RAM, booting a kernel with HIGHMEM64GB enabled. Link: https://lkml.kernel.org/r/20220809203105.26183-5-fmdefrancesco@gmail.com Signed-off-by: Fabio M. De Francesco Suggested-by: Ira Weiny Reviewed-by: Ira Weiny Reviewed-by: Viacheslav Dubeyko Cc: Bart Van Assche Cc: Jens Axboe Cc: Kees Cook Cc: Matthew Wilcox Cc: Muchun Song Signed-off-by: Andrew Morton --- fs/hfsplus/btree.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index 3a917a9a4edd5..9e1732a2b92a8 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -163,7 +163,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) goto free_inode; /* Load the header */ - head = (struct hfs_btree_header_rec *)(kmap(page) + + head = (struct hfs_btree_header_rec *)(kmap_local_page(page) + sizeof(struct hfs_bnode_desc)); tree->root = be32_to_cpu(head->root); tree->leaf_count = be32_to_cpu(head->leaf_count); @@ -240,12 +240,12 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) (tree->node_size + PAGE_SIZE - 1) >> PAGE_SHIFT; - kunmap(page); + kunmap_local(head); put_page(page); return tree; fail_page: - kunmap(page); + kunmap_local(head); put_page(page); free_inode: tree->inode->i_mapping->a_ops = &hfsplus_aops; @@ -292,7 +292,7 @@ int hfs_btree_write(struct hfs_btree *tree) return -EIO; /* Load the header */ page = node->page[0]; - head = (struct hfs_btree_header_rec *)(kmap(page) + + head = (struct hfs_btree_header_rec *)(kmap_local_page(page) + sizeof(struct hfs_bnode_desc)); head->root = cpu_to_be32(tree->root); @@ -304,7 +304,7 @@ int hfs_btree_write(struct hfs_btree *tree) head->attributes = cpu_to_be32(tree->attributes); head->depth = cpu_to_be16(tree->depth); - kunmap(page); + kunmap_local(head); set_page_dirty(page); hfs_bnode_put(node); return 0; @@ -395,7 +395,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) off += node->page_offset; pagep = node->page + (off >> PAGE_SHIFT); - data = kmap(*pagep); + data = kmap_local_page(*pagep); off &= ~PAGE_MASK; idx = 0; @@ -408,7 +408,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) idx += i; data[off] |= m; set_page_dirty(*pagep); - kunmap(*pagep); + kunmap_local(data); tree->free_nodes--; mark_inode_dirty(tree->inode); hfs_bnode_put(node); @@ -418,14 +418,14 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) } } if (++off >= PAGE_SIZE) { - kunmap(*pagep); - data = kmap(*++pagep); + kunmap_local(data); + data = kmap_local_page(*++pagep); off = 0; } idx += 8; len--; } - kunmap(*pagep); + kunmap_local(data); nidx = node->next; if (!nidx) { hfs_dbg(BNODE_MOD, "create new bmap node\n"); @@ -441,7 +441,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) off = off16; off += node->page_offset; pagep = node->page + (off >> PAGE_SHIFT); - data = kmap(*pagep); + data = kmap_local_page(*pagep); off &= ~PAGE_MASK; } } @@ -491,7 +491,7 @@ void hfs_bmap_free(struct hfs_bnode *node) } off += node->page_offset + nidx / 8; page = node->page[off >> PAGE_SHIFT]; - data = kmap(page); + data = kmap_local_page(page); off &= ~PAGE_MASK; m = 1 << (~nidx & 7); byte = data[off]; @@ -499,13 +499,13 @@ void hfs_bmap_free(struct hfs_bnode *node) pr_crit("trying to free free bnode " "%u(%d)\n", node->this, node->type); - kunmap(page); + kunmap_local(data); hfs_bnode_put(node); return; } data[off] = byte & ~m; set_page_dirty(page); - kunmap(page); + kunmap_local(data); hfs_bnode_put(node); tree->free_nodes++; mark_inode_dirty(tree->inode); From 765f2bf04fdaced4e7d7e94cfc3f743048629f31 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 8 Aug 2022 10:59:28 +0200 Subject: [PATCH 05/81] scripts/decodecode: improve faulting line determination There are cases where the IP pointer in a Code: line in an oops doesn't point at the beginning of an instruction: Code: 0f bd c2 e9 a0 cd b5 e4 48 0f bd c2 e9 97 cd b5 e4 0f 1f 80 00 00 00 00 \ e9 8b cd b5 e4 0f 1f 00 66 0f a3 d0 e9 7f cd b5 e4 0f 1f <80> 00 00 00 \ 00 0f a3 d0 e9 70 cd b5 e4 48 0f a3 d0 e9 67 cd b5 e9 7f cd b5 e4 jmp 0xffffffffe4b5cda8 0f 1f 80 00 00 00 00 nopl 0x0(%rax) ^^ and the current way of determining the faulting instruction line doesn't work because disassembled instructions are counted from the IP byte to the end and when that thing points in the middle, the trailing bytes can be interpreted as different insns: Code starting with the faulting instruction =========================================== 0: 80 00 00 addb $0x0,(%rax) 3: 00 00 add %al,(%rax) whereas, this is part of 0f 1f 80 00 00 00 00 nopl 0x0(%rax) 5: 0f a3 d0 bt %edx,%eax ... leading to: 1d: 0f 1f 00 nopl (%rax) 20: 66 0f a3 d0 bt %dx,%ax 24:* e9 7f cd b5 e4 jmp 0xffffffffe4b5cda8 <-- trapping instruction 29: 0f 1f 80 00 00 00 00 nopl 0x0(%rax) 30: 0f a3 d0 bt %edx,%eax which is the wrong faulting instruction. Change the way the faulting line number is determined by matching the opcode bytes from the beginning, leading to correct output: 1d: 0f 1f 00 nopl (%rax) 20: 66 0f a3 d0 bt %dx,%ax 24: e9 7f cd b5 e4 jmp 0xffffffffe4b5cda8 29:* 0f 1f 80 00 00 00 00 nopl 0x0(%rax) <-- trapping instruction 30: 0f a3 d0 bt %edx,%eax While at it, make decodecode use bash as the interpreter - that thing should be present on everything by now. It simplifies the code a lot too. Link: https://lkml.kernel.org/r/20220808085928.29840-1-bp@alien8.de Signed-off-by: Borislav Petkov Cc: Marc Zyngier Cc: Will Deacon Signed-off-by: Andrew Morton --- scripts/decodecode | 120 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 105 insertions(+), 15 deletions(-) diff --git a/scripts/decodecode b/scripts/decodecode index c711a196511c6..b28fd26865617 100755 --- a/scripts/decodecode +++ b/scripts/decodecode @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # SPDX-License-Identifier: GPL-2.0 # Disassemble the Code: line in Linux oopses # usage: decodecode < oops.file @@ -8,6 +8,8 @@ # AFLAGS=--32 decodecode < 386.oops # PC=hex - the PC (program counter) the oops points to +faultlinenum=1 + cleanup() { rm -f $T $T.s $T.o $T.oo $T.aa $T.dis exit 1 @@ -102,28 +104,125 @@ disas() { grep -v "/tmp\|Disassembly\|\.text\|^$" > $t.dis 2>&1 } +# Match the maximum number of opcode bytes from @op_bytes contained within +# @opline +# +# Params: +# @op_bytes: The string of bytes from the Code: line +# @opline: The disassembled line coming from objdump +# +# Returns: +# The max number of opcode bytes from the beginning of @op_bytes which match +# the opcode bytes in the objdump line. +get_substr_opcode_bytes_num() +{ + local op_bytes=$1 + local opline=$2 + + local retval=0 + substr="" + + for opc in $op_bytes; + do + substr+="$opc" + + # return if opcode bytes do not match @opline anymore + if ! echo $opline | grep -q "$substr"; + then + break + fi + + # add trailing space + substr+=" " + retval=$((retval+1)) + done + + return $retval +} + +# Return the line number in objdump output to where the IP marker in the Code: +# line points to +# +# Params: +# @all_code: code in bytes without the marker +# @dis_file: disassembled file +# @ip_byte: The byte to which the IP points to +get_faultlinenum() +{ + local all_code="$1" + local dis_file="$2" + + # num bytes including IP byte + local num_bytes_ip=$(( $3 + 1 * $width )) + + # Add the two header lines (we're counting from 1). + local retval=3 + + # remove marker + all_code=$(echo $all_code | sed -e 's/[<>()]//g') + + while read line + do + get_substr_opcode_bytes_num "$all_code" "$line" + ate_opcodes=$? + + if ! (( $ate_opcodes )); then + continue + fi + + num_bytes_ip=$((num_bytes_ip - ($ate_opcodes * $width) )) + if (( $num_bytes_ip <= 0 )); then + break + fi + + # Delete matched opcode bytes from all_code. For that, compute + # how many chars those opcodes are represented by and include + # trailing space. + # + # a byte is 2 chars, ate_opcodes is also the number of trailing + # spaces + del_chars=$(( ($ate_opcodes * $width * 2) + $ate_opcodes )) + + all_code=$(echo $all_code | sed -e "s!^.\{$del_chars\}!!") + + let "retval+=1" + + done < $dis_file + + return $retval +} + marker=`expr index "$code" "\<"` if [ $marker -eq 0 ]; then marker=`expr index "$code" "\("` fi - touch $T.oo if [ $marker -ne 0 ]; then - # 2 opcode bytes and a single space - pc_sub=$(( $marker / 3 )) + # How many bytes to subtract from the program counter + # in order to get to the beginning virtual address of the + # Code: + pc_sub=$(( (($marker - 1) / (2 * $width + 1)) * $width )) echo All code >> $T.oo echo ======== >> $T.oo beforemark=`echo "$code"` echo -n " .$type 0x" > $T.s + echo $beforemark | sed -e 's/ /,0x/g; s/[<>()]//g' >> $T.s + disas $T $pc_sub + cat $T.dis >> $T.oo - rm -f $T.o $T.s $T.dis -# and fix code at-and-after marker + get_faultlinenum "$code" "$T.dis" $pc_sub + faultlinenum=$? + + # and fix code at-and-after marker code=`echo "$code" | cut -c$((${marker} + 1))-` + + rm -f $T.o $T.s $T.dis fi + echo Code starting with the faulting instruction > $T.aa echo =========================================== >> $T.aa code=`echo $code | sed -e 's/\r//;s/ [<(]/ /;s/[>)] / /;s/ /,0x/g; s/[>)]$//'` @@ -132,15 +231,6 @@ echo $code >> $T.s disas $T 0 cat $T.dis >> $T.aa -# (lines of whole $T.oo) - (lines of $T.aa, i.e. "Code starting") + 3, -# i.e. the title + the "===..=" line (sed is counting from 1, 0 address is -# special) -faultlinenum=$(( $(wc -l $T.oo | cut -d" " -f1) - \ - $(wc -l $T.aa | cut -d" " -f1) + 3)) - -faultline=`cat $T.dis | head -1 | cut -d":" -f2-` -faultline=`echo "$faultline" | sed -e 's/\[/\\\[/g; s/\]/\\\]/g'` - cat $T.oo | sed -e "${faultlinenum}s/^\([^:]*:\)\(.*\)/\1\*\2\t\t<-- trapping instruction/" echo cat $T.aa From 58b5c203360799e181325f3f8ce212de80ebf304 Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Fri, 5 Aug 2022 13:57:33 +0200 Subject: [PATCH 06/81] ipc/util.c: cleanup and improve sysvipc_find_ipc() sysvipc_find_ipc() can be simplified further: - It uses a for() loop to locate the next entry in the idr. This can be replaced with idr_get_next(). - It receives two parameters (pos - which is actually an idr index and not a position, and new_pos, which is really a position). One parameter is sufficient. Link: https://lore.kernel.org/all/20210903052020.3265-3-manfred@colorfullife.com/ Link: https://lkml.kernel.org/r/20220805115733.104763-1-manfred@colorfullife.com Signed-off-by: Manfred Spraul Acked-by: Davidlohr Bueso Acked-by: Waiman Long Cc: "Eric W . Biederman" Cc: <1vier1@web.de> Signed-off-by: Andrew Morton --- ipc/util.c | 53 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/ipc/util.c b/ipc/util.c index a2208d0f26b2d..05cb9de667350 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -782,28 +782,37 @@ struct pid_namespace *ipc_seq_pid_ns(struct seq_file *s) return iter->pid_ns; } -/* - * This routine locks the ipc structure found at least at position pos. +/** + * sysvipc_find_ipc - Find and lock the ipc structure based on seq pos + * @ids: ipc identifier set + * @pos: expected position + * + * The function finds an ipc structure, based on the sequence file + * position @pos. If there is no ipc structure at position @pos, then + * the successor is selected. + * If a structure is found, then it is locked (both rcu_read_lock() and + * ipc_lock_object()) and @pos is set to the position needed to locate + * the found ipc structure. + * If nothing is found (i.e. EOF), @pos is not modified. + * + * The function returns the found ipc structure, or NULL at EOF. */ -static struct kern_ipc_perm *sysvipc_find_ipc(struct ipc_ids *ids, loff_t pos, - loff_t *new_pos) +static struct kern_ipc_perm *sysvipc_find_ipc(struct ipc_ids *ids, loff_t *pos) { - struct kern_ipc_perm *ipc = NULL; - int max_idx = ipc_get_maxidx(ids); + int tmpidx; + struct kern_ipc_perm *ipc; - if (max_idx == -1 || pos > max_idx) - goto out; + /* convert from position to idr index -> "-1" */ + tmpidx = *pos - 1; - for (; pos <= max_idx; pos++) { - ipc = idr_find(&ids->ipcs_idr, pos); - if (ipc != NULL) { - rcu_read_lock(); - ipc_lock_object(ipc); - break; - } + ipc = idr_get_next(&ids->ipcs_idr, &tmpidx); + if (ipc != NULL) { + rcu_read_lock(); + ipc_lock_object(ipc); + + /* convert from idr index to position -> "+1" */ + *pos = tmpidx + 1; } -out: - *new_pos = pos + 1; return ipc; } @@ -817,11 +826,13 @@ static void *sysvipc_proc_next(struct seq_file *s, void *it, loff_t *pos) if (ipc && ipc != SEQ_START_TOKEN) ipc_unlock(ipc); - return sysvipc_find_ipc(&iter->ns->ids[iface->ids], *pos, pos); + /* Next -> search for *pos+1 */ + (*pos)++; + return sysvipc_find_ipc(&iter->ns->ids[iface->ids], pos); } /* - * File positions: pos 0 -> header, pos n -> ipc id = n - 1. + * File positions: pos 0 -> header, pos n -> ipc idx = n - 1. * SeqFile iterator: iterator value locked ipc pointer or SEQ_TOKEN_START. */ static void *sysvipc_proc_start(struct seq_file *s, loff_t *pos) @@ -846,8 +857,8 @@ static void *sysvipc_proc_start(struct seq_file *s, loff_t *pos) if (*pos == 0) return SEQ_START_TOKEN; - /* Find the (pos-1)th ipc */ - return sysvipc_find_ipc(ids, *pos - 1, pos); + /* Otherwise return the correct ipc structure */ + return sysvipc_find_ipc(ids, pos); } static void sysvipc_proc_stop(struct seq_file *s, void *it) From 64367f2e4f11cfdd983c637d219fb364ab85558c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 11 Aug 2022 13:44:34 +0200 Subject: [PATCH 07/81] treewide: defconfig: address renamed CONFIG_DEBUG_INFO=y CONFIG_DEBUG_INFO is now implicitly selected if one picks one of the explicit options that could be DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT, DEBUG_INFO_DWARF4, DEBUG_INFO_DWARF5. This was actually not what I had in mind when I suggested making it a 'choice' statement, but it's too late to change again now, and the Kconfig logic is more sensible in the new form. Change any defconfig file that had CONFIG_DEBUG_INFO enabled but did not pick DWARF4 or DWARF5 explicitly to now pick the toolchain default. Link: https://lkml.kernel.org/r/20220811114609.2097335-1-arnd@kernel.org Fixes: f9b3cd245784 ("Kconfig.debug: make DEBUG_INFO selectable from a choice") Signed-off-by: Arnd Bergmann Reviewed-by: Kees Cook Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Vineet Gupta Cc: Michal Simek Cc: Thomas Bogendoerfer Cc: Dinh Nguyen Cc: Yoshinori Sato Cc: Rich Felker Cc: Richard Weinberger Cc: Anton Ivanov Cc: Johannes Berg Cc: Chris Zankel Cc: Max Filippov Cc: Arnd Bergmann Signed-off-by: Andrew Morton --- arch/alpha/configs/defconfig | 2 +- arch/arc/configs/tb10x_defconfig | 2 +- arch/microblaze/configs/mmu_defconfig | 2 +- arch/mips/configs/bcm47xx_defconfig | 2 +- arch/mips/configs/cavium_octeon_defconfig | 2 +- arch/mips/configs/ci20_defconfig | 2 +- arch/mips/configs/cu1000-neo_defconfig | 2 +- arch/mips/configs/cu1830-neo_defconfig | 2 +- arch/mips/configs/generic_defconfig | 2 +- arch/mips/configs/omega2p_defconfig | 2 +- arch/mips/configs/qi_lb60_defconfig | 2 +- arch/mips/configs/vocore2_defconfig | 2 +- arch/nios2/configs/10m50_defconfig | 2 +- arch/nios2/configs/3c120_defconfig | 2 +- arch/sh/configs/apsh4a3a_defconfig | 2 +- arch/sh/configs/apsh4ad0a_defconfig | 2 +- arch/sh/configs/edosk7760_defconfig | 2 +- arch/sh/configs/magicpanelr2_defconfig | 2 +- arch/sh/configs/polaris_defconfig | 2 +- arch/sh/configs/r7780mp_defconfig | 2 +- arch/sh/configs/r7785rp_defconfig | 2 +- arch/sh/configs/rsk7203_defconfig | 2 +- arch/sh/configs/sdk7780_defconfig | 2 +- arch/sh/configs/se7712_defconfig | 2 +- arch/sh/configs/se7721_defconfig | 2 +- arch/sh/configs/sh2007_defconfig | 2 +- arch/sh/configs/sh7757lcr_defconfig | 2 +- arch/sh/configs/sh7785lcr_32bit_defconfig | 2 +- arch/sh/configs/urquell_defconfig | 2 +- arch/um/configs/i386_defconfig | 2 +- arch/um/configs/x86_64_defconfig | 2 +- arch/xtensa/configs/audio_kc705_defconfig | 2 +- arch/xtensa/configs/cadence_csp_defconfig | 2 +- arch/xtensa/configs/generic_kc705_defconfig | 2 +- arch/xtensa/configs/nommu_kc705_defconfig | 2 +- arch/xtensa/configs/smp_lx200_defconfig | 2 +- arch/xtensa/configs/virt_defconfig | 2 +- arch/xtensa/configs/xip_kc705_defconfig | 2 +- 38 files changed, 38 insertions(+), 38 deletions(-) diff --git a/arch/alpha/configs/defconfig b/arch/alpha/configs/defconfig index 7e93369308800..6a39fe8ce9e5f 100644 --- a/arch/alpha/configs/defconfig +++ b/arch/alpha/configs/defconfig @@ -65,7 +65,7 @@ CONFIG_NFSD=m CONFIG_NLS_CODEPAGE_437=y CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_KERNEL=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_ALPHA_LEGACY_START_ADDRESS=y CONFIG_MATHEMU=y CONFIG_CRYPTO_HMAC=y diff --git a/arch/arc/configs/tb10x_defconfig b/arch/arc/configs/tb10x_defconfig index d93b65008d4af..4a94d1684ed60 100644 --- a/arch/arc/configs/tb10x_defconfig +++ b/arch/arc/configs/tb10x_defconfig @@ -90,7 +90,7 @@ CONFIG_TMPFS=y CONFIG_CONFIGFS_FS=y # CONFIG_MISC_FILESYSTEMS is not set # CONFIG_NETWORK_FILESYSTEMS is not set -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_STRIP_ASM_SYMS=y CONFIG_DEBUG_FS=y CONFIG_HEADERS_INSTALL=y diff --git a/arch/microblaze/configs/mmu_defconfig b/arch/microblaze/configs/mmu_defconfig index 51337fffb9473..8150daf04a76c 100644 --- a/arch/microblaze/configs/mmu_defconfig +++ b/arch/microblaze/configs/mmu_defconfig @@ -83,7 +83,7 @@ CONFIG_CIFS=y CONFIG_CIFS_STATS2=y CONFIG_ENCRYPTED_KEYS=y CONFIG_DMA_CMA=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_KGDB=y CONFIG_KGDB_TESTS=y CONFIG_KGDB_KDB=y diff --git a/arch/mips/configs/bcm47xx_defconfig b/arch/mips/configs/bcm47xx_defconfig index 91ce75edbfb46..22ffde722bb9e 100644 --- a/arch/mips/configs/bcm47xx_defconfig +++ b/arch/mips/configs/bcm47xx_defconfig @@ -72,7 +72,7 @@ CONFIG_LEDS_TRIGGER_TIMER=y CONFIG_LEDS_TRIGGER_DEFAULT_ON=y CONFIG_CRC32_SARWATE=y CONFIG_PRINTK_TIME=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_DEBUG_INFO_REDUCED=y CONFIG_STRIP_ASM_SYMS=y CONFIG_DEBUG_FS=y diff --git a/arch/mips/configs/cavium_octeon_defconfig b/arch/mips/configs/cavium_octeon_defconfig index 97ceaf080c0c7..7f021a327566b 100644 --- a/arch/mips/configs/cavium_octeon_defconfig +++ b/arch/mips/configs/cavium_octeon_defconfig @@ -162,7 +162,7 @@ CONFIG_CRYPTO_SHA1_OCTEON=m CONFIG_CRYPTO_SHA256_OCTEON=m CONFIG_CRYPTO_SHA512_OCTEON=m CONFIG_CRYPTO_DES=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_DEBUG_FS=y CONFIG_MAGIC_SYSRQ=y # CONFIG_SCHED_DEBUG is not set diff --git a/arch/mips/configs/ci20_defconfig b/arch/mips/configs/ci20_defconfig index cc69b215854ea..955b6ac581ab7 100644 --- a/arch/mips/configs/ci20_defconfig +++ b/arch/mips/configs/ci20_defconfig @@ -199,7 +199,7 @@ CONFIG_NLS_UTF8=y CONFIG_DMA_CMA=y CONFIG_CMA_SIZE_MBYTES=32 CONFIG_PRINTK_TIME=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_STRIP_ASM_SYMS=y CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_FS=y diff --git a/arch/mips/configs/cu1000-neo_defconfig b/arch/mips/configs/cu1000-neo_defconfig index 5bd55eb32fe55..1cbc9302e1d14 100644 --- a/arch/mips/configs/cu1000-neo_defconfig +++ b/arch/mips/configs/cu1000-neo_defconfig @@ -113,7 +113,7 @@ CONFIG_PRINTK_TIME=y CONFIG_CONSOLE_LOGLEVEL_DEFAULT=15 CONFIG_CONSOLE_LOGLEVEL_QUIET=15 CONFIG_MESSAGE_LOGLEVEL_DEFAULT=7 -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_STRIP_ASM_SYMS=y CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_FS=y diff --git a/arch/mips/configs/cu1830-neo_defconfig b/arch/mips/configs/cu1830-neo_defconfig index cc69688962e8c..a0f73f3cd6ce7 100644 --- a/arch/mips/configs/cu1830-neo_defconfig +++ b/arch/mips/configs/cu1830-neo_defconfig @@ -116,7 +116,7 @@ CONFIG_PRINTK_TIME=y CONFIG_CONSOLE_LOGLEVEL_DEFAULT=15 CONFIG_CONSOLE_LOGLEVEL_QUIET=15 CONFIG_MESSAGE_LOGLEVEL_DEFAULT=7 -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_STRIP_ASM_SYMS=y CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_FS=y diff --git a/arch/mips/configs/generic_defconfig b/arch/mips/configs/generic_defconfig index 714169e411cf0..bbc0d9b1c3983 100644 --- a/arch/mips/configs/generic_defconfig +++ b/arch/mips/configs/generic_defconfig @@ -83,7 +83,7 @@ CONFIG_ROOT_NFS=y # CONFIG_XZ_DEC_ARMTHUMB is not set # CONFIG_XZ_DEC_SPARC is not set CONFIG_PRINTK_TIME=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_DEBUG_INFO_REDUCED=y CONFIG_DEBUG_FS=y # CONFIG_SCHED_DEBUG is not set diff --git a/arch/mips/configs/omega2p_defconfig b/arch/mips/configs/omega2p_defconfig index fc39ddf610a9b..6a5cb2d6de6b0 100644 --- a/arch/mips/configs/omega2p_defconfig +++ b/arch/mips/configs/omega2p_defconfig @@ -116,7 +116,7 @@ CONFIG_CRYPTO_LZO=y CONFIG_CRC16=y CONFIG_XZ_DEC=y CONFIG_PRINTK_TIME=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_STRIP_ASM_SYMS=y CONFIG_DEBUG_FS=y CONFIG_MAGIC_SYSRQ=y diff --git a/arch/mips/configs/qi_lb60_defconfig b/arch/mips/configs/qi_lb60_defconfig index b4448d0876d57..7e5d9741bd5dd 100644 --- a/arch/mips/configs/qi_lb60_defconfig +++ b/arch/mips/configs/qi_lb60_defconfig @@ -166,7 +166,7 @@ CONFIG_NLS_UTF8=y CONFIG_FONTS=y CONFIG_FONT_SUN8x16=y CONFIG_PRINTK_TIME=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_STRIP_ASM_SYMS=y CONFIG_READABLE_ASM=y CONFIG_KGDB=y diff --git a/arch/mips/configs/vocore2_defconfig b/arch/mips/configs/vocore2_defconfig index a14f8ea5c3867..302cab9bd7bd3 100644 --- a/arch/mips/configs/vocore2_defconfig +++ b/arch/mips/configs/vocore2_defconfig @@ -116,7 +116,7 @@ CONFIG_CRYPTO_LZO=y CONFIG_CRC16=y CONFIG_XZ_DEC=y CONFIG_PRINTK_TIME=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_STRIP_ASM_SYMS=y CONFIG_DEBUG_FS=y CONFIG_MAGIC_SYSRQ=y diff --git a/arch/nios2/configs/10m50_defconfig b/arch/nios2/configs/10m50_defconfig index a7967b4cfb6ed..91c3fce4dc7fe 100644 --- a/arch/nios2/configs/10m50_defconfig +++ b/arch/nios2/configs/10m50_defconfig @@ -74,4 +74,4 @@ CONFIG_NFS_FS=y CONFIG_NFS_V3_ACL=y CONFIG_ROOT_NFS=y CONFIG_SUNRPC_DEBUG=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y diff --git a/arch/nios2/configs/3c120_defconfig b/arch/nios2/configs/3c120_defconfig index 423a0c40a1627..c42ad7e162a36 100644 --- a/arch/nios2/configs/3c120_defconfig +++ b/arch/nios2/configs/3c120_defconfig @@ -71,4 +71,4 @@ CONFIG_NFS_FS=y CONFIG_NFS_V3_ACL=y CONFIG_ROOT_NFS=y CONFIG_SUNRPC_DEBUG=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y diff --git a/arch/sh/configs/apsh4a3a_defconfig b/arch/sh/configs/apsh4a3a_defconfig index 530498f189904..99931a13a74da 100644 --- a/arch/sh/configs/apsh4a3a_defconfig +++ b/arch/sh/configs/apsh4a3a_defconfig @@ -85,7 +85,7 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y # CONFIG_DEBUG_PREEMPT is not set # CONFIG_DEBUG_BUGVERBOSE is not set -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y # CONFIG_FTRACE is not set # CONFIG_CRYPTO_ANSI_CPRNG is not set # CONFIG_CRYPTO_HW is not set diff --git a/arch/sh/configs/apsh4ad0a_defconfig b/arch/sh/configs/apsh4ad0a_defconfig index 6abd9bd701061..d9fb124bf015a 100644 --- a/arch/sh/configs/apsh4ad0a_defconfig +++ b/arch/sh/configs/apsh4ad0a_defconfig @@ -116,7 +116,7 @@ CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_KERNEL=y CONFIG_DEBUG_SHIRQ=y CONFIG_DETECT_HUNG_TASK=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_DEBUG_VM=y CONFIG_DWARF_UNWINDER=y # CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/sh/configs/edosk7760_defconfig b/arch/sh/configs/edosk7760_defconfig index d77f54e906fd0..f427a95bcd21e 100644 --- a/arch/sh/configs/edosk7760_defconfig +++ b/arch/sh/configs/edosk7760_defconfig @@ -107,7 +107,7 @@ CONFIG_DEBUG_SHIRQ=y CONFIG_DETECT_HUNG_TASK=y # CONFIG_SCHED_DEBUG is not set CONFIG_TIMER_STATS=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_CRYPTO=y CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_DES=y diff --git a/arch/sh/configs/magicpanelr2_defconfig b/arch/sh/configs/magicpanelr2_defconfig index 0989ed9295408..ef1d98e35c91f 100644 --- a/arch/sh/configs/magicpanelr2_defconfig +++ b/arch/sh/configs/magicpanelr2_defconfig @@ -84,7 +84,7 @@ CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_KERNEL=y # CONFIG_SCHED_DEBUG is not set CONFIG_DEBUG_KOBJECT=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_FRAME_POINTER=y CONFIG_CRC_CCITT=m CONFIG_CRC16=m diff --git a/arch/sh/configs/polaris_defconfig b/arch/sh/configs/polaris_defconfig index 246408ec7462b..f42e4867ddc1a 100644 --- a/arch/sh/configs/polaris_defconfig +++ b/arch/sh/configs/polaris_defconfig @@ -79,5 +79,5 @@ CONFIG_DETECT_HUNG_TASK=y CONFIG_DEBUG_RT_MUTEXES=y CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_DEBUG_SPINLOCK_SLEEP=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_DEBUG_SG=y diff --git a/arch/sh/configs/r7780mp_defconfig b/arch/sh/configs/r7780mp_defconfig index f823cc6b18f91..e527cd60a1910 100644 --- a/arch/sh/configs/r7780mp_defconfig +++ b/arch/sh/configs/r7780mp_defconfig @@ -101,7 +101,7 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y CONFIG_DETECT_HUNG_TASK=y # CONFIG_DEBUG_PREEMPT is not set -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_CRYPTO_ECB=m CONFIG_CRYPTO_PCBC=m CONFIG_CRYPTO_HMAC=y diff --git a/arch/sh/configs/r7785rp_defconfig b/arch/sh/configs/r7785rp_defconfig index f96bc20d4b1ac..a3f952a83d970 100644 --- a/arch/sh/configs/r7785rp_defconfig +++ b/arch/sh/configs/r7785rp_defconfig @@ -96,7 +96,7 @@ CONFIG_DEBUG_KERNEL=y # CONFIG_DEBUG_PREEMPT is not set CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_DEBUG_LOCKING_API_SELFTESTS=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_SH_STANDARD_BIOS=y CONFIG_DEBUG_STACK_USAGE=y CONFIG_4KSTACKS=y diff --git a/arch/sh/configs/rsk7203_defconfig b/arch/sh/configs/rsk7203_defconfig index 5a54e2b883f0a..d00fafc021e1a 100644 --- a/arch/sh/configs/rsk7203_defconfig +++ b/arch/sh/configs/rsk7203_defconfig @@ -112,7 +112,7 @@ CONFIG_DETECT_HUNG_TASK=y CONFIG_DEBUG_OBJECTS=y CONFIG_DEBUG_MUTEXES=y CONFIG_DEBUG_SPINLOCK_SLEEP=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_DEBUG_VM=y CONFIG_DEBUG_LIST=y CONFIG_DEBUG_SG=y diff --git a/arch/sh/configs/sdk7780_defconfig b/arch/sh/configs/sdk7780_defconfig index 7d6d323598480..41cb588ca99cb 100644 --- a/arch/sh/configs/sdk7780_defconfig +++ b/arch/sh/configs/sdk7780_defconfig @@ -131,7 +131,7 @@ CONFIG_DEBUG_KERNEL=y CONFIG_DETECT_HUNG_TASK=y # CONFIG_SCHED_DEBUG is not set CONFIG_TIMER_STATS=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_SH_STANDARD_BIOS=y CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_DES=y diff --git a/arch/sh/configs/se7712_defconfig b/arch/sh/configs/se7712_defconfig index ee6d28ae08deb..36356223d51c8 100644 --- a/arch/sh/configs/se7712_defconfig +++ b/arch/sh/configs/se7712_defconfig @@ -93,7 +93,7 @@ CONFIG_CRAMFS=y CONFIG_NFS_FS=y CONFIG_ROOT_NFS=y CONFIG_DEBUG_KERNEL=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_FRAME_POINTER=y CONFIG_CRYPTO_ECB=m CONFIG_CRYPTO_PCBC=m diff --git a/arch/sh/configs/se7721_defconfig b/arch/sh/configs/se7721_defconfig index bad921bc10f8f..46c5a263a2392 100644 --- a/arch/sh/configs/se7721_defconfig +++ b/arch/sh/configs/se7721_defconfig @@ -121,7 +121,7 @@ CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_CODEPAGE_932=y CONFIG_NLS_ISO8859_1=y CONFIG_DEBUG_KERNEL=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_FRAME_POINTER=y # CONFIG_CRYPTO_ANSI_CPRNG is not set CONFIG_CRC_CCITT=y diff --git a/arch/sh/configs/sh2007_defconfig b/arch/sh/configs/sh2007_defconfig index 79f02f1c0dc83..259c69e3fa227 100644 --- a/arch/sh/configs/sh2007_defconfig +++ b/arch/sh/configs/sh2007_defconfig @@ -159,7 +159,7 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y # CONFIG_DETECT_SOFTLOCKUP is not set # CONFIG_SCHED_DEBUG is not set -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_FRAME_POINTER=y CONFIG_SH_STANDARD_BIOS=y CONFIG_CRYPTO_NULL=y diff --git a/arch/sh/configs/sh7757lcr_defconfig b/arch/sh/configs/sh7757lcr_defconfig index a2700ab165afb..2579dc4bc0c8f 100644 --- a/arch/sh/configs/sh7757lcr_defconfig +++ b/arch/sh/configs/sh7757lcr_defconfig @@ -80,6 +80,6 @@ CONFIG_NLS_ISO8859_1=y CONFIG_DEBUG_KERNEL=y # CONFIG_SCHED_DEBUG is not set # CONFIG_DEBUG_BUGVERBOSE is not set -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y # CONFIG_FTRACE is not set # CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/sh/configs/sh7785lcr_32bit_defconfig b/arch/sh/configs/sh7785lcr_32bit_defconfig index 7eb3c10f28ad2..781ff13227fc9 100644 --- a/arch/sh/configs/sh7785lcr_32bit_defconfig +++ b/arch/sh/configs/sh7785lcr_32bit_defconfig @@ -141,7 +141,7 @@ CONFIG_DEBUG_KMEMLEAK=y CONFIG_DEBUG_SPINLOCK=y CONFIG_DEBUG_MUTEXES=y CONFIG_DEBUG_SPINLOCK_SLEEP=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_LATENCYTOP=y # CONFIG_FTRACE is not set CONFIG_CRYPTO_HMAC=y diff --git a/arch/sh/configs/urquell_defconfig b/arch/sh/configs/urquell_defconfig index cb2f56468fe02..ea773c764c5a8 100644 --- a/arch/sh/configs/urquell_defconfig +++ b/arch/sh/configs/urquell_defconfig @@ -139,7 +139,7 @@ CONFIG_PRINTK_TIME=y CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y CONFIG_DETECT_HUNG_TASK=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_FRAME_POINTER=y # CONFIG_FTRACE is not set # CONFIG_DUMP_CODE is not set diff --git a/arch/um/configs/i386_defconfig b/arch/um/configs/i386_defconfig index fb51bd206dbed..c0162286d68b7 100644 --- a/arch/um/configs/i386_defconfig +++ b/arch/um/configs/i386_defconfig @@ -69,5 +69,5 @@ CONFIG_JOLIET=y CONFIG_PROC_KCORE=y CONFIG_TMPFS=y CONFIG_NLS=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_DEBUG_KERNEL=y diff --git a/arch/um/configs/x86_64_defconfig b/arch/um/configs/x86_64_defconfig index 477b873174243..bec6e5d956873 100644 --- a/arch/um/configs/x86_64_defconfig +++ b/arch/um/configs/x86_64_defconfig @@ -67,6 +67,6 @@ CONFIG_JOLIET=y CONFIG_PROC_KCORE=y CONFIG_TMPFS=y CONFIG_NLS=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_FRAME_WARN=1024 CONFIG_DEBUG_KERNEL=y diff --git a/arch/xtensa/configs/audio_kc705_defconfig b/arch/xtensa/configs/audio_kc705_defconfig index 3be62da8089b8..ef0ebcfbccf91 100644 --- a/arch/xtensa/configs/audio_kc705_defconfig +++ b/arch/xtensa/configs/audio_kc705_defconfig @@ -120,7 +120,7 @@ CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y CONFIG_PRINTK_TIME=y CONFIG_DYNAMIC_DEBUG=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_MAGIC_SYSRQ=y CONFIG_LOCKUP_DETECTOR=y # CONFIG_SCHED_DEBUG is not set diff --git a/arch/xtensa/configs/cadence_csp_defconfig b/arch/xtensa/configs/cadence_csp_defconfig index fc240737b14de..2665962d247a7 100644 --- a/arch/xtensa/configs/cadence_csp_defconfig +++ b/arch/xtensa/configs/cadence_csp_defconfig @@ -100,7 +100,7 @@ CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y CONFIG_PRINTK_TIME=y CONFIG_DYNAMIC_DEBUG=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_MAGIC_SYSRQ=y CONFIG_LOCKUP_DETECTOR=y # CONFIG_SCHED_DEBUG is not set diff --git a/arch/xtensa/configs/generic_kc705_defconfig b/arch/xtensa/configs/generic_kc705_defconfig index e9d6b6f6eca11..236c7f23cc10a 100644 --- a/arch/xtensa/configs/generic_kc705_defconfig +++ b/arch/xtensa/configs/generic_kc705_defconfig @@ -107,7 +107,7 @@ CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y CONFIG_PRINTK_TIME=y CONFIG_DYNAMIC_DEBUG=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_MAGIC_SYSRQ=y CONFIG_LOCKUP_DETECTOR=y # CONFIG_SCHED_DEBUG is not set diff --git a/arch/xtensa/configs/nommu_kc705_defconfig b/arch/xtensa/configs/nommu_kc705_defconfig index fcb620ef37997..8263da9e078d7 100644 --- a/arch/xtensa/configs/nommu_kc705_defconfig +++ b/arch/xtensa/configs/nommu_kc705_defconfig @@ -105,7 +105,7 @@ CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y CONFIG_PRINTK_TIME=y CONFIG_DYNAMIC_DEBUG=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y # CONFIG_FRAME_POINTER is not set CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_VM=y diff --git a/arch/xtensa/configs/smp_lx200_defconfig b/arch/xtensa/configs/smp_lx200_defconfig index a47c85638ec11..7bdffa3a69c60 100644 --- a/arch/xtensa/configs/smp_lx200_defconfig +++ b/arch/xtensa/configs/smp_lx200_defconfig @@ -111,7 +111,7 @@ CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y CONFIG_PRINTK_TIME=y CONFIG_DYNAMIC_DEBUG=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_VM=y CONFIG_LOCKUP_DETECTOR=y diff --git a/arch/xtensa/configs/virt_defconfig b/arch/xtensa/configs/virt_defconfig index 6d1387dfa96fc..98acb7191cb77 100644 --- a/arch/xtensa/configs/virt_defconfig +++ b/arch/xtensa/configs/virt_defconfig @@ -97,7 +97,7 @@ CONFIG_CRYPTO_DEV_VIRTIO=y CONFIG_FONTS=y CONFIG_PRINTK_TIME=y CONFIG_DYNAMIC_DEBUG=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_MAGIC_SYSRQ=y # CONFIG_SCHED_DEBUG is not set CONFIG_SCHEDSTATS=y diff --git a/arch/xtensa/configs/xip_kc705_defconfig b/arch/xtensa/configs/xip_kc705_defconfig index 062148e171351..1c3cebaaa71ba 100644 --- a/arch/xtensa/configs/xip_kc705_defconfig +++ b/arch/xtensa/configs/xip_kc705_defconfig @@ -102,7 +102,7 @@ CONFIG_CRYPTO_LZO=y CONFIG_CRYPTO_ANSI_CPRNG=y CONFIG_PRINTK_TIME=y CONFIG_DYNAMIC_DEBUG=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_MAGIC_SYSRQ=y CONFIG_DETECT_HUNG_TASK=y # CONFIG_SCHED_DEBUG is not set From 5bb6ce3aeb02497668a4e8971268b041aa61de8d Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Mon, 1 Aug 2022 14:27:09 +0200 Subject: [PATCH 08/81] fs/isofs: replace kmap() with kmap_local_page() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The use of kmap() is being deprecated in favor of kmap_local_page(). There are two main problems with kmap(): (1) It comes with an overhead as mapping space is restricted and protected by a global lock for synchronization and (2) it also requires global TLB invalidation when the kmap's pool wraps and it might block when the mapping space is fully utilized until a slot becomes available. With kmap_local_page() the mappings are per thread, CPU local, can take page faults, and can be called from any context (including interrupts). Tasks can be preempted and, when scheduled to run again, the kernel virtual addresses are restored and still valid. It is faster than kmap() in kernels with HIGHMEM enabled. Since kmap_local_page() can be safely used in compress.c, it should be called everywhere instead of kmap(). Therefore, replace kmap() with kmap_local_page() in compress.c. Where it is needed, use memzero_page() instead of open coding kmap_local_page() plus memset() to fill the pages with zeros. Delete the redundant flush_dcache_page() in the two call sites of memzero_page(). Tested with mkisofs on a QEMU/KVM x86_32 VM, 6GB RAM, booting a kernel with HIGHMEM64GB enabled. Link: https://lkml.kernel.org/r/20220801122709.8164-1-fmdefrancesco@gmail.com Signed-off-by: Fabio M. De Francesco Suggested-by: Ira Weiny Reviewed-by: Jan Kara Cc: Matthew Wilcox (Oracle) Cc: Roman Gushchin Cc: Pali Rohár Cc: Muchun Song Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- fs/isofs/compress.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c index b466172eec25b..09285e82eb08c 100644 --- a/fs/isofs/compress.c +++ b/fs/isofs/compress.c @@ -67,8 +67,7 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, for ( i = 0 ; i < pcount ; i++ ) { if (!pages[i]) continue; - memset(page_address(pages[i]), 0, PAGE_SIZE); - flush_dcache_page(pages[i]); + memzero_page(pages[i], 0, PAGE_SIZE); SetPageUptodate(pages[i]); } return ((loff_t)pcount) << PAGE_SHIFT; @@ -120,7 +119,7 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, zerr != Z_STREAM_END) { if (!stream.avail_out) { if (pages[curpage]) { - stream.next_out = page_address(pages[curpage]) + stream.next_out = kmap_local_page(pages[curpage]) + poffset; stream.avail_out = PAGE_SIZE - poffset; poffset = 0; @@ -176,6 +175,10 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, flush_dcache_page(pages[curpage]); SetPageUptodate(pages[curpage]); } + if (stream.next_out != (unsigned char *)zisofs_sink_page) { + kunmap_local(stream.next_out); + stream.next_out = NULL; + } curpage++; } if (!stream.avail_in) @@ -183,6 +186,8 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, } inflate_out: zlib_inflateEnd(&stream); + if (stream.next_out && stream.next_out != (unsigned char *)zisofs_sink_page) + kunmap_local(stream.next_out); z_eio: mutex_unlock(&zisofs_zlib_lock); @@ -283,9 +288,7 @@ static int zisofs_fill_pages(struct inode *inode, int full_page, int pcount, } if (poffset && *pages) { - memset(page_address(*pages) + poffset, 0, - PAGE_SIZE - poffset); - flush_dcache_page(*pages); + memzero_page(*pages, poffset, PAGE_SIZE - poffset); SetPageUptodate(*pages); } return 0; @@ -343,10 +346,8 @@ static int zisofs_read_folio(struct file *file, struct folio *folio) for (i = 0; i < pcount; i++, index++) { if (i != full_page) pages[i] = grab_cache_page_nowait(mapping, index); - if (pages[i]) { + if (pages[i]) ClearPageError(pages[i]); - kmap(pages[i]); - } } err = zisofs_fill_pages(inode, full_page, pcount, pages); @@ -357,7 +358,6 @@ static int zisofs_read_folio(struct file *file, struct folio *folio) flush_dcache_page(pages[i]); if (i == full_page && err) SetPageError(pages[i]); - kunmap(pages[i]); unlock_page(pages[i]); if (i != full_page) put_page(pages[i]); From defdaff15a84c68521c5f02b157fc8541e0356f3 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Sat, 13 Aug 2022 15:00:34 -0700 Subject: [PATCH 09/81] checkpatch: add kmap and kmap_atomic to the deprecated list kmap() and kmap_atomic() are being deprecated in favor of kmap_local_page(). There are two main problems with kmap(): (1) It comes with an overhead as mapping space is restricted and protected by a global lock for synchronization and (2) it also requires global TLB invalidation when the kmap's pool wraps and it might block when the mapping space is fully utilized until a slot becomes available. kmap_local_page() is safe from any context and is therefore redundant with kmap_atomic() with the exception of any pagefault or preemption disable requirements. However, using kmap_atomic() for these side effects makes the code less clear. So any requirement for pagefault or preemption disable should be made explicitly. With kmap_local_page() the mappings are per thread, CPU local, can take page faults, and can be called from any context (including interrupts). It is faster than kmap() in kernels with HIGHMEM enabled. Furthermore, the tasks can be preempted and, when they are scheduled to run again, the kernel virtual addresses are restored. Link: https://lkml.kernel.org/r/20220813220034.806698-1-ira.weiny@intel.com Signed-off-by: Ira Weiny Suggested-by: Thomas Gleixner Suggested-by: Fabio M. De Francesco Reviewed-by: Chaitanya Kulkarni Cc: Joe Perches Signed-off-by: Andrew Morton --- scripts/checkpatch.pl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 79e759aac543b..9ff219e0a9d56 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -807,6 +807,8 @@ sub find_standard_signature { "rcu_barrier_sched" => "rcu_barrier", "get_state_synchronize_sched" => "get_state_synchronize_rcu", "cond_synchronize_sched" => "cond_synchronize_rcu", + "kmap" => "kmap_local_page", + "kmap_atomic" => "kmap_local_page", ); #Create a search pattern for all these strings to speed up a loop below From 9847f21225c4eb0b843cb2b72ed83b32edb1e6f2 Mon Sep 17 00:00:00 2001 From: Neel Natu Date: Thu, 28 Jul 2022 16:24:34 -0700 Subject: [PATCH 10/81] lib/cmdline: avoid page fault in next_arg An argument list like "arg=val arg2 \"" can trigger a page fault if the page pointed by 'args[0xffffffff]' is not mapped and potential memory corruption otherwise (unlikely but possible if the bogus address is mapped and contents happen to match the ascii value of the quote character). The fix is to ensure that we load 'args[i-1]' only when (i > 0). Prior to this commit the following command would trigger an unhandled page fault in the kernel: root@(none):/linus/fs/fat# insmod ./fat.ko "foo=bar \"" [ 33.870507] BUG: unable to handle page fault for address: ffff888204252608 [ 33.872180] #PF: supervisor read access in kernel mode [ 33.873414] #PF: error_code(0x0000) - not-present page [ 33.874650] PGD 4401067 P4D 4401067 PUD 0 [ 33.875321] Oops: 0000 [#1] SMP DEBUG_PAGEALLOC PTI [ 33.876113] CPU: 16 PID: 399 Comm: insmod Not tainted 5.19.0-dbg-DEV #4 [ 33.877193] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-debian-1.16.0-4 04/01/2014 [ 33.878739] RIP: 0010:next_arg+0xd1/0x110 [ 33.879399] Code: 22 75 1d 41 c6 04 01 00 41 80 f8 22 74 18 eb 35 4c 89 0e 45 31 d2 4c 89 cf 48 c7 02 00 00 00 00 41 80 f8 22 75 1f 41 8d 42 ff <41> 80 3c 01 22 75 14 41 c6 04 01 00 eb 0d 48 c7 02 00 00 00 00 41 [ 33.882338] RSP: 0018:ffffc90001253d08 EFLAGS: 00010246 [ 33.883174] RAX: 00000000ffffffff RBX: ffff888104252608 RCX: 0fc317bba1c1dd00 [ 33.884311] RDX: ffffc90001253d40 RSI: ffffc90001253d48 RDI: ffff888104252609 [ 33.885450] RBP: ffffc90001253d10 R08: 0000000000000022 R09: ffff888104252609 [ 33.886595] R10: 0000000000000000 R11: ffffffff82c7ff20 R12: 0000000000000282 [ 33.887748] R13: 00000000ffff8000 R14: 0000000000000000 R15: 0000000000007fff [ 33.888887] FS: 00007f04ec7432c0(0000) GS:ffff88813d300000(0000) knlGS:0000000000000000 [ 33.890183] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 33.891111] CR2: ffff888204252608 CR3: 0000000100f36005 CR4: 0000000000170ee0 [ 33.892241] Call Trace: [ 33.892641] [ 33.892989] parse_args+0x8f/0x220 [ 33.893538] load_module+0x138b/0x15a0 [ 33.894149] ? prepare_coming_module+0x50/0x50 [ 33.894879] ? kernel_read_file_from_fd+0x5f/0x90 [ 33.895639] __se_sys_finit_module+0xce/0x130 [ 33.896342] __x64_sys_finit_module+0x1d/0x20 [ 33.897042] do_syscall_64+0x44/0xa0 [ 33.897622] entry_SYSCALL_64_after_hwframe+0x63/0xcd [ 33.898434] RIP: 0033:0x7f04ec85ef79 [ 33.899009] Code: 48 8d 3d da db 0d 00 0f 05 eb a5 66 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d c7 9e 0d 00 f7 d8 64 89 01 48 [ 33.901912] RSP: 002b:00007fffae81bfe8 EFLAGS: 00000246 ORIG_RAX: 0000000000000139 [ 33.903081] RAX: ffffffffffffffda RBX: 0000559c5f1d2640 RCX: 00007f04ec85ef79 [ 33.904191] RDX: 0000000000000000 RSI: 0000559c5f1d12a0 RDI: 0000000000000003 [ 33.905304] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 [ 33.906421] R10: 0000000000000003 R11: 0000000000000246 R12: 0000559c5f1d12a0 [ 33.907526] R13: 0000000000000000 R14: 0000559c5f1d25f0 R15: 0000559c5f1d12a0 [ 33.908631] [ 33.908986] Modules linked in: fat(+) [last unloaded: fat] [ 33.909843] CR2: ffff888204252608 [ 33.910375] ---[ end trace 0000000000000000 ]--- [ 33.911172] RIP: 0010:next_arg+0xd1/0x110 [ 33.911796] Code: 22 75 1d 41 c6 04 01 00 41 80 f8 22 74 18 eb 35 4c 89 0e 45 31 d2 4c 89 cf 48 c7 02 00 00 00 00 41 80 f8 22 75 1f 41 8d 42 ff <41> 80 3c 01 22 75 14 41 c6 04 01 00 eb 0d 48 c7 02 00 00 00 00 41 [ 33.914643] RSP: 0018:ffffc90001253d08 EFLAGS: 00010246 [ 33.915446] RAX: 00000000ffffffff RBX: ffff888104252608 RCX: 0fc317bba1c1dd00 [ 33.916544] RDX: ffffc90001253d40 RSI: ffffc90001253d48 RDI: ffff888104252609 [ 33.917636] RBP: ffffc90001253d10 R08: 0000000000000022 R09: ffff888104252609 [ 33.918727] R10: 0000000000000000 R11: ffffffff82c7ff20 R12: 0000000000000282 [ 33.919821] R13: 00000000ffff8000 R14: 0000000000000000 R15: 0000000000007fff [ 33.920908] FS: 00007f04ec7432c0(0000) GS:ffff88813d300000(0000) knlGS:0000000000000000 [ 33.922125] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 33.923017] CR2: ffff888204252608 CR3: 0000000100f36005 CR4: 0000000000170ee0 [ 33.924098] Kernel panic - not syncing: Fatal exception [ 33.925776] Kernel Offset: disabled [ 33.926347] Rebooting in 10 seconds.. Link: https://lkml.kernel.org/r/20220728232434.1666488-1-neelnatu@google.com Signed-off-by: Neel Natu Reviewed-by: Eric Dumazet Signed-off-by: Andrew Morton --- lib/cmdline.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/cmdline.c b/lib/cmdline.c index 5546bf5887806..90ed997d95701 100644 --- a/lib/cmdline.c +++ b/lib/cmdline.c @@ -260,7 +260,7 @@ char *next_arg(char *args, char **param, char **val) args[i-1] = '\0'; } } - if (quoted && args[i-1] == '"') + if (quoted && i > 0 && args[i-1] == '"') args[i-1] = '\0'; if (args[i]) { From 7bb5da0d490b2d836c5218f5186ee588d2145310 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Thu, 30 Jun 2022 23:32:57 +0100 Subject: [PATCH 11/81] kexec: turn all kexec_mutex acquisitions into trylocks Patch series "kexec, panic: Making crash_kexec() NMI safe", v4. This patch (of 2): Most acquistions of kexec_mutex are done via mutex_trylock() - those were a direct "translation" from: 8c5a1cf0ad3a ("kexec: use a mutex for locking rather than xchg()") there have however been two additions since then that use mutex_lock(): crash_get_memory_size() and crash_shrink_memory(). A later commit will replace said mutex with an atomic variable, and locking operations will become atomic_cmpxchg(). Rather than having those mutex_lock() become while (atomic_cmpxchg(&lock, 0, 1)), turn them into trylocks that can return -EBUSY on acquisition failure. This does halve the printable size of the crash kernel, but that's still neighbouring 2G for 32bit kernels which should be ample enough. Link: https://lkml.kernel.org/r/20220630223258.4144112-1-vschneid@redhat.com Link: https://lkml.kernel.org/r/20220630223258.4144112-2-vschneid@redhat.com Signed-off-by: Valentin Schneider Cc: Arnd Bergmann Cc: "Eric W . Biederman" Cc: Juri Lelli Cc: Luis Claudio R. Goncalves Cc: Miaohe Lin Cc: Petr Mladek Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Cc: Baoquan He Signed-off-by: Andrew Morton --- include/linux/kexec.h | 2 +- kernel/kexec_core.c | 12 ++++++++---- kernel/ksysfs.c | 7 ++++++- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 13e6c4b58f07d..41a686996aaa3 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -427,7 +427,7 @@ extern int kexec_load_disabled; extern bool kexec_in_progress; int crash_shrink_memory(unsigned long new_size); -size_t crash_get_memory_size(void); +ssize_t crash_get_memory_size(void); #ifndef arch_kexec_protect_crashkres /* diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index acd029b307e42..8fa4eeb95b305 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1004,13 +1004,16 @@ void crash_kexec(struct pt_regs *regs) } } -size_t crash_get_memory_size(void) +ssize_t crash_get_memory_size(void) { - size_t size = 0; + ssize_t size = 0; + + if (!mutex_trylock(&kexec_mutex)) + return -EBUSY; - mutex_lock(&kexec_mutex); if (crashk_res.end != crashk_res.start) size = resource_size(&crashk_res); + mutex_unlock(&kexec_mutex); return size; } @@ -1022,7 +1025,8 @@ int crash_shrink_memory(unsigned long new_size) unsigned long old_size; struct resource *ram_res; - mutex_lock(&kexec_mutex); + if (!mutex_trylock(&kexec_mutex)) + return -EBUSY; if (kexec_crash_image) { ret = -ENOENT; diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index b1292a57c2a53..65dba9076f312 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -105,7 +105,12 @@ KERNEL_ATTR_RO(kexec_crash_loaded); static ssize_t kexec_crash_size_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%zu\n", crash_get_memory_size()); + ssize_t size = crash_get_memory_size(); + + if (size < 0) + return size; + + return sprintf(buf, "%zd\n", size); } static ssize_t kexec_crash_size_store(struct kobject *kobj, struct kobj_attribute *attr, From 05c6257433b7212f07a7e53479a8ab038fc1666a Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Thu, 30 Jun 2022 23:32:58 +0100 Subject: [PATCH 12/81] panic, kexec: make __crash_kexec() NMI safe Attempting to get a crash dump out of a debug PREEMPT_RT kernel via an NMI panic() doesn't work. The cause of that lies in the PREEMPT_RT definition of mutex_trylock(): if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task())) return 0; This prevents an nmi_panic() from executing the main body of __crash_kexec() which does the actual kexec into the kdump kernel. The warning and return are explained by: 6ce47fd961fa ("rtmutex: Warn if trylock is called from hard/softirq context") [...] The reasons for this are: 1) There is a potential deadlock in the slowpath 2) Another cpu which blocks on the rtmutex will boost the task which allegedly locked the rtmutex, but that cannot work because the hard/softirq context borrows the task context. Furthermore, grabbing the lock isn't NMI safe, so do away with kexec_mutex and replace it with an atomic variable. This is somewhat overzealous as *some* callsites could keep using a mutex (e.g. the sysfs-facing ones like crash_shrink_memory()), but this has the benefit of involving a single unified lock and preventing any future NMI-related surprises. Tested by triggering NMI panics via: $ echo 1 > /proc/sys/kernel/panic_on_unrecovered_nmi $ echo 1 > /proc/sys/kernel/unknown_nmi_panic $ echo 1 > /proc/sys/kernel/panic $ ipmitool power diag Link: https://lkml.kernel.org/r/20220630223258.4144112-3-vschneid@redhat.com Fixes: 6ce47fd961fa ("rtmutex: Warn if trylock is called from hard/softirq context") Signed-off-by: Valentin Schneider Cc: Arnd Bergmann Cc: Baoquan He Cc: "Eric W . Biederman" Cc: Juri Lelli Cc: Luis Claudio R. Goncalves Cc: Miaohe Lin Cc: Petr Mladek Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- kernel/kexec.c | 11 ++++------- kernel/kexec_core.c | 20 ++++++++++---------- kernel/kexec_file.c | 4 ++-- kernel/kexec_internal.h | 15 ++++++++++++++- 4 files changed, 30 insertions(+), 20 deletions(-) diff --git a/kernel/kexec.c b/kernel/kexec.c index b5e40f0697681..cb8e6e6f983c7 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -93,13 +93,10 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, /* * Because we write directly to the reserved memory region when loading - * crash kernels we need a mutex here to prevent multiple crash kernels - * from attempting to load simultaneously, and to prevent a crash kernel - * from loading over the top of a in use crash kernel. - * - * KISS: always take the mutex. + * crash kernels we need a serialization here to prevent multiple crash + * kernels from attempting to load simultaneously. */ - if (!mutex_trylock(&kexec_mutex)) + if (!kexec_trylock()) return -EBUSY; if (flags & KEXEC_ON_CRASH) { @@ -165,7 +162,7 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, kimage_free(image); out_unlock: - mutex_unlock(&kexec_mutex); + kexec_unlock(); return ret; } diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 8fa4eeb95b305..5ca4d40c9ec13 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -46,7 +46,7 @@ #include #include "kexec_internal.h" -DEFINE_MUTEX(kexec_mutex); +atomic_t __kexec_lock = ATOMIC_INIT(0); /* Per cpu memory for storing cpu states in case of system crash. */ note_buf_t __percpu *crash_notes; @@ -959,7 +959,7 @@ late_initcall(kexec_core_sysctl_init); */ void __noclone __crash_kexec(struct pt_regs *regs) { - /* Take the kexec_mutex here to prevent sys_kexec_load + /* Take the kexec_lock here to prevent sys_kexec_load * running on one cpu from replacing the crash kernel * we are using after a panic on a different cpu. * @@ -967,7 +967,7 @@ void __noclone __crash_kexec(struct pt_regs *regs) * of memory the xchg(&kexec_crash_image) would be * sufficient. But since I reuse the memory... */ - if (mutex_trylock(&kexec_mutex)) { + if (kexec_trylock()) { if (kexec_crash_image) { struct pt_regs fixed_regs; @@ -976,7 +976,7 @@ void __noclone __crash_kexec(struct pt_regs *regs) machine_crash_shutdown(&fixed_regs); machine_kexec(kexec_crash_image); } - mutex_unlock(&kexec_mutex); + kexec_unlock(); } } STACK_FRAME_NON_STANDARD(__crash_kexec); @@ -1008,13 +1008,13 @@ ssize_t crash_get_memory_size(void) { ssize_t size = 0; - if (!mutex_trylock(&kexec_mutex)) + if (!kexec_trylock()) return -EBUSY; if (crashk_res.end != crashk_res.start) size = resource_size(&crashk_res); - mutex_unlock(&kexec_mutex); + kexec_unlock(); return size; } @@ -1025,7 +1025,7 @@ int crash_shrink_memory(unsigned long new_size) unsigned long old_size; struct resource *ram_res; - if (!mutex_trylock(&kexec_mutex)) + if (!kexec_trylock()) return -EBUSY; if (kexec_crash_image) { @@ -1064,7 +1064,7 @@ int crash_shrink_memory(unsigned long new_size) insert_resource(&iomem_resource, ram_res); unlock: - mutex_unlock(&kexec_mutex); + kexec_unlock(); return ret; } @@ -1136,7 +1136,7 @@ int kernel_kexec(void) { int error = 0; - if (!mutex_trylock(&kexec_mutex)) + if (!kexec_trylock()) return -EBUSY; if (!kexec_image) { error = -EINVAL; @@ -1212,6 +1212,6 @@ int kernel_kexec(void) #endif Unlock: - mutex_unlock(&kexec_mutex); + kexec_unlock(); return error; } diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 1d546dc97c502..45637511e0de6 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -339,7 +339,7 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, image = NULL; - if (!mutex_trylock(&kexec_mutex)) + if (!kexec_trylock()) return -EBUSY; dest_image = &kexec_image; @@ -411,7 +411,7 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, if ((flags & KEXEC_FILE_ON_CRASH) && kexec_crash_image) arch_kexec_protect_crashkres(); - mutex_unlock(&kexec_mutex); + kexec_unlock(); kimage_free(image); return ret; } diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index 48aaf2ac0d0d1..74da1409cd14b 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -13,7 +13,20 @@ void kimage_terminate(struct kimage *image); int kimage_is_destination_range(struct kimage *image, unsigned long start, unsigned long end); -extern struct mutex kexec_mutex; +/* + * Whatever is used to serialize accesses to the kexec_crash_image needs to be + * NMI safe, as __crash_kexec() can happen during nmi_panic(), so here we use a + * "simple" atomic variable that is acquired with a cmpxchg(). + */ +extern atomic_t __kexec_lock; +static inline bool kexec_trylock(void) +{ + return atomic_cmpxchg_acquire(&__kexec_lock, 0, 1) == 0; +} +static inline void kexec_unlock(void) +{ + atomic_set_release(&__kexec_lock, 0); +} #ifdef CONFIG_KEXEC_FILE #include From 4f1d2a030db09af4d21695983674a18633cd0ffb Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Tue, 12 Jul 2022 16:49:17 +0200 Subject: [PATCH 13/81] llist: use try_cmpxchg in llist_add_batch and llist_del_first Use try_cmpxchg instead of cmpxchg (*ptr, old, new) == old in llist_add_batch and llist_del_first. x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg. Also, try_cmpxchg implicitly assigns old *ptr value to "old" when cmpxchg fails, enabling further code simplifications. No functional change intended. Link: https://lkml.kernel.org/r/20220712144917.4497-1-ubizjak@gmail.com Signed-off-by: Uros Bizjak Signed-off-by: Andrew Morton --- lib/llist.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/lib/llist.c b/lib/llist.c index 611ce4881a875..7d78b736e8afd 100644 --- a/lib/llist.c +++ b/lib/llist.c @@ -30,7 +30,7 @@ bool llist_add_batch(struct llist_node *new_first, struct llist_node *new_last, do { new_last->next = first = READ_ONCE(head->first); - } while (cmpxchg(&head->first, first, new_first) != first); + } while (!try_cmpxchg(&head->first, &first, new_first)); return !first; } @@ -52,18 +52,14 @@ EXPORT_SYMBOL_GPL(llist_add_batch); */ struct llist_node *llist_del_first(struct llist_head *head) { - struct llist_node *entry, *old_entry, *next; + struct llist_node *entry, *next; entry = smp_load_acquire(&head->first); - for (;;) { + do { if (entry == NULL) return NULL; - old_entry = entry; next = READ_ONCE(entry->next); - entry = cmpxchg(&head->first, old_entry, next); - if (entry == old_entry) - break; - } + } while (!try_cmpxchg(&head->first, &entry, next)); return entry; } From f4068af3a6383da3487c07de85db7732de851734 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 15 Aug 2022 12:50:04 +0300 Subject: [PATCH 14/81] proc: save LOC in vsyscall test Do one fork in vsyscall detection code and let SIGSEGV handler exit and carry information to the parent saving LOC. [adobriyan@gmail.com: redo original patch, delete unnecessary variables, minimise code changes] Link: https://lkml.kernel.org/r/YvoWzAn5dlhF75xa@localhost.localdomain Signed-off-by: Alexey Dobriyan Signed-off-by: Brian Foster Reviewed-by: Brian Foster Tested-by: Brian Foster Signed-off-by: Andrew Morton --- tools/testing/selftests/proc/proc-pid-vm.c | 56 +++++++--------------- 1 file changed, 16 insertions(+), 40 deletions(-) diff --git a/tools/testing/selftests/proc/proc-pid-vm.c b/tools/testing/selftests/proc/proc-pid-vm.c index e5962f4794f56..69551bfa215c4 100644 --- a/tools/testing/selftests/proc/proc-pid-vm.c +++ b/tools/testing/selftests/proc/proc-pid-vm.c @@ -213,22 +213,22 @@ static int make_exe(const uint8_t *payload, size_t len) /* * 0: vsyscall VMA doesn't exist vsyscall=none - * 1: vsyscall VMA is r-xp vsyscall=emulate - * 2: vsyscall VMA is --xp vsyscall=xonly + * 1: vsyscall VMA is --xp vsyscall=xonly + * 2: vsyscall VMA is r-xp vsyscall=emulate */ -static int g_vsyscall; +static volatile int g_vsyscall; static const char *str_vsyscall; static const char str_vsyscall_0[] = ""; static const char str_vsyscall_1[] = -"ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]\n"; -static const char str_vsyscall_2[] = "ffffffffff600000-ffffffffff601000 --xp 00000000 00:00 0 [vsyscall]\n"; +static const char str_vsyscall_2[] = +"ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]\n"; #ifdef __x86_64__ static void sigaction_SIGSEGV(int _, siginfo_t *__, void *___) { - _exit(1); + _exit(g_vsyscall); } /* @@ -255,6 +255,7 @@ static void vsyscall(void) act.sa_sigaction = sigaction_SIGSEGV; (void)sigaction(SIGSEGV, &act, NULL); + g_vsyscall = 0; /* gettimeofday(NULL, NULL); */ asm volatile ( "call %P0" @@ -262,45 +263,20 @@ static void vsyscall(void) : "i" (0xffffffffff600000), "D" (NULL), "S" (NULL) : "rax", "rcx", "r11" ); - exit(0); - } - waitpid(pid, &wstatus, 0); - if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) == 0) { - /* vsyscall page exists and is executable. */ - } else { - /* vsyscall page doesn't exist. */ - g_vsyscall = 0; - return; - } - - pid = fork(); - if (pid < 0) { - fprintf(stderr, "fork, errno %d\n", errno); - exit(1); - } - if (pid == 0) { - struct rlimit rlim = {0, 0}; - (void)setrlimit(RLIMIT_CORE, &rlim); - - /* Hide "segfault at ffffffffff600000" messages. */ - struct sigaction act; - memset(&act, 0, sizeof(struct sigaction)); - act.sa_flags = SA_SIGINFO; - act.sa_sigaction = sigaction_SIGSEGV; - (void)sigaction(SIGSEGV, &act, NULL); + g_vsyscall = 1; *(volatile int *)0xffffffffff600000UL; - exit(0); + + g_vsyscall = 2; + exit(g_vsyscall); } waitpid(pid, &wstatus, 0); - if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) == 0) { - /* vsyscall page is readable and executable. */ - g_vsyscall = 1; - return; + if (WIFEXITED(wstatus)) { + g_vsyscall = WEXITSTATUS(wstatus); + } else { + fprintf(stderr, "error: wstatus %08x\n", wstatus); + exit(1); } - - /* vsyscall page is executable but unreadable. */ - g_vsyscall = 2; } int main(void) From 2be9880dc87342dc7ae459c9ea5c9ee2a45b33d8 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 19 Aug 2022 09:44:06 +0800 Subject: [PATCH 15/81] kernel: exit: cleanup release_thread() Only x86 has own release_thread(), introduce a new weak release_thread() function to clean empty definitions in other ARCHs. Link: https://lkml.kernel.org/r/20220819014406.32266-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Guo Ren [csky] Acked-by: Russell King (Oracle) Acked-by: Geert Uytterhoeven Acked-by: Brian Cain Acked-by: Michael Ellerman [powerpc] Acked-by: Stafford Horne [openrisc] Acked-by: Catalin Marinas [arm64] Acked-by: Huacai Chen [LoongArch] Cc: Alexander Gordeev Cc: Anton Ivanov Cc: Borislav Petkov Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Chris Zankel Cc: Dave Hansen Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Guo Ren [csky] Cc: Heiko Carstens Cc: Helge Deller Cc: Ingo Molnar Cc: Ivan Kokshaysky Cc: James Bottomley Cc: Johannes Berg Cc: Jonas Bonn Cc: Matt Turner Cc: Max Filippov Cc: Michal Simek Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Henderson Cc: Richard Weinberger Cc: Rich Felker Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Xuerui Wang Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/alpha/include/asm/processor.h | 2 -- arch/alpha/kernel/process.c | 5 ----- arch/arc/include/asm/processor.h | 3 --- arch/arm/include/asm/processor.h | 3 --- arch/arm/kernel/process.c | 4 ---- arch/arm64/include/asm/processor.h | 3 --- arch/arm64/kernel/process.c | 4 ---- arch/csky/include/asm/processor.h | 5 ----- arch/hexagon/include/asm/processor.h | 4 ---- arch/hexagon/kernel/process.c | 7 ------- arch/ia64/include/asm/processor.h | 7 ------- arch/loongarch/include/asm/processor.h | 3 --- arch/m68k/include/asm/processor.h | 5 ----- arch/microblaze/include/asm/processor.h | 5 ----- arch/mips/include/asm/processor.h | 3 --- arch/nios2/include/asm/processor.h | 5 ----- arch/openrisc/include/asm/processor.h | 1 - arch/openrisc/kernel/process.c | 4 ---- arch/parisc/include/asm/processor.h | 3 --- arch/parisc/kernel/process.c | 4 ---- arch/powerpc/include/asm/processor.h | 1 - arch/powerpc/kernel/process.c | 5 ----- arch/riscv/include/asm/processor.h | 5 ----- arch/s390/include/asm/processor.h | 3 --- arch/sh/include/asm/processor_32.h | 3 --- arch/sh/kernel/process_32.c | 5 ----- arch/sparc/include/asm/processor_32.h | 3 --- arch/sparc/include/asm/processor_64.h | 3 --- arch/um/include/asm/processor-generic.h | 4 ---- arch/x86/include/asm/processor.h | 3 --- arch/xtensa/include/asm/processor.h | 3 --- include/linux/sched/task.h | 3 +++ kernel/exit.c | 4 ++++ 33 files changed, 7 insertions(+), 118 deletions(-) diff --git a/arch/alpha/include/asm/processor.h b/arch/alpha/include/asm/processor.h index 43e234c518b1c..714abe494e5fd 100644 --- a/arch/alpha/include/asm/processor.h +++ b/arch/alpha/include/asm/processor.h @@ -36,8 +36,6 @@ extern void start_thread(struct pt_regs *, unsigned long, unsigned long); /* Free all resources held by a thread. */ struct task_struct; -extern void release_thread(struct task_struct *); - unsigned long __get_wchan(struct task_struct *p); #define KSTK_EIP(tsk) (task_pt_regs(tsk)->pc) diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index e2e25f8b5e76c..dbf1bc5e2ad25 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -225,11 +225,6 @@ flush_thread(void) current_thread_info()->pcb.unique = 0; } -void -release_thread(struct task_struct *dead_task) -{ -} - /* * Copy architecture-specific thread state */ diff --git a/arch/arc/include/asm/processor.h b/arch/arc/include/asm/processor.h index 54db9d7bb562d..fb844fce1ab67 100644 --- a/arch/arc/include/asm/processor.h +++ b/arch/arc/include/asm/processor.h @@ -43,9 +43,6 @@ struct task_struct; #define task_pt_regs(p) \ ((struct pt_regs *)(THREAD_SIZE + (void *)task_stack_page(p)) - 1) -/* Free all resources held by a thread */ -#define release_thread(thread) do { } while (0) - /* * A lot of busy-wait loops in SMP are based off of non-volatile data otherwise * get optimised away by gcc diff --git a/arch/arm/include/asm/processor.h b/arch/arm/include/asm/processor.h index bdc35c0e8dfb9..326864f79d18f 100644 --- a/arch/arm/include/asm/processor.h +++ b/arch/arm/include/asm/processor.h @@ -81,9 +81,6 @@ static inline void arch_thread_struct_whitelist(unsigned long *offset, /* Forward declaration, a strange C thing */ struct task_struct; -/* Free all resources held by a thread. */ -extern void release_thread(struct task_struct *); - unsigned long __get_wchan(struct task_struct *p); #define task_pt_regs(p) \ diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 3d9cace638840..712d3e6d9be90 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -232,10 +232,6 @@ void flush_thread(void) thread_notify(THREAD_NOTIFY_FLUSH, thread); } -void release_thread(struct task_struct *dead_task) -{ -} - asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 86eb0bfe3b380..4cfb4cd1d4759 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -323,9 +323,6 @@ static inline bool is_ttbr1_addr(unsigned long addr) /* Forward declaration, a strange C thing */ struct task_struct; -/* Free all resources held by a thread. */ -extern void release_thread(struct task_struct *); - unsigned long __get_wchan(struct task_struct *p); void update_sctlr_el1(u64 sctlr); diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 92bcc1768f0b9..9015f49c206ef 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -279,10 +279,6 @@ void flush_thread(void) flush_tagged_addr_state(); } -void release_thread(struct task_struct *dead_task) -{ -} - void arch_release_task_struct(struct task_struct *tsk) { fpsimd_release_task(tsk); diff --git a/arch/csky/include/asm/processor.h b/arch/csky/include/asm/processor.h index 9638206bc44f7..63ad71fab30d7 100644 --- a/arch/csky/include/asm/processor.h +++ b/arch/csky/include/asm/processor.h @@ -69,11 +69,6 @@ do { \ /* Forward declaration, a strange C thing */ struct task_struct; -/* Free all resources held by a thread. */ -static inline void release_thread(struct task_struct *dead_task) -{ -} - /* Prepare to copy thread state - unlazy all lazy status */ #define prepare_to_copy(tsk) do { } while (0) diff --git a/arch/hexagon/include/asm/processor.h b/arch/hexagon/include/asm/processor.h index 615f7e49968e6..0cd39c2cdf8f7 100644 --- a/arch/hexagon/include/asm/processor.h +++ b/arch/hexagon/include/asm/processor.h @@ -60,10 +60,6 @@ struct thread_struct { #define KSTK_EIP(tsk) (pt_elr(task_pt_regs(tsk))) #define KSTK_ESP(tsk) (pt_psp(task_pt_regs(tsk))) -/* Free all resources held by a thread; defined in process.c */ -extern void release_thread(struct task_struct *dead_task); - -/* Get wait channel for task P. */ extern unsigned long __get_wchan(struct task_struct *p); /* The following stuff is pretty HEXAGON specific. */ diff --git a/arch/hexagon/kernel/process.c b/arch/hexagon/kernel/process.c index f0552f98a7bae..e15eeaebd7853 100644 --- a/arch/hexagon/kernel/process.c +++ b/arch/hexagon/kernel/process.c @@ -112,13 +112,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) return 0; } -/* - * Release any architecture-specific resources locked by thread - */ -void release_thread(struct task_struct *dead_task) -{ -} - /* * Some archs flush debug and FPU info here */ diff --git a/arch/ia64/include/asm/processor.h b/arch/ia64/include/asm/processor.h index 757c2f6d8d4b8..d1978e0040548 100644 --- a/arch/ia64/include/asm/processor.h +++ b/arch/ia64/include/asm/processor.h @@ -318,13 +318,6 @@ struct thread_struct { struct mm_struct; struct task_struct; -/* - * Free all resources held by a thread. This is called after the - * parent of DEAD_TASK has collected the exit status of the task via - * wait(). - */ -#define release_thread(dead_task) - /* Get wait channel for task P. */ extern unsigned long __get_wchan (struct task_struct *p); diff --git a/arch/loongarch/include/asm/processor.h b/arch/loongarch/include/asm/processor.h index 1c4b4308378d4..6954dc5d24e9d 100644 --- a/arch/loongarch/include/asm/processor.h +++ b/arch/loongarch/include/asm/processor.h @@ -176,9 +176,6 @@ struct thread_struct { struct task_struct; -/* Free all resources held by a thread. */ -#define release_thread(thread) do { } while (0) - enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_HALT, IDLE_NOMWAIT, IDLE_POLL}; extern unsigned long boot_option_idle_override; diff --git a/arch/m68k/include/asm/processor.h b/arch/m68k/include/asm/processor.h index d86b4009880b4..7a2da780830b8 100644 --- a/arch/m68k/include/asm/processor.h +++ b/arch/m68k/include/asm/processor.h @@ -145,11 +145,6 @@ static inline void start_thread(struct pt_regs * regs, unsigned long pc, /* Forward declaration, a strange C thing */ struct task_struct; -/* Free all resources held by a thread. */ -static inline void release_thread(struct task_struct *dead_task) -{ -} - unsigned long __get_wchan(struct task_struct *p); void show_registers(struct pt_regs *regs); diff --git a/arch/microblaze/include/asm/processor.h b/arch/microblaze/include/asm/processor.h index 7e9e92670df33..4e193c7550dfa 100644 --- a/arch/microblaze/include/asm/processor.h +++ b/arch/microblaze/include/asm/processor.h @@ -63,11 +63,6 @@ struct thread_struct { .pgdir = swapper_pg_dir, \ } -/* Free all resources held by a thread. */ -static inline void release_thread(struct task_struct *dead_task) -{ -} - unsigned long __get_wchan(struct task_struct *p); /* The size allocated for kernel stacks. This _must_ be a power of two! */ diff --git a/arch/mips/include/asm/processor.h b/arch/mips/include/asm/processor.h index 4bb24579d12e4..3fde1ff72bd16 100644 --- a/arch/mips/include/asm/processor.h +++ b/arch/mips/include/asm/processor.h @@ -344,9 +344,6 @@ struct thread_struct { struct task_struct; -/* Free all resources held by a thread. */ -#define release_thread(thread) do { } while(0) - /* * Do necessary setup to start up a newly executed thread. */ diff --git a/arch/nios2/include/asm/processor.h b/arch/nios2/include/asm/processor.h index b8125dfbcad2d..8916d93d5c2d0 100644 --- a/arch/nios2/include/asm/processor.h +++ b/arch/nios2/include/asm/processor.h @@ -64,11 +64,6 @@ extern void start_thread(struct pt_regs *regs, unsigned long pc, struct task_struct; -/* Free all resources held by a thread. */ -static inline void release_thread(struct task_struct *dead_task) -{ -} - extern unsigned long __get_wchan(struct task_struct *p); #define task_pt_regs(p) \ diff --git a/arch/openrisc/include/asm/processor.h b/arch/openrisc/include/asm/processor.h index aa1699c18add8..ed9efb430afa1 100644 --- a/arch/openrisc/include/asm/processor.h +++ b/arch/openrisc/include/asm/processor.h @@ -72,7 +72,6 @@ struct thread_struct { void start_thread(struct pt_regs *regs, unsigned long nip, unsigned long sp); -void release_thread(struct task_struct *); unsigned long __get_wchan(struct task_struct *p); #define cpu_relax() barrier() diff --git a/arch/openrisc/kernel/process.c b/arch/openrisc/kernel/process.c index 52dc983ddeba3..f94b5ec06786e 100644 --- a/arch/openrisc/kernel/process.c +++ b/arch/openrisc/kernel/process.c @@ -125,10 +125,6 @@ void show_regs(struct pt_regs *regs) show_registers(regs); } -void release_thread(struct task_struct *dead_task) -{ -} - /* * Copy the thread-specific (arch specific) info from the current * process to the new one p diff --git a/arch/parisc/include/asm/processor.h b/arch/parisc/include/asm/processor.h index 4621ceb513147..a608970b249af 100644 --- a/arch/parisc/include/asm/processor.h +++ b/arch/parisc/include/asm/processor.h @@ -266,9 +266,6 @@ on downward growing arches, it looks like this: struct mm_struct; -/* Free all resources held by a thread. */ -extern void release_thread(struct task_struct *); - extern unsigned long __get_wchan(struct task_struct *p); #define KSTK_EIP(tsk) ((tsk)->thread.regs.iaoq[0]) diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index 7c37e09c92da6..3db0e97e6c066 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -146,10 +146,6 @@ void flush_thread(void) */ } -void release_thread(struct task_struct *dead_task) -{ -} - /* * Idle thread support * diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index fdfaae194ddd5..92e332415d02c 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -75,7 +75,6 @@ extern int _chrp_type; struct task_struct; void start_thread(struct pt_regs *regs, unsigned long fdptr, unsigned long sp); -void release_thread(struct task_struct *); #define TS_FPR(i) fp_state.fpr[i][TS_FPROFFSET] #define TS_CKFPR(i) ckfp_state.fpr[i][TS_FPROFFSET] diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 0fbda89cd1bb5..991cda25b9a9e 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1655,11 +1655,6 @@ EXPORT_SYMBOL_GPL(set_thread_tidr); #endif /* CONFIG_PPC64 */ -void -release_thread(struct task_struct *t) -{ -} - /* * this gets called so that we can store coprocessor state into memory and * copy the current task into the new thread. diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h index 19eedd4af4cde..94a0590c69710 100644 --- a/arch/riscv/include/asm/processor.h +++ b/arch/riscv/include/asm/processor.h @@ -65,11 +65,6 @@ static inline void arch_thread_struct_whitelist(unsigned long *offset, extern void start_thread(struct pt_regs *regs, unsigned long pc, unsigned long sp); -/* Free all resources held by a thread. */ -static inline void release_thread(struct task_struct *dead_task) -{ -} - extern unsigned long __get_wchan(struct task_struct *p); diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index bd66f8e349492..c52fe651eebab 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -186,9 +186,6 @@ struct pt_regs; void show_registers(struct pt_regs *regs); void show_cacheinfo(struct seq_file *m); -/* Free all resources held by a thread. */ -static inline void release_thread(struct task_struct *tsk) { } - /* Free guarded storage control block */ void guarded_storage_release(struct task_struct *tsk); void gs_load_bc_cb(struct pt_regs *regs); diff --git a/arch/sh/include/asm/processor_32.h b/arch/sh/include/asm/processor_32.h index 45240ec6b85a4..27aebf1e75a20 100644 --- a/arch/sh/include/asm/processor_32.h +++ b/arch/sh/include/asm/processor_32.h @@ -127,9 +127,6 @@ struct task_struct; extern void start_thread(struct pt_regs *regs, unsigned long new_pc, unsigned long new_sp); -/* Free all resources held by a thread. */ -extern void release_thread(struct task_struct *); - /* * FPU lazy state save handling. */ diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c index a808843375e71..92b6649d49295 100644 --- a/arch/sh/kernel/process_32.c +++ b/arch/sh/kernel/process_32.c @@ -84,11 +84,6 @@ void flush_thread(void) #endif } -void release_thread(struct task_struct *dead_task) -{ - /* do nothing */ -} - asmlinkage void ret_from_fork(void); asmlinkage void ret_from_kernel_thread(void); diff --git a/arch/sparc/include/asm/processor_32.h b/arch/sparc/include/asm/processor_32.h index b26c35336b51d..ba8b70ffec085 100644 --- a/arch/sparc/include/asm/processor_32.h +++ b/arch/sparc/include/asm/processor_32.h @@ -80,9 +80,6 @@ static inline void start_thread(struct pt_regs * regs, unsigned long pc, : "memory"); } -/* Free all resources held by a thread. */ -#define release_thread(tsk) do { } while(0) - unsigned long __get_wchan(struct task_struct *); #define task_pt_regs(tsk) ((tsk)->thread.kregs) diff --git a/arch/sparc/include/asm/processor_64.h b/arch/sparc/include/asm/processor_64.h index 89850dff6b033..2667f35d5ea56 100644 --- a/arch/sparc/include/asm/processor_64.h +++ b/arch/sparc/include/asm/processor_64.h @@ -176,9 +176,6 @@ do { \ regs->tstate &= ~TSTATE_PEF; \ } while (0) -/* Free all resources held by a thread. */ -#define release_thread(tsk) do { } while (0) - unsigned long __get_wchan(struct task_struct *task); #define task_pt_regs(tsk) (task_thread_info(tsk)->kregs) diff --git a/arch/um/include/asm/processor-generic.h b/arch/um/include/asm/processor-generic.h index d0fc1862da957..bb5f06480da95 100644 --- a/arch/um/include/asm/processor-generic.h +++ b/arch/um/include/asm/processor-generic.h @@ -55,10 +55,6 @@ struct thread_struct { .request = { 0 } \ } -static inline void release_thread(struct task_struct *task) -{ -} - /* * User space process size: 3GB (default). */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 356308c739514..67c9d73b31faa 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -587,9 +587,6 @@ static inline void load_sp0(unsigned long sp0) #endif /* CONFIG_PARAVIRT_XXL */ -/* Free all resources held by a thread. */ -extern void release_thread(struct task_struct *); - unsigned long __get_wchan(struct task_struct *p); /* diff --git a/arch/xtensa/include/asm/processor.h b/arch/xtensa/include/asm/processor.h index 76bc63127c66e..5abde43c570cd 100644 --- a/arch/xtensa/include/asm/processor.h +++ b/arch/xtensa/include/asm/processor.h @@ -221,9 +221,6 @@ struct thread_struct { struct task_struct; struct mm_struct; -/* Free all resources held by a thread. */ -#define release_thread(thread) do { } while(0) - extern unsigned long __get_wchan(struct task_struct *p); #define KSTK_EIP(tsk) (task_pt_regs(tsk)->pc) diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 81cab4b01edcb..d6c48163c6def 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -127,6 +127,9 @@ static inline void put_task_struct_many(struct task_struct *t, int nr) void put_task_struct_rcu_user(struct task_struct *task); +/* Free all architecture-specific resources held by a thread. */ +void release_thread(struct task_struct *dead_task); + #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT extern int arch_task_struct_size __read_mostly; #else diff --git a/kernel/exit.c b/kernel/exit.c index 84021b24f79e3..f4b7b058f4e69 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -183,6 +183,10 @@ void put_task_struct_rcu_user(struct task_struct *task) call_rcu(&task->rcu, delayed_put_task_struct); } +void __weak release_thread(struct task_struct *dead_task) +{ +} + void release_task(struct task_struct *p) { struct task_struct *leader; From cba7543e1515e79a85d37df85a0eb2cf0f07d115 Mon Sep 17 00:00:00 2001 From: Minghao Chi Date: Fri, 19 Aug 2022 08:18:19 +0000 Subject: [PATCH 16/81] fs/qnx6: delete unnecessary checks before brelse() brelse() tests whether its argument is NULL and then returns immediately. Thus remove the tests which are not needed around the shown calls. Link: https://lkml.kernel.org/r/20220819081819.96347-1-chi.minghao@zte.com.cn Signed-off-by: Minghao Chi Reported-by: Zeal Robot Cc: CGEL ZTE Cc: Matthew Wilcox Cc: Minghao Chi Cc: Muchun Song Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- fs/qnx6/inode.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index b9895afca9d11..85b2fa3b211c9 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -470,10 +470,8 @@ static int qnx6_fill_super(struct super_block *s, void *data, int silent) out1: iput(sbi->inodes); out: - if (bh1) - brelse(bh1); - if (bh2) - brelse(bh2); + brelse(bh1); + brelse(bh2); outnobh: kfree(qs); s->s_fs_info = NULL; From aa06a9bd853306c239f759018fb227d7e8f4e203 Mon Sep 17 00:00:00 2001 From: Sergei Trofimovich Date: Sat, 20 Aug 2022 19:18:13 +0100 Subject: [PATCH 17/81] ia64: fix clock_getres(CLOCK_MONOTONIC) to report ITC frequency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit clock_gettime(CLOCK_MONOTONIC, &tp) is very precise on ia64 as it uses ITC (similar to rdtsc on x86). It's not quite a hrtimer as it is a few times slower than 1ns. Usually 2-3ns. clock_getres(CLOCK_MONOTONIC, &res) never reflected that fact and reported 0.04s precision (1/HZ value). In https://bugs.gentoo.org/596382 gstreamer's test suite failed loudly when it noticed precision discrepancy. Before the change: clock_getres(CLOCK_MONOTONIC, &res) reported 250Hz precision. After the change: clock_getres(CLOCK_MONOTONIC, &res) reports ITC (400Mhz) precision. The patch is based on matoro's fix. I added a bit of explanation why we need to special-case arch-specific clock_getres(). [akpm@linux-foundation.org: coding-style cleanups] Link: https://lkml.kernel.org/r/20220820181813.2275195-1-slyich@gmail.com Signed-off-by: Sergei Trofimovich Cc: matoro Cc: Émeric Maschino Signed-off-by: Andrew Morton --- arch/ia64/kernel/sys_ia64.c | 26 ++++++++++++++++++++++++++ arch/ia64/kernel/syscalls/syscall.tbl | 2 +- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/arch/ia64/kernel/sys_ia64.c b/arch/ia64/kernel/sys_ia64.c index e14db25146c22..215bf3f8cb204 100644 --- a/arch/ia64/kernel/sys_ia64.c +++ b/arch/ia64/kernel/sys_ia64.c @@ -166,3 +166,29 @@ ia64_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, u force_successful_syscall_return(); return addr; } + +asmlinkage long +ia64_clock_getres(const clockid_t which_clock, struct __kernel_timespec __user *tp) +{ + /* + * ia64's clock_gettime() syscall is implemented as a vdso call + * fsys_clock_gettime(). Currently it handles only + * CLOCK_REALTIME and CLOCK_MONOTONIC. Both are based on + * 'ar.itc' counter which gets incremented at a constant + * frequency. It's usually 400MHz, ~2.5x times slower than CPU + * clock frequency. Which is almost a 1ns hrtimer, but not quite. + * + * Let's special-case these timers to report correct precision + * based on ITC frequency and not HZ frequency for supported + * clocks. + */ + switch (which_clock) { + case CLOCK_REALTIME: + case CLOCK_MONOTONIC: + s64 tick_ns = DIV_ROUND_UP(NSEC_PER_SEC, local_cpu_data->itc_freq); + struct timespec64 rtn_tp = ns_to_timespec64(tick_ns); + return put_timespec64(&rtn_tp, tp); + } + + return sys_clock_getres(which_clock, tp); +} diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index 78b1d03e86e1d..72c929d9902b9 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -240,7 +240,7 @@ 228 common timer_delete sys_timer_delete 229 common clock_settime sys_clock_settime 230 common clock_gettime sys_clock_gettime -231 common clock_getres sys_clock_getres +231 common clock_getres ia64_clock_getres 232 common clock_nanosleep sys_clock_nanosleep 233 common fstatfs64 sys_fstatfs64 234 common statfs64 sys_statfs64 From 693fc06e98514c2d5951ead4aca40cf8b21100b1 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 14 Jul 2022 19:32:55 +0200 Subject: [PATCH 18/81] epoll: use try_cmpxchg in list_add_tail_lockless Use try_cmpxchg instead of cmpxchg (*ptr, old, new) == old in list_add_tail_lockless. x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg (and related move instruction in front of cmpxchg). No functional change intended. Link: https://lkml.kernel.org/r/20220714173255.12987-1-ubizjak@gmail.com Signed-off-by: Uros Bizjak Cc: Alexander Viro Signed-off-by: Andrew Morton --- fs/eventpoll.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 8b56b94e2f56f..52954d4637b54 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1065,7 +1065,7 @@ static inline bool list_add_tail_lockless(struct list_head *new, * added to the list from another CPU: the winner observes * new->next == new. */ - if (cmpxchg(&new->next, new, head) != new) + if (!try_cmpxchg(&new->next, &new, head)) return false; /* From b0192296b45232872720d969366449c001ab1f4a Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 14 Jul 2022 19:16:53 +0200 Subject: [PATCH 19/81] buffer: use try_cmpxchg in discard_buffer Use try_cmpxchg instead of cmpxchg (*ptr, old, new) == old in discard_buffer. x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg (and related move instruction in front of cmpxchg). Also, try_cmpxchg implicitly assigns old *ptr value to "old" when cmpxchg fails, enabling further code simplifications. Note that the value from *ptr should be read using READ_ONCE to prevent the compiler from merging, refetching or reordering the read. No functional change intended. Link: https://lkml.kernel.org/r/20220714171653.12128-1-ubizjak@gmail.com Signed-off-by: Uros Bizjak Cc: Alexander Viro Signed-off-by: Andrew Morton --- fs/buffer.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 55e762a58eb65..2fd98bb7d74ee 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1464,19 +1464,15 @@ EXPORT_SYMBOL(set_bh_page); static void discard_buffer(struct buffer_head * bh) { - unsigned long b_state, b_state_old; + unsigned long b_state; lock_buffer(bh); clear_buffer_dirty(bh); bh->b_bdev = NULL; - b_state = bh->b_state; - for (;;) { - b_state_old = cmpxchg(&bh->b_state, b_state, - (b_state & ~BUFFER_FLAGS_DISCARD)); - if (b_state_old == b_state) - break; - b_state = b_state_old; - } + b_state = READ_ONCE(bh->b_state); + do { + } while (!try_cmpxchg(&bh->b_state, &b_state, + b_state & ~BUFFER_FLAGS_DISCARD)); unlock_buffer(bh); } From 38ace0d513d9d2556beca4d07102d25e9a73c53c Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 14 Jul 2022 18:48:51 +0200 Subject: [PATCH 20/81] aio: use atomic_try_cmpxchg in __get_reqs_available Use atomic_try_cmpxchg instead of atomic_cmpxchg (*ptr, old, new) == old in __get_reqs_available. x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg (and related move instruction in front of cmpxchg). Also, atomic_try_cmpxchg implicitly assigns old *ptr value to "old" when cmpxchg fails, enabling further code simplifications. No functional change intended. Link: https://lkml.kernel.org/r/20220714164851.3055-1-ubizjak@gmail.com Signed-off-by: Uros Bizjak Cc: Benjamin LaHaise Cc: Alexander Viro Signed-off-by: Andrew Morton --- fs/aio.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index 606613e9d1f4f..5b2ff20ad3229 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -951,16 +951,13 @@ static bool __get_reqs_available(struct kioctx *ctx) local_irq_save(flags); kcpu = this_cpu_ptr(ctx->cpu); if (!kcpu->reqs_available) { - int old, avail = atomic_read(&ctx->reqs_available); + int avail = atomic_read(&ctx->reqs_available); do { if (avail < ctx->req_batch) goto out; - - old = avail; - avail = atomic_cmpxchg(&ctx->reqs_available, - avail, avail - ctx->req_batch); - } while (avail != old); + } while (!atomic_try_cmpxchg(&ctx->reqs_available, + &avail, avail - ctx->req_batch)); kcpu->reqs_available += ctx->req_batch; } From da3f52ba359590d8eb465ae0d8b6e11d6fd9432f Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Sun, 21 Aug 2022 21:30:11 +0200 Subject: [PATCH 21/81] iversion: use atomic64_try_cmpxchg) Use atomic64_try_cmpxchg instead of atomic64_cmpxchg (*ptr, old, new) == old in inode_set_max_iversion_raw, inode_maybe_inc_version and inode_query_iversion. x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg (and related move instruction in front of cmpxchg). Also, try_cmpxchg implicitly assigns old *ptr value to "old" when cmpxchg fails, enabling further code simplifications. The loop in inode_maybe_inc_iversion improves from: 5563: 48 89 ca mov %rcx,%rdx 5566: 48 89 c8 mov %rcx,%rax 5569: 48 83 e2 fe and $0xfffffffffffffffe,%rdx 556d: 48 83 c2 02 add $0x2,%rdx 5571: f0 48 0f b1 16 lock cmpxchg %rdx,(%rsi) 5576: 48 39 c1 cmp %rax,%rcx 5579: 0f 84 85 fc ff ff je 5204 <...> 557f: 48 89 c1 mov %rax,%rcx 5582: eb df jmp 5563 <...> to: 5563: 48 89 c2 mov %rax,%rdx 5566: 48 83 e2 fe and $0xfffffffffffffffe,%rdx 556a: 48 83 c2 02 add $0x2,%rdx 556e: f0 48 0f b1 11 lock cmpxchg %rdx,(%rcx) 5573: 0f 84 8b fc ff ff je 5204 <...> 5579: eb e8 jmp 5563 <...> Link: https://lkml.kernel.org/r/20220821193011.88208-1-ubizjak@gmail.com Signed-off-by: Uros Bizjak Signed-off-by: Andrew Morton --- include/linux/iversion.h | 32 +++++++++----------------------- 1 file changed, 9 insertions(+), 23 deletions(-) diff --git a/include/linux/iversion.h b/include/linux/iversion.h index 3bfebde5a1a6d..eb5a158101693 100644 --- a/include/linux/iversion.h +++ b/include/linux/iversion.h @@ -123,17 +123,12 @@ inode_peek_iversion_raw(const struct inode *inode) static inline void inode_set_max_iversion_raw(struct inode *inode, u64 val) { - u64 cur, old; + u64 cur = inode_peek_iversion_raw(inode); - cur = inode_peek_iversion_raw(inode); - for (;;) { + do { if (cur > val) break; - old = atomic64_cmpxchg(&inode->i_version, cur, val); - if (likely(old == cur)) - break; - cur = old; - } + } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, val)); } /** @@ -197,7 +192,7 @@ inode_set_iversion_queried(struct inode *inode, u64 val) static inline bool inode_maybe_inc_iversion(struct inode *inode, bool force) { - u64 cur, old, new; + u64 cur, new; /* * The i_version field is not strictly ordered with any other inode @@ -211,19 +206,14 @@ inode_maybe_inc_iversion(struct inode *inode, bool force) */ smp_mb(); cur = inode_peek_iversion_raw(inode); - for (;;) { + do { /* If flag is clear then we needn't do anything */ if (!force && !(cur & I_VERSION_QUERIED)) return false; /* Since lowest bit is flag, add 2 to avoid it */ new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT; - - old = atomic64_cmpxchg(&inode->i_version, cur, new); - if (likely(old == cur)) - break; - cur = old; - } + } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); return true; } @@ -304,10 +294,10 @@ inode_peek_iversion(const struct inode *inode) static inline u64 inode_query_iversion(struct inode *inode) { - u64 cur, old, new; + u64 cur, new; cur = inode_peek_iversion_raw(inode); - for (;;) { + do { /* If flag is already set, then no need to swap */ if (cur & I_VERSION_QUERIED) { /* @@ -320,11 +310,7 @@ inode_query_iversion(struct inode *inode) } new = cur | I_VERSION_QUERIED; - old = atomic64_cmpxchg(&inode->i_version, cur, new); - if (likely(old == cur)) - break; - cur = old; - } + } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); return cur >> I_VERSION_QUERIED_SHIFT; } From 948084f0f6959f602f89f679522b706a72da0285 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Sun, 21 Aug 2022 20:25:19 +0200 Subject: [PATCH 22/81] kexec: replace kmap() with kmap_local_page() kmap() is being deprecated in favor of kmap_local_page(). There are two main problems with kmap(): (1) It comes with an overhead as mapping space is restricted and protected by a global lock for synchronization and (2) it also requires global TLB invalidation when the kmap's pool wraps and it might block when the mapping space is fully utilized until a slot becomes available. With kmap_local_page() the mappings are per thread, CPU local, can take page faults, and can be called from any context (including interrupts). It is faster than kmap() in kernels with HIGHMEM enabled. Furthermore, the tasks can be preempted and, when they are scheduled to run again, the kernel virtual addresses are restored and are still valid. Since its use in kexec_core.c is safe everywhere, it should be preferred. Therefore, replace kmap() with kmap_local_page() in kexec_core.c. Tested on a QEMU/KVM x86_32 VM, 6GB RAM, booting a kernel with HIGHMEM64GB enabled. Link: https://lkml.kernel.org/r/20220821182519.9483-1-fmdefrancesco@gmail.com Signed-off-by: Fabio M. De Francesco Suggested-by: Ira Weiny Reviewed-by: Ira Weiny Acked-by: Baoquan He Signed-off-by: Andrew Morton --- kernel/kexec_core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 5ca4d40c9ec13..ca2743f9c634e 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -809,7 +809,7 @@ static int kimage_load_normal_segment(struct kimage *image, if (result < 0) goto out; - ptr = kmap(page); + ptr = kmap_local_page(page); /* Start with a clear page */ clear_page(ptr); ptr += maddr & ~PAGE_MASK; @@ -822,7 +822,7 @@ static int kimage_load_normal_segment(struct kimage *image, memcpy(ptr, kbuf, uchunk); else result = copy_from_user(ptr, buf, uchunk); - kunmap(page); + kunmap_local(ptr); if (result) { result = -EFAULT; goto out; @@ -873,7 +873,7 @@ static int kimage_load_crash_segment(struct kimage *image, goto out; } arch_kexec_post_alloc_pages(page_address(page), 1, 0); - ptr = kmap(page); + ptr = kmap_local_page(page); ptr += maddr & ~PAGE_MASK; mchunk = min_t(size_t, mbytes, PAGE_SIZE - (maddr & ~PAGE_MASK)); @@ -889,7 +889,7 @@ static int kimage_load_crash_segment(struct kimage *image, else result = copy_from_user(ptr, buf, uchunk); kexec_flush_icache_page(page); - kunmap(page); + kunmap_local(ptr); arch_kexec_pre_free_pages(page_address(page), 1); if (result) { result = -EFAULT; From d75e9a4bccf4e19928534dec797935e650f85b09 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Sun, 21 Aug 2022 20:03:58 +0200 Subject: [PATCH 23/81] hfs: unmap the page in the "fail_page" label Patch series "hfs: Replace kmap() with kmap_local_page()". kmap() is being deprecated in favor of kmap_local_page(). There are two main problems with kmap(): (1) It comes with an overhead as mapping space is restricted and protected by a global lock for synchronization and (2) it also requires global TLB invalidation when the kmaps pool wraps and it might block when the mapping space is fully utilized until a slot becomes available. With kmap_local_page() the mappings are per thread, CPU local, can take page faults, and can be called from any context (including interrupts). It is faster than kmap() in kernels with HIGHMEM enabled. Furthermore, the tasks can be preempted and, when they are scheduled to run again, the kernel virtual addresses are restored and still valid. Since its use in fs/hfs is safe everywhere, it should be preferred. Therefore, replace kmap() with kmap_local_page() in fs/hfs. Where possible, use the suited standard helpers (memzero_page(), memcpy_page()) instead of open coding kmap_local_page() plus memset() or memcpy(). Fix a bug due to a page being not unmapped if the code jumps to the "fail_page" label (1/3). Tested in a QEMU/KVM x86_32 VM, 6GB RAM, booting a kernel with HIGHMEM64GB enabled. This patch (of 3): Several paths within hfs_btree_open() jump to the "fail_page" label where put_page() is called while the page is still mapped. Call kunmap() to unmap the page soon before put_page(). Link: https://lkml.kernel.org/r/20220821180400.8198-1-fmdefrancesco@gmail.com Link: https://lkml.kernel.org/r/20220821180400.8198-2-fmdefrancesco@gmail.com Signed-off-by: Fabio M. De Francesco Reviewed-by: Ira Weiny Reviewed-by: Viacheslav Dubeyko Cc: Arnd Bergmann Cc: Chaitanya Kulkarni Cc: Christian Brauner (Microsoft) Cc: Damien Le Moal Cc: Matthew Wilcox ] Cc: Jeff Layton Cc: Jens Axboe Cc: Kees Cook Cc: Martin K. Petersen Cc: Muchun Song Cc: Roman Gushchin Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- fs/hfs/btree.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index 19017d2961734..56c6782436e9c 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -124,6 +124,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke return tree; fail_page: + kunmap(page); put_page(page); free_inode: tree->inode->i_mapping->a_ops = &hfs_aops; From ca0ac8dfd35b218b3c95d3b38c695fbff35d94ca Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Sun, 21 Aug 2022 20:03:59 +0200 Subject: [PATCH 24/81] hfs: replace kmap() with kmap_local_page() in bnode.c kmap() is being deprecated in favor of kmap_local_page(). Two main problems with kmap(): (1) It comes with an overhead as mapping space is restricted and protected by a global lock for synchronization and (2) it also requires global TLB invalidation when the kmap's pool wraps and it might block when the mapping space is fully utilized until a slot becomes available. With kmap_local_page() the mappings are per thread, CPU local, can take page faults, and can be called from any context (including interrupts). It is faster than kmap() in kernels with HIGHMEM enabled. Furthermore, the tasks can be preempted and, when they are scheduled to run again, the kernel virtual addresses are restored and still valid. Since its use in bnode.c is safe everywhere, it should be preferred. Therefore, replace kmap() with kmap_local_page() in bnode.c. Where possible, use the suited standard helpers (memzero_page(), memcpy_page()) instead of open coding kmap_local_page() plus memset() or memcpy(). Tested in a QEMU/KVM x86_32 VM, 6GB RAM, booting a kernel with HIGHMEM64GB enabled. Link: https://lkml.kernel.org/r/20220821180400.8198-3-fmdefrancesco@gmail.com Signed-off-by: Fabio M. De Francesco Suggested-by: Ira Weiny Reviewed-by: Viacheslav Dubeyko Cc: Arnd Bergmann Cc: Chaitanya Kulkarni Cc: Christian Brauner (Microsoft) Cc: Damien Le Moal Cc: Jeff Layton Cc: Jens Axboe Cc: Kees Cook Cc: Martin K. Petersen Cc: Matthew Wilcox Cc: Muchun Song Cc: Roman Gushchin Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- fs/hfs/bnode.c | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c index c83fd0e8404d3..2015e42e752a6 100644 --- a/fs/hfs/bnode.c +++ b/fs/hfs/bnode.c @@ -21,7 +21,6 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len) int pagenum; int bytes_read; int bytes_to_read; - void *vaddr; off += node->page_offset; pagenum = off >> PAGE_SHIFT; @@ -33,9 +32,7 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len) page = node->page[pagenum]; bytes_to_read = min_t(int, len - bytes_read, PAGE_SIZE - off); - vaddr = kmap_atomic(page); - memcpy(buf + bytes_read, vaddr + off, bytes_to_read); - kunmap_atomic(vaddr); + memcpy_from_page(buf + bytes_read, page, off, bytes_to_read); pagenum++; off = 0; /* page offset only applies to the first page */ @@ -80,8 +77,7 @@ void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len) off += node->page_offset; page = node->page[0]; - memcpy(kmap(page) + off, buf, len); - kunmap(page); + memcpy_to_page(page, off, buf, len); set_page_dirty(page); } @@ -105,8 +101,7 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len) off += node->page_offset; page = node->page[0]; - memset(kmap(page) + off, 0, len); - kunmap(page); + memzero_page(page, off, len); set_page_dirty(page); } @@ -123,9 +118,7 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, src_page = src_node->page[0]; dst_page = dst_node->page[0]; - memcpy(kmap(dst_page) + dst, kmap(src_page) + src, len); - kunmap(src_page); - kunmap(dst_page); + memcpy_page(dst_page, dst, src_page, src, len); set_page_dirty(dst_page); } @@ -140,9 +133,9 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) src += node->page_offset; dst += node->page_offset; page = node->page[0]; - ptr = kmap(page); + ptr = kmap_local_page(page); memmove(ptr + dst, ptr + src, len); - kunmap(page); + kunmap_local(ptr); set_page_dirty(page); } @@ -346,13 +339,14 @@ struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num) if (!test_bit(HFS_BNODE_NEW, &node->flags)) return node; - desc = (struct hfs_bnode_desc *)(kmap(node->page[0]) + node->page_offset); + desc = (struct hfs_bnode_desc *)(kmap_local_page(node->page[0]) + + node->page_offset); node->prev = be32_to_cpu(desc->prev); node->next = be32_to_cpu(desc->next); node->num_recs = be16_to_cpu(desc->num_recs); node->type = desc->type; node->height = desc->height; - kunmap(node->page[0]); + kunmap_local(desc); switch (node->type) { case HFS_NODE_HEADER: @@ -436,14 +430,12 @@ struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num) } pagep = node->page; - memset(kmap(*pagep) + node->page_offset, 0, - min((int)PAGE_SIZE, (int)tree->node_size)); + memzero_page(*pagep, node->page_offset, + min((int)PAGE_SIZE, (int)tree->node_size)); set_page_dirty(*pagep); - kunmap(*pagep); for (i = 1; i < tree->pages_per_bnode; i++) { - memset(kmap(*++pagep), 0, PAGE_SIZE); + memzero_page(*++pagep, 0, PAGE_SIZE); set_page_dirty(*pagep); - kunmap(*pagep); } clear_bit(HFS_BNODE_NEW, &node->flags); wake_up(&node->lock_wq); From 21490eff121555b123f7088b935e40ee43d2c642 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Sun, 21 Aug 2022 20:04:00 +0200 Subject: [PATCH 25/81] hfs: replace kmap() with kmap_local_page() in btree.c kmap() is being deprecated in favor of kmap_local_page(). Two main problems with kmap(): (1) It comes with an overhead as mapping space is restricted and protected by a global lock for synchronization and (2) it also requires global TLB invalidation when the kmap's pool wraps and it might block when the mapping space is fully utilized until a slot becomes available. With kmap_local_page() the mappings are per thread, CPU local, can take page faults, and can be called from any context (including interrupts). It is faster than kmap() in kernels with HIGHMEM enabled. Furthermore, the tasks can be preempted and, when they are scheduled to run again, the kernel virtual addresses are restored and still valid. Since its use in btree.c is safe everywhere, it should be preferred. Therefore, replace kmap() with kmap_local_page() in btree.c. Where possible, use the suited standard helpers (memzero_page(), memcpy_page()) instead of open coding kmap_local_page() plus memset() or memcpy(). Tested in a QEMU/KVM x86_32 VM, 6GB RAM, booting a kernel with HIGHMEM64GB enabled. Link: https://lkml.kernel.org/r/20220821180400.8198-4-fmdefrancesco@gmail.com Signed-off-by: Fabio M. De Francesco Suggested-by: Ira Weiny Reviewed-by: Viacheslav Dubeyko Cc: Arnd Bergmann Cc: Chaitanya Kulkarni Cc: Christian Brauner (Microsoft) Cc: Damien Le Moal Cc: Jeff Layton Cc: Jens Axboe Cc: Kees Cook Cc: Martin K. Petersen Cc: Matthew Wilcox Cc: Muchun Song Cc: Roman Gushchin Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- fs/hfs/btree.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index 56c6782436e9c..2fa4b1f8cc7fb 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -80,7 +80,8 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke goto free_inode; /* Load the header */ - head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc)); + head = (struct hfs_btree_header_rec *)(kmap_local_page(page) + + sizeof(struct hfs_bnode_desc)); tree->root = be32_to_cpu(head->root); tree->leaf_count = be32_to_cpu(head->leaf_count); tree->leaf_head = be32_to_cpu(head->leaf_head); @@ -119,12 +120,12 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke tree->node_size_shift = ffs(size) - 1; tree->pages_per_bnode = (tree->node_size + PAGE_SIZE - 1) >> PAGE_SHIFT; - kunmap(page); + kunmap_local(head); put_page(page); return tree; fail_page: - kunmap(page); + kunmap_local(head); put_page(page); free_inode: tree->inode->i_mapping->a_ops = &hfs_aops; @@ -170,7 +171,8 @@ void hfs_btree_write(struct hfs_btree *tree) return; /* Load the header */ page = node->page[0]; - head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc)); + head = (struct hfs_btree_header_rec *)(kmap_local_page(page) + + sizeof(struct hfs_bnode_desc)); head->root = cpu_to_be32(tree->root); head->leaf_count = cpu_to_be32(tree->leaf_count); @@ -181,7 +183,7 @@ void hfs_btree_write(struct hfs_btree *tree) head->attributes = cpu_to_be32(tree->attributes); head->depth = cpu_to_be16(tree->depth); - kunmap(page); + kunmap_local(head); set_page_dirty(page); hfs_bnode_put(node); } @@ -269,7 +271,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) off += node->page_offset; pagep = node->page + (off >> PAGE_SHIFT); - data = kmap(*pagep); + data = kmap_local_page(*pagep); off &= ~PAGE_MASK; idx = 0; @@ -282,7 +284,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) idx += i; data[off] |= m; set_page_dirty(*pagep); - kunmap(*pagep); + kunmap_local(data); tree->free_nodes--; mark_inode_dirty(tree->inode); hfs_bnode_put(node); @@ -291,14 +293,14 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) } } if (++off >= PAGE_SIZE) { - kunmap(*pagep); - data = kmap(*++pagep); + kunmap_local(data); + data = kmap_local_page(*++pagep); off = 0; } idx += 8; len--; } - kunmap(*pagep); + kunmap_local(data); nidx = node->next; if (!nidx) { printk(KERN_DEBUG "create new bmap node...\n"); @@ -314,7 +316,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) off = off16; off += node->page_offset; pagep = node->page + (off >> PAGE_SHIFT); - data = kmap(*pagep); + data = kmap_local_page(*pagep); off &= ~PAGE_MASK; } } @@ -361,20 +363,20 @@ void hfs_bmap_free(struct hfs_bnode *node) } off += node->page_offset + nidx / 8; page = node->page[off >> PAGE_SHIFT]; - data = kmap(page); + data = kmap_local_page(page); off &= ~PAGE_MASK; m = 1 << (~nidx & 7); byte = data[off]; if (!(byte & m)) { pr_crit("trying to free free bnode %u(%d)\n", node->this, node->type); - kunmap(page); + kunmap_local(data); hfs_bnode_put(node); return; } data[off] = byte & ~m; set_page_dirty(page); - kunmap(page); + kunmap_local(data); hfs_bnode_put(node); tree->free_nodes++; mark_inode_dirty(tree->inode); From e1d7c7609ae0933b59840390c1b207ac0a925c8b Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 22 Aug 2022 16:38:51 +0200 Subject: [PATCH 26/81] bitops: use try_cmpxchg in set_mask_bits and bit_clear_unless Use try_cmpxchg instead of cmpxchg (*ptr, old, new) == old in set_mask_bits and bit_clear_unless. x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg (and related move instruction in front of cmpxchg). Also, try_cmpxchg implicitly assigns old *ptr value to "old" when cmpxchg fails, enabling further code simplifications. Link: https://lkml.kernel.org/r/20220822143851.3290-1-ubizjak@gmail.com Signed-off-by: Uros Bizjak Signed-off-by: Andrew Morton --- include/linux/bitops.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 3b89c64bcfd8f..fb24a513d9172 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -328,10 +328,10 @@ static __always_inline void __assign_bit(long nr, volatile unsigned long *addr, const typeof(*(ptr)) mask__ = (mask), bits__ = (bits); \ typeof(*(ptr)) old__, new__; \ \ + old__ = READ_ONCE(*(ptr)); \ do { \ - old__ = READ_ONCE(*(ptr)); \ new__ = (old__ & ~mask__) | bits__; \ - } while (cmpxchg(ptr, old__, new__) != old__); \ + } while (!try_cmpxchg(ptr, &old__, new__)); \ \ old__; \ }) @@ -343,11 +343,12 @@ static __always_inline void __assign_bit(long nr, volatile unsigned long *addr, const typeof(*(ptr)) clear__ = (clear), test__ = (test);\ typeof(*(ptr)) old__, new__; \ \ + old__ = READ_ONCE(*(ptr)); \ do { \ - old__ = READ_ONCE(*(ptr)); \ + if (old__ & test__) \ + break; \ new__ = old__ & ~clear__; \ - } while (!(old__ & test__) && \ - cmpxchg(ptr, old__, new__) != old__); \ + } while (!try_cmpxchg(ptr, &old__, new__)); \ \ !(old__ & test__); \ }) From 88040e67b9533003cfd1a2d61ebd17593435322c Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 18 Aug 2022 22:59:36 +0200 Subject: [PATCH 27/81] alpha: move from strlcpy with unused retval to strscpy Follow the advice of the below link and prefer 'strscpy' in this subsystem. Conversion is 1:1 because the return value is not used. Generated by a coccinelle script. Link: https://lore.kernel.org/r/CAHk-=wgfRnXz0W3D37d01q3JFkr_i_uTL=V6A6G1oUZcprmknw@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220818205936.6144-1-wsa+renesas@sang-engineering.com Signed-off-by: Wolfram Sang Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Richard Henderson Signed-off-by: Andrew Morton --- arch/alpha/kernel/setup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c index b4fbbba30aa2b..33bf3a6270027 100644 --- a/arch/alpha/kernel/setup.c +++ b/arch/alpha/kernel/setup.c @@ -491,9 +491,9 @@ setup_arch(char **cmdline_p) boot flags depending on the boot mode, we need some shorthand. This should do for installation. */ if (strcmp(COMMAND_LINE, "INSTALL") == 0) { - strlcpy(command_line, "root=/dev/fd0 load_ramdisk=1", sizeof command_line); + strscpy(command_line, "root=/dev/fd0 load_ramdisk=1", sizeof(command_line)); } else { - strlcpy(command_line, COMMAND_LINE, sizeof command_line); + strscpy(command_line, COMMAND_LINE, sizeof(command_line)); } strcpy(boot_command_line, command_line); *cmdline_p = command_line; From 216e71f13c13a7b3df352742554445907011a3a5 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 18 Aug 2022 22:59:39 +0200 Subject: [PATCH 28/81] ia64: move from strlcpy with unused retval to strscpy Follow the advice of the below link and prefer 'strscpy' in this subsystem. Conversion is 1:1 because the return value is not used. Generated by a coccinelle script. Link: https://lore.kernel.org/r/CAHk-=wgfRnXz0W3D37d01q3JFkr_i_uTL=V6A6G1oUZcprmknw@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220818205940.6216-1-wsa+renesas@sang-engineering.com Signed-off-by: Wolfram Sang Signed-off-by: Andrew Morton --- arch/ia64/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index fd6301eafa9d5..c057280442727 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -552,7 +552,7 @@ setup_arch (char **cmdline_p) ia64_patch_vtop((u64) __start___vtop_patchlist, (u64) __end___vtop_patchlist); *cmdline_p = __va(ia64_boot_param->command_line); - strlcpy(boot_command_line, *cmdline_p, COMMAND_LINE_SIZE); + strscpy(boot_command_line, *cmdline_p, COMMAND_LINE_SIZE); efi_init(); io_port_init(); From c97e21fe91ed1d59eb36cac1728bcb9a82167c7a Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 18 Aug 2022 23:01:13 +0200 Subject: [PATCH 29/81] ocfs2: move from strlcpy with unused retval to strscpy Follow the advice of the below link and prefer 'strscpy' in this subsystem. Conversion is 1:1 because the return value is not used. Generated by a coccinelle script. Link: https://lore.kernel.org/r/CAHk-=wgfRnXz0W3D37d01q3JFkr_i_uTL=V6A6G1oUZcprmknw@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220818210123.7637-4-wsa+renesas@sang-engineering.com Signed-off-by: Wolfram Sang Acked-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/stackglue.c | 4 ++-- fs/ocfs2/super.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index dd77b7aaabf5c..317126261523b 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -334,10 +334,10 @@ int ocfs2_cluster_connect(const char *stack_name, goto out; } - strlcpy(new_conn->cc_name, group, GROUP_NAME_MAX + 1); + strscpy(new_conn->cc_name, group, GROUP_NAME_MAX + 1); new_conn->cc_namelen = grouplen; if (cluster_name_len) - strlcpy(new_conn->cc_cluster_name, cluster_name, + strscpy(new_conn->cc_cluster_name, cluster_name, CLUSTER_NAME_MAX + 1); new_conn->cc_cluster_name_len = cluster_name_len; new_conn->cc_recovery_handler = recovery_handler; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index e2cc9eec287c9..660bc1795848a 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2221,7 +2221,7 @@ static int ocfs2_initialize_super(struct super_block *sb, goto out_journal; } - strlcpy(osb->vol_label, di->id2.i_super.s_label, + strscpy(osb->vol_label, di->id2.i_super.s_label, OCFS2_MAX_VOL_LABEL_LEN); osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno); osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno); From 512cb7e4c110133e49da9f69885df3ed41aa284f Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 18 Aug 2022 23:01:53 +0200 Subject: [PATCH 30/81] reiserfs: move from strlcpy with unused retval to strscpy Follow the advice of the below link and prefer 'strscpy' in this subsystem. Conversion is 1:1 because the return value is not used. Generated by a coccinelle script. Link: https://lore.kernel.org/r/CAHk-=wgfRnXz0W3D37d01q3JFkr_i_uTL=V6A6G1oUZcprmknw@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220818210153.8095-1-wsa+renesas@sang-engineering.com Signed-off-by: Wolfram Sang Cc: Jan Kara Signed-off-by: Andrew Morton --- fs/reiserfs/procfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index 4a7cb16e9345c..3dba8acf4e832 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -411,7 +411,7 @@ int reiserfs_proc_info_init(struct super_block *sb) char *s; /* Some block devices use /'s */ - strlcpy(b, sb->s_id, BDEVNAME_SIZE); + strscpy(b, sb->s_id, BDEVNAME_SIZE); s = strchr(b, '/'); if (s) *s = '!'; @@ -441,7 +441,7 @@ int reiserfs_proc_info_done(struct super_block *sb) char *s; /* Some block devices use /'s */ - strlcpy(b, sb->s_id, BDEVNAME_SIZE); + strscpy(b, sb->s_id, BDEVNAME_SIZE); s = strchr(b, '/'); if (s) *s = '!'; From a1d3a6d9f243797d1bcaa0ca14c03396bc302ca6 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 18 Aug 2022 23:01:59 +0200 Subject: [PATCH 31/81] init: move from strlcpy with unused retval to strscpy Follow the advice of the below link and prefer 'strscpy' in this subsystem. Conversion is 1:1 because the return value is not used. Generated by a coccinelle script. Link: https://lore.kernel.org/r/CAHk-=wgfRnXz0W3D37d01q3JFkr_i_uTL=V6A6G1oUZcprmknw@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220818210200.8203-1-wsa+renesas@sang-engineering.com Signed-off-by: Wolfram Sang Signed-off-by: Andrew Morton --- init/do_mounts.c | 4 ++-- init/main.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/init/do_mounts.c b/init/do_mounts.c index 7058e14ad5f70..811e94daf0a84 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -296,7 +296,7 @@ EXPORT_SYMBOL_GPL(name_to_dev_t); static int __init root_dev_setup(char *line) { - strlcpy(saved_root_name, line, sizeof(saved_root_name)); + strscpy(saved_root_name, line, sizeof(saved_root_name)); return 1; } @@ -343,7 +343,7 @@ static int __init split_fs_names(char *page, size_t size, char *names) int count = 1; char *p = page; - strlcpy(p, root_fs_names, size); + strscpy(p, root_fs_names, size); while (*p++) { if (p[-1] == ',') { p[-1] = '\0'; diff --git a/init/main.c b/init/main.c index 1fe7942f5d4a8..a45f9eca40af0 100644 --- a/init/main.c +++ b/init/main.c @@ -422,7 +422,7 @@ static void __init setup_boot_config(void) if (!data) data = xbc_get_embedded_bootconfig(&size); - strlcpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE); + strscpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE); err = parse_args("bootconfig", tmp_cmdline, NULL, 0, 0, 0, NULL, bootconfig_params); @@ -762,7 +762,7 @@ void __init parse_early_param(void) return; /* All fall through to do_early_param. */ - strlcpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE); + strscpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE); parse_early_options(tmp_cmdline); done = 1; } From 977bbf4385fc64986f22b024858071a35c481a8a Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 18 Aug 2022 23:02:03 +0200 Subject: [PATCH 32/81] lib: move from strlcpy with unused retval to strscpy Follow the advice of the below link and prefer 'strscpy' in this subsystem. Conversion is 1:1 because the return value is not used. Generated by a coccinelle script. Link: https://lore.kernel.org/r/CAHk-=wgfRnXz0W3D37d01q3JFkr_i_uTL=V6A6G1oUZcprmknw@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220818210203.8251-1-wsa+renesas@sang-engineering.com Signed-off-by: Wolfram Sang Signed-off-by: Andrew Morton --- lib/earlycpio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/earlycpio.c b/lib/earlycpio.c index 7921193f04243..d2c37d64fd0c3 100644 --- a/lib/earlycpio.c +++ b/lib/earlycpio.c @@ -126,7 +126,7 @@ struct cpio_data find_cpio_data(const char *path, void *data, "File %s exceeding MAX_CPIO_FILE_NAME [%d]\n", p, MAX_CPIO_FILE_NAME); } - strlcpy(cd.name, p + mypathsize, MAX_CPIO_FILE_NAME); + strscpy(cd.name, p + mypathsize, MAX_CPIO_FILE_NAME); cd.data = (void *)dptr; cd.size = ch[C_FILESIZE]; From 5fdfa161b2043001f82cbce49e87e8e9f581d510 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Tue, 23 Aug 2022 17:26:32 +0200 Subject: [PATCH 33/81] task_work: use try_cmpxchg in task_work_add, task_work_cancel_match and task_work_run Use try_cmpxchg instead of cmpxchg (*ptr, old, new) == old in task_work_add, task_work_cancel_match and task_work_run. x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg (and related move instruction in front of cmpxchg). Also, atomic_try_cmpxchg implicitly assigns old *ptr value to "old" when cmpxchg fails, enabling further code simplifications. The patch avoids extra memory read in case cmpxchg fails. Link: https://lkml.kernel.org/r/20220823152632.4517-1-ubizjak@gmail.com Signed-off-by: Uros Bizjak Signed-off-by: Andrew Morton --- kernel/task_work.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/kernel/task_work.c b/kernel/task_work.c index dff75bcde1514..065e1ef8fc8d7 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -47,12 +47,12 @@ int task_work_add(struct task_struct *task, struct callback_head *work, /* record the work call stack in order to print it in KASAN reports */ kasan_record_aux_stack(work); + head = READ_ONCE(task->task_works); do { - head = READ_ONCE(task->task_works); if (unlikely(head == &work_exited)) return -ESRCH; work->next = head; - } while (cmpxchg(&task->task_works, head, work) != head); + } while (!try_cmpxchg(&task->task_works, &head, work)); switch (notify) { case TWA_NONE: @@ -100,10 +100,12 @@ task_work_cancel_match(struct task_struct *task, * we raced with task_work_run(), *pprev == NULL/exited. */ raw_spin_lock_irqsave(&task->pi_lock, flags); - while ((work = READ_ONCE(*pprev))) { - if (!match(work, data)) + work = READ_ONCE(*pprev); + while (work) { + if (!match(work, data)) { pprev = &work->next; - else if (cmpxchg(pprev, work, work->next) == work) + work = READ_ONCE(*pprev); + } else if (try_cmpxchg(pprev, &work, work->next)) break; } raw_spin_unlock_irqrestore(&task->pi_lock, flags); @@ -151,16 +153,16 @@ void task_work_run(void) * work->func() can do task_work_add(), do not set * work_exited unless the list is empty. */ + work = READ_ONCE(task->task_works); do { head = NULL; - work = READ_ONCE(task->task_works); if (!work) { if (task->flags & PF_EXITING) head = &work_exited; else break; } - } while (cmpxchg(&task->task_works, work, head) != work); + } while (!try_cmpxchg(&task->task_works, &work, head)); if (!work) break; From 9a15193e23b780d1da77e3db18698beb0637897d Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 25 Aug 2022 16:56:03 +0200 Subject: [PATCH 34/81] smpboot: use atomic_try_cmpxchg in cpu_wait_death and cpu_report_death Use atomic_try_cmpxchg instead of atomic_cmpxchg (*ptr, old, new) == old in cpu_wait_death and cpu_report_death. x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg (and related move instruction in front of cmpxchg). Also, atomic_try_cmpxchg implicitly assigns old *ptr value to "old" when cmpxchg fails, enabling further code simplifications. No functional change intended. Link: https://lkml.kernel.org/r/20220825145603.5811-1-ubizjak@gmail.com Signed-off-by: Uros Bizjak Signed-off-by: Andrew Morton --- kernel/smpboot.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/kernel/smpboot.c b/kernel/smpboot.c index b9f54544e7499..2c7396da470c5 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -433,7 +433,7 @@ bool cpu_wait_death(unsigned int cpu, int seconds) /* The outgoing CPU will normally get done quite quickly. */ if (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) == CPU_DEAD) - goto update_state; + goto update_state_early; udelay(5); /* But if the outgoing CPU dawdles, wait increasingly long times. */ @@ -444,16 +444,17 @@ bool cpu_wait_death(unsigned int cpu, int seconds) break; sleep_jf = DIV_ROUND_UP(sleep_jf * 11, 10); } -update_state: +update_state_early: oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu)); +update_state: if (oldstate == CPU_DEAD) { /* Outgoing CPU died normally, update state. */ smp_mb(); /* atomic_read() before update. */ atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_POST_DEAD); } else { /* Outgoing CPU still hasn't died, set state accordingly. */ - if (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu), - oldstate, CPU_BROKEN) != oldstate) + if (!atomic_try_cmpxchg(&per_cpu(cpu_hotplug_state, cpu), + &oldstate, CPU_BROKEN)) goto update_state; ret = false; } @@ -475,14 +476,14 @@ bool cpu_report_death(void) int newstate; int cpu = smp_processor_id(); + oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu)); do { - oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu)); if (oldstate != CPU_BROKEN) newstate = CPU_DEAD; else newstate = CPU_DEAD_FROZEN; - } while (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu), - oldstate, newstate) != oldstate); + } while (!atomic_try_cmpxchg(&per_cpu(cpu_hotplug_state, cpu), + &oldstate, newstate)); return newstate == CPU_DEAD; } From f81259c6dbcefb255fa473090cd975f3827bca89 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Fri, 26 Aug 2022 15:33:35 +0800 Subject: [PATCH 35/81] fail_function: switch to memdup_user_nul() helper Use memdup_user_nul() helper instead of open-coding to simplify the code. Link: https://lkml.kernel.org/r/20220826073337.2085798-1-yangyingliang@huawei.com Signed-off-by: Yang Yingliang Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton --- kernel/fail_function.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/kernel/fail_function.c b/kernel/fail_function.c index 60dc825ecc2b3..03643e33e4c33 100644 --- a/kernel/fail_function.c +++ b/kernel/fail_function.c @@ -247,15 +247,11 @@ static ssize_t fei_write(struct file *file, const char __user *buffer, /* cut off if it is too long */ if (count > KSYM_NAME_LEN) count = KSYM_NAME_LEN; - buf = kmalloc(count + 1, GFP_KERNEL); - if (!buf) - return -ENOMEM; - if (copy_from_user(buf, buffer, count)) { - ret = -EFAULT; - goto out_free; - } - buf[count] = '\0'; + buf = memdup_user_nul(buffer, count); + if (IS_ERR(buf)) + return PTR_ERR(buf); + sym = strstrip(buf); mutex_lock(&fei_lock); @@ -308,7 +304,6 @@ static ssize_t fei_write(struct file *file, const char __user *buffer, } out: mutex_unlock(&fei_lock); -out_free: kfree(buf); return ret; } From cef9f5f866ad45a2dd64fed6e6b657043c2c6f17 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Fri, 26 Aug 2022 15:33:36 +0800 Subject: [PATCH 36/81] fail_function: refactor code of checking return value of register_kprobe() Refactor the error handling of register_kprobe() to improve readability. No functional change. Link: https://lkml.kernel.org/r/20220826073337.2085798-2-yangyingliang@huawei.com Signed-off-by: Yang Yingliang Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton --- kernel/fail_function.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/kernel/fail_function.c b/kernel/fail_function.c index 03643e33e4c33..893e8f9a91189 100644 --- a/kernel/fail_function.c +++ b/kernel/fail_function.c @@ -294,14 +294,13 @@ static ssize_t fei_write(struct file *file, const char __user *buffer, } ret = register_kprobe(&attr->kp); - if (!ret) - fei_debugfs_add_attr(attr); - if (ret < 0) + if (ret) { fei_attr_remove(attr); - else { - list_add_tail(&attr->list, &fei_attr_list); - ret = count; + goto out; } + fei_debugfs_add_attr(attr); + list_add_tail(&attr->list, &fei_attr_list); + ret = count; out: mutex_unlock(&fei_lock); kfree(buf); From d2e85432a2e0a6f31bd9489800f443228f020ed6 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Fri, 26 Aug 2022 15:33:37 +0800 Subject: [PATCH 37/81] fail_function: fix wrong use of fei_attr_remove() If register_kprobe() fails, the new attr is not added to the list yet, so it should call fei_attr_free() intstead. Link: https://lkml.kernel.org/r/20220826073337.2085798-3-yangyingliang@huawei.com Fixes: 4b1a29a7f542 ("error-injection: Support fault injection framework") Signed-off-by: Yang Yingliang Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton --- kernel/fail_function.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/fail_function.c b/kernel/fail_function.c index 893e8f9a91189..a7ccd2930c5f4 100644 --- a/kernel/fail_function.c +++ b/kernel/fail_function.c @@ -295,7 +295,7 @@ static ssize_t fei_write(struct file *file, const char __user *buffer, ret = register_kprobe(&attr->kp); if (ret) { - fei_attr_remove(attr); + fei_attr_free(attr); goto out; } fei_debugfs_add_attr(attr); From 199cda13534f4c676d7e4601665e971f4f0582c4 Mon Sep 17 00:00:00 2001 From: wuchi Date: Sat, 27 Aug 2022 15:11:16 +0800 Subject: [PATCH 38/81] initramfs: mark my_inptr as __initdata As my_inptr is only used in __init function unpack_to_rootfs(), mark it as __initdata to allow it be freed after boot. Link: https://lkml.kernel.org/r/20220827071116.83078-1-wuchi.zero@gmail.com Signed-off-by: wuchi Reviewed-by: David Disseldorp Cc: Alexander Viro Cc: Martin Wilck Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- init/initramfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/init/initramfs.c b/init/initramfs.c index 18229cfe8906b..2f5bfb7d76521 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -482,7 +482,7 @@ static long __init flush_buffer(void *bufv, unsigned long len) return origLen; } -static unsigned long my_inptr; /* index of next byte to be processed in inbuf */ +static unsigned long my_inptr __initdata; /* index of next byte to be processed in inbuf */ #include From d85a1bec8e8d552ab13163ca1874dcd82f3d1550 Mon Sep 17 00:00:00 2001 From: Hawkins Jiawei Date: Thu, 1 Sep 2022 00:09:34 +0800 Subject: [PATCH 39/81] ntfs: fix use-after-free in ntfs_attr_find() Patch series "ntfs: fix bugs about Attribute", v2. This patchset fixes three bugs relative to Attribute in record: Patch 1 adds a sanity check to ensure that, attrs_offset field in first mft record loading from disk is within bounds. Patch 2 moves the ATTR_RECORD's bounds checking earlier, to avoid dereferencing ATTR_RECORD before checking this ATTR_RECORD is within bounds. Patch 3 adds an overflow checking to avoid possible forever loop in ntfs_attr_find(). Without patch 1 and patch 2, the kernel triggersa KASAN use-after-free detection as reported by Syzkaller. Although one of patch 1 or patch 2 can fix this, we still need both of them. Because patch 1 fixes the root cause, and patch 2 not only fixes the direct cause, but also fixes the potential out-of-bounds bug. This patch (of 3): Syzkaller reported use-after-free read as follows: ================================================================== BUG: KASAN: use-after-free in ntfs_attr_find+0xc02/0xce0 fs/ntfs/attrib.c:597 Read of size 2 at addr ffff88807e352009 by task syz-executor153/3607 [...] Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 print_address_description mm/kasan/report.c:317 [inline] print_report.cold+0x2ba/0x719 mm/kasan/report.c:433 kasan_report+0xb1/0x1e0 mm/kasan/report.c:495 ntfs_attr_find+0xc02/0xce0 fs/ntfs/attrib.c:597 ntfs_attr_lookup+0x1056/0x2070 fs/ntfs/attrib.c:1193 ntfs_read_inode_mount+0x89a/0x2580 fs/ntfs/inode.c:1845 ntfs_fill_super+0x1799/0x9320 fs/ntfs/super.c:2854 mount_bdev+0x34d/0x410 fs/super.c:1400 legacy_get_tree+0x105/0x220 fs/fs_context.c:610 vfs_get_tree+0x89/0x2f0 fs/super.c:1530 do_new_mount fs/namespace.c:3040 [inline] path_mount+0x1326/0x1e20 fs/namespace.c:3370 do_mount fs/namespace.c:3383 [inline] __do_sys_mount fs/namespace.c:3591 [inline] __se_sys_mount fs/namespace.c:3568 [inline] __x64_sys_mount+0x27f/0x300 fs/namespace.c:3568 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd [...] The buggy address belongs to the physical page: page:ffffea0001f8d400 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x7e350 head:ffffea0001f8d400 order:3 compound_mapcount:0 compound_pincount:0 flags: 0xfff00000010200(slab|head|node=0|zone=1|lastcpupid=0x7ff) raw: 00fff00000010200 0000000000000000 dead000000000122 ffff888011842140 raw: 0000000000000000 0000000000040004 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff88807e351f00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff88807e351f80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc >ffff88807e352000: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff88807e352080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88807e352100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ================================================================== Kernel will loads $MFT/$DATA's first mft record in ntfs_read_inode_mount(). Yet the problem is that after loading, kernel doesn't check whether attrs_offset field is a valid value. To be more specific, if attrs_offset field is larger than bytes_allocated field, then it may trigger the out-of-bounds read bug(reported as use-after-free bug) in ntfs_attr_find(), when kernel tries to access the corresponding mft record's attribute. This patch solves it by adding the sanity check between attrs_offset field and bytes_allocated field, after loading the first mft record. Link: https://lkml.kernel.org/r/20220831160935.3409-1-yin31149@gmail.com Link: https://lkml.kernel.org/r/20220831160935.3409-2-yin31149@gmail.com Signed-off-by: Hawkins Jiawei Cc: Anton Altaparmakov Cc: ChenXiaoSong Cc: syzkaller-bugs Cc: Dan Carpenter Signed-off-by: Andrew Morton --- fs/ntfs/inode.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index db0f1995aedd1..08c659332e26b 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -1829,6 +1829,13 @@ int ntfs_read_inode_mount(struct inode *vi) goto err_out; } + /* Sanity check offset to the first attribute */ + if (le16_to_cpu(m->attrs_offset) >= le32_to_cpu(m->bytes_allocated)) { + ntfs_error(sb, "Incorrect mft offset to the first attribute %u in superblock.", + le16_to_cpu(m->attrs_offset)); + goto err_out; + } + /* Need this to sanity check attribute list references to $MFT. */ vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number); From 36a4d82dddbbd421d2b8e79e1cab68c8126d5075 Mon Sep 17 00:00:00 2001 From: Hawkins Jiawei Date: Thu, 1 Sep 2022 00:09:36 +0800 Subject: [PATCH 40/81] ntfs: fix out-of-bounds read in ntfs_attr_find() Kernel iterates over ATTR_RECORDs in mft record in ntfs_attr_find(). To ensure access on these ATTR_RECORDs are within bounds, kernel will do some checking during iteration. The problem is that during checking whether ATTR_RECORD's name is within bounds, kernel will dereferences the ATTR_RECORD name_offset field, before checking this ATTR_RECORD strcture is within bounds. This problem may result out-of-bounds read in ntfs_attr_find(), reported by Syzkaller: ================================================================== BUG: KASAN: use-after-free in ntfs_attr_find+0xc02/0xce0 fs/ntfs/attrib.c:597 Read of size 2 at addr ffff88807e352009 by task syz-executor153/3607 [...] Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 print_address_description mm/kasan/report.c:317 [inline] print_report.cold+0x2ba/0x719 mm/kasan/report.c:433 kasan_report+0xb1/0x1e0 mm/kasan/report.c:495 ntfs_attr_find+0xc02/0xce0 fs/ntfs/attrib.c:597 ntfs_attr_lookup+0x1056/0x2070 fs/ntfs/attrib.c:1193 ntfs_read_inode_mount+0x89a/0x2580 fs/ntfs/inode.c:1845 ntfs_fill_super+0x1799/0x9320 fs/ntfs/super.c:2854 mount_bdev+0x34d/0x410 fs/super.c:1400 legacy_get_tree+0x105/0x220 fs/fs_context.c:610 vfs_get_tree+0x89/0x2f0 fs/super.c:1530 do_new_mount fs/namespace.c:3040 [inline] path_mount+0x1326/0x1e20 fs/namespace.c:3370 do_mount fs/namespace.c:3383 [inline] __do_sys_mount fs/namespace.c:3591 [inline] __se_sys_mount fs/namespace.c:3568 [inline] __x64_sys_mount+0x27f/0x300 fs/namespace.c:3568 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd [...] The buggy address belongs to the physical page: page:ffffea0001f8d400 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x7e350 head:ffffea0001f8d400 order:3 compound_mapcount:0 compound_pincount:0 flags: 0xfff00000010200(slab|head|node=0|zone=1|lastcpupid=0x7ff) raw: 00fff00000010200 0000000000000000 dead000000000122 ffff888011842140 raw: 0000000000000000 0000000000040004 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff88807e351f00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff88807e351f80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc >ffff88807e352000: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff88807e352080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88807e352100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ================================================================== This patch solves it by moving the ATTR_RECORD strcture's bounds checking earlier, then checking whether ATTR_RECORD's name is within bounds. What's more, this patch also add some comments to improve its maintainability. Link: https://lkml.kernel.org/r/20220831160935.3409-3-yin31149@gmail.com Link: https://lore.kernel.org/all/1636796c-c85e-7f47-e96f-e074fee3c7d3@huawei.com/ Link: https://groups.google.com/g/syzkaller-bugs/c/t_XdeKPGTR4/m/LECAuIGcBgAJ Signed-off-by: chenxiaosong (A) Signed-off-by: Dan Carpenter Signed-off-by: Hawkins Jiawei Reported-by: syzbot+5f8dcabe4a3b2c51c607@syzkaller.appspotmail.com Tested-by: syzbot+5f8dcabe4a3b2c51c607@syzkaller.appspotmail.com Cc: Anton Altaparmakov Cc: syzkaller-bugs Signed-off-by: Andrew Morton --- fs/ntfs/attrib.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c index 52615e6090e1c..cec4be2a2d239 100644 --- a/fs/ntfs/attrib.c +++ b/fs/ntfs/attrib.c @@ -594,11 +594,23 @@ static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name, for (;; a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))) { u8 *mrec_end = (u8 *)ctx->mrec + le32_to_cpu(ctx->mrec->bytes_allocated); - u8 *name_end = (u8 *)a + le16_to_cpu(a->name_offset) + - a->name_length * sizeof(ntfschar); - if ((u8*)a < (u8*)ctx->mrec || (u8*)a > mrec_end || - name_end > mrec_end) + u8 *name_end; + + /* check whether ATTR_RECORD wrap */ + if ((u8 *)a < (u8 *)ctx->mrec) + break; + + /* check whether Attribute Record Header is within bounds */ + if ((u8 *)a > mrec_end || + (u8 *)a + sizeof(ATTR_RECORD) > mrec_end) break; + + /* check whether ATTR_RECORD's name is within bounds */ + name_end = (u8 *)a + le16_to_cpu(a->name_offset) + + a->name_length * sizeof(ntfschar); + if (name_end > mrec_end) + break; + ctx->attr = a; if (unlikely(le32_to_cpu(a->type) > le32_to_cpu(type) || a->type == AT_END)) From 63095f4f3af59322bea984a6ae44337439348fe0 Mon Sep 17 00:00:00 2001 From: Hawkins Jiawei Date: Thu, 1 Sep 2022 00:09:38 +0800 Subject: [PATCH 41/81] ntfs: check overflow when iterating ATTR_RECORDs Kernel iterates over ATTR_RECORDs in mft record in ntfs_attr_find(). Because the ATTR_RECORDs are next to each other, kernel can get the next ATTR_RECORD from end address of current ATTR_RECORD, through current ATTR_RECORD length field. The problem is that during iteration, when kernel calculates the end address of current ATTR_RECORD, kernel may trigger an integer overflow bug in executing `a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))`. This may wrap, leading to a forever iteration on 32bit systems. This patch solves it by adding some checks on calculating end address of current ATTR_RECORD during iteration. Link: https://lkml.kernel.org/r/20220831160935.3409-4-yin31149@gmail.com Link: https://lore.kernel.org/all/20220827105842.GM2030@kadam/ Signed-off-by: Hawkins Jiawei Suggested-by: Dan Carpenter Cc: Anton Altaparmakov Cc: chenxiaosong (A) Cc: syzkaller-bugs Signed-off-by: Andrew Morton --- fs/ntfs/attrib.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c index cec4be2a2d239..a3865bc4a0c65 100644 --- a/fs/ntfs/attrib.c +++ b/fs/ntfs/attrib.c @@ -617,6 +617,14 @@ static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name, return -ENOENT; if (unlikely(!a->length)) break; + + /* check whether ATTR_RECORD's length wrap */ + if ((u8 *)a + le32_to_cpu(a->length) < (u8 *)a) + break; + /* check whether ATTR_RECORD's length is within bounds */ + if ((u8 *)a + le32_to_cpu(a->length) > mrec_end) + break; + if (a->type != type) continue; /* From 35783ccbe519b33f6652b2d7aafcfc82f10b1a1b Mon Sep 17 00:00:00 2001 From: wuchi Date: Thu, 1 Sep 2022 08:31:21 +0800 Subject: [PATCH 42/81] kernel/profile.c: simplify duplicated code in profile_setup() The code to parse option string "schedule/sleep/kvm" of cmdline in function profile_setup is redundant, so simplify that. Link: https://lkml.kernel.org/r/20220901003121.53597-1-wuchi.zero@gmail.com Signed-off-by: wuchi Reviewed-by: Andrew Morton Cc: Christoph Hellwig Signed-off-by: Andrew Morton --- kernel/profile.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/kernel/profile.c b/kernel/profile.c index 7ea01ba30e757..8a77769bc4b4c 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -59,43 +59,39 @@ int profile_setup(char *str) static const char schedstr[] = "schedule"; static const char sleepstr[] = "sleep"; static const char kvmstr[] = "kvm"; + const char *select = NULL; int par; if (!strncmp(str, sleepstr, strlen(sleepstr))) { #ifdef CONFIG_SCHEDSTATS force_schedstat_enabled(); prof_on = SLEEP_PROFILING; - if (str[strlen(sleepstr)] == ',') - str += strlen(sleepstr) + 1; - if (get_option(&str, &par)) - prof_shift = clamp(par, 0, BITS_PER_LONG - 1); - pr_info("kernel sleep profiling enabled (shift: %u)\n", - prof_shift); + select = sleepstr; #else pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n"); #endif /* CONFIG_SCHEDSTATS */ } else if (!strncmp(str, schedstr, strlen(schedstr))) { prof_on = SCHED_PROFILING; - if (str[strlen(schedstr)] == ',') - str += strlen(schedstr) + 1; - if (get_option(&str, &par)) - prof_shift = clamp(par, 0, BITS_PER_LONG - 1); - pr_info("kernel schedule profiling enabled (shift: %u)\n", - prof_shift); + select = schedstr; } else if (!strncmp(str, kvmstr, strlen(kvmstr))) { prof_on = KVM_PROFILING; - if (str[strlen(kvmstr)] == ',') - str += strlen(kvmstr) + 1; - if (get_option(&str, &par)) - prof_shift = clamp(par, 0, BITS_PER_LONG - 1); - pr_info("kernel KVM profiling enabled (shift: %u)\n", - prof_shift); + select = kvmstr; } else if (get_option(&str, &par)) { prof_shift = clamp(par, 0, BITS_PER_LONG - 1); prof_on = CPU_PROFILING; pr_info("kernel profiling enabled (shift: %u)\n", prof_shift); } + + if (select) { + if (str[strlen(select)] == ',') + str += strlen(select) + 1; + if (get_option(&str, &par)) + prof_shift = clamp(par, 0, BITS_PER_LONG - 1); + pr_info("kernel %s profiling enabled (shift: %u)\n", + select, prof_shift); + } + return 1; } __setup("profile=", profile_setup); From 7b9e664beb237d90bc600f117668227af5ce53ae Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 30 Aug 2022 20:27:13 +0300 Subject: [PATCH 43/81] asm-generic: make parameter types consistent in _unaligned_be48() There is a convention to use internal kernel types, so replace __u8 by u8. Link: https://lkml.kernel.org/r/20220830172713.43686-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Reviewed-by: Keith Busch Cc: Arnd Bergmann Signed-off-by: Andrew Morton --- include/asm-generic/unaligned.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-generic/unaligned.h b/include/asm-generic/unaligned.h index df30f11b4a460..699650f819706 100644 --- a/include/asm-generic/unaligned.h +++ b/include/asm-generic/unaligned.h @@ -126,7 +126,7 @@ static inline void put_unaligned_le24(const u32 val, void *p) __put_unaligned_le24(val, p); } -static inline void __put_unaligned_be48(const u64 val, __u8 *p) +static inline void __put_unaligned_be48(const u64 val, u8 *p) { *p++ = val >> 40; *p++ = val >> 32; From 8ea0114eda0c1c85f8f01922ac8fc1e489a61129 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Fri, 2 Sep 2022 13:19:23 +0200 Subject: [PATCH 44/81] checkpatch: handle FILE pointer type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When using a "FILE *" type, checkpatch considers this an error: ERROR: need consistent spacing around '*' (ctx:WxV) #32: FILE: f.c:8: +static void a(FILE *const b) ^ Fix this by explicitly defining "FILE" as a common type. This is useful for user space patches. With this patch, we now get: <_>WS( ) <_>IDENT(static) <_>WS( ) <_>DECLARE(void ) <_>FUNC(a) PAREN('(') <_>DECLARE(FILE *const ) <_>IDENT(b) <_>PAREN(')') -> V <_>WS( ) 32 > . static void a(FILE *const b) 32 > EEVVVVVVVTTTTTVNTTTTTTTTTTTTVVV 32 > ______________________________ Link: https://lkml.kernel.org/r/20220902111923.1488671-1-mic@digikod.net Link: https://lore.kernel.org/r/20220902111923.1488671-1-mic@digikod.net Signed-off-by: Mickaël Salaün Acked-by: Joe Perches Cc: Andy Whitcroft Cc: Dwaipayan Ray Cc: Lukas Bulwahn Signed-off-by: Andrew Morton --- scripts/checkpatch.pl | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 9ff219e0a9d56..18effbe1fe908 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -576,10 +576,14 @@ sub hash_show_words { (?:__)?(?:u|s|be|le)(?:8|16|32|64)| atomic_t )}; +our $typeStdioTypedefs = qr{(?x: + FILE +)}; our $typeTypedefs = qr{(?x: $typeC99Typedefs\b| $typeOtherOSTypedefs\b| - $typeKernelTypedefs\b + $typeKernelTypedefs\b| + $typeStdioTypedefs\b )}; our $zero_initializer = qr{(?:(?:0[xX])?0+$Int_type?|NULL|false)\b}; From bfca3dd3d0680fc2fc7f659a152234afbac26e4d Mon Sep 17 00:00:00 2001 From: Petr Vorel Date: Thu, 1 Sep 2022 21:44:03 +0200 Subject: [PATCH 45/81] kernel/utsname_sysctl.c: print kernel arch Print the machine hardware name (UTS_MACHINE) in /proc/sys/kernel/arch. This helps people who debug kernel with initramfs with minimal environment (i.e. without coreutils or even busybox) or allow to open sysfs file instead of run 'uname -m' in high level languages. Link: https://lkml.kernel.org/r/20220901194403.3819-1-pvorel@suse.cz Signed-off-by: Petr Vorel Acked-by: Greg Kroah-Hartman Cc: David Sterba Cc: "Eric W . Biederman" Cc: Rafael J. Wysocki Signed-off-by: Andrew Morton --- Documentation/admin-guide/sysctl/kernel.rst | 5 +++++ kernel/utsname_sysctl.c | 7 +++++++ 2 files changed, 12 insertions(+) diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index ee6572b1edada..bbaa851946956 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -65,6 +65,11 @@ combining the following values: 4 s3_beep = ======= +arch +==== + +The machine hardware name, the same output as ``uname -m`` +(e.g. ``x86_64`` or ``aarch64``). auto_msgmni =========== diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 4ca61d49885b6..7ffdd2cd5ff9b 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -73,6 +73,13 @@ static DEFINE_CTL_TABLE_POLL(hostname_poll); static DEFINE_CTL_TABLE_POLL(domainname_poll); static struct ctl_table uts_kern_table[] = { + { + .procname = "arch", + .data = init_uts_ns.name.machine, + .maxlen = sizeof(init_uts_ns.name.machine), + .mode = 0444, + .proc_handler = proc_do_uts_string, + }, { .procname = "ostype", .data = init_uts_ns.name.sysname, From b814751175470b00969a317bf3192260750f9455 Mon Sep 17 00:00:00 2001 From: wuchi Date: Sat, 3 Sep 2022 21:52:33 +0800 Subject: [PATCH 46/81] latencytop: use the last element of latency_record of system In account_global_scheduler_latency(), when we don't find the matching latency_record we try to select one which is unused in latency_record[MAXLR], but the condition will skip the last one. if (i >= MAXLR-1) Fix that. Link: https://lkml.kernel.org/r/20220903135233.5225-1-wuchi.zero@gmail.com Signed-off-by: wuchi Reviewed-by: Andrew Morton Cc: Alexander Viro Cc: Luis Chamberlain Signed-off-by: Andrew Morton --- kernel/latencytop.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 76166df011a4d..781249098cb6e 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -112,7 +112,7 @@ static void __sched account_global_scheduler_latency(struct task_struct *tsk, struct latency_record *lat) { - int firstnonnull = MAXLR + 1; + int firstnonnull = MAXLR; int i; /* skip kernel threads for now */ @@ -150,7 +150,7 @@ account_global_scheduler_latency(struct task_struct *tsk, } i = firstnonnull; - if (i >= MAXLR - 1) + if (i >= MAXLR) return; /* Allocted a new one: */ From 42dd3a50c4489eda8cfffb014d80700e2efbc2a3 Mon Sep 17 00:00:00 2001 From: Gang He Date: Mon, 26 Sep 2022 19:59:25 -0700 Subject: [PATCH 47/81] ocfs2: reflink deadlock when clone file to the same directory simultaneously Running reflink from multiple nodes simultaneously to clone a file to the same directory probably triggers a deadlock issue. For example, there is a three node ocfs2 cluster, each node mounts the ocfs2 file system to /mnt/shared, and run the reflink command from each node repeatedly, like reflink "/mnt/shared/test" \ "/mnt/shared/.snapshots/test.`date +%m%d%H%M%S`.`hostname`" then, reflink command process will be hung on each node, and you can't list this file system directory. The problematic reflink command process is blocked at one node, task:reflink state:D stack: 0 pid: 1283 ppid: 4154 Call Trace: __schedule+0x2fd/0x750 schedule+0x2f/0xa0 schedule_timeout+0x1cc/0x310 ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user] ? 0xffffffffc0e3e000 wait_for_completion+0xba/0x140 ? wake_up_q+0xa0/0xa0 __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2] ? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2] ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2] ocfs2_init_security_and_acl+0xbe/0x1d0 [ocfs2] ocfs2_reflink+0x436/0x4c0 [ocfs2] ? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2] ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2] ocfs2_ioctl+0x25e/0x670 [ocfs2] do_vfs_ioctl+0xa0/0x680 ksys_ioctl+0x70/0x80 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x5b/0x1e0 The other reflink command processes are blocked at other nodes, task:reflink state:D stack: 0 pid:29759 ppid: 4088 Call Trace: __schedule+0x2fd/0x750 schedule+0x2f/0xa0 schedule_timeout+0x1cc/0x310 ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user] ? 0xffffffffc0b19000 wait_for_completion+0xba/0x140 ? wake_up_q+0xa0/0xa0 __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2] ? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2] ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2] ocfs2_mv_orphaned_inode_to_new+0x87/0x7e0 [ocfs2] ocfs2_reflink+0x335/0x4c0 [ocfs2] ? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2] ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2] ocfs2_ioctl+0x25e/0x670 [ocfs2] do_vfs_ioctl+0xa0/0x680 ksys_ioctl+0x70/0x80 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x5b/0x1e0 or task:reflink state:D stack: 0 pid:18465 ppid: 4156 Call Trace: __schedule+0x302/0x940 ? usleep_range+0x80/0x80 schedule+0x46/0xb0 schedule_timeout+0xff/0x140 ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user] ? 0xffffffffc0c3b000 __wait_for_common+0xb9/0x170 __ocfs2_cluster_lock.constprop.0+0x1d6/0x860 [ocfs2] ? ocfs2_wait_for_recovery+0x49/0xd0 [ocfs2] ? ocfs2_inode_lock_full_nested+0x30f/0xa50 [ocfs2] ocfs2_inode_lock_full_nested+0x30f/0xa50 [ocfs2] ocfs2_inode_lock_tracker+0xf2/0x2b0 [ocfs2] ? dput+0x32/0x2f0 ocfs2_permission+0x45/0xe0 [ocfs2] inode_permission+0xcc/0x170 link_path_walk.part.0.constprop.0+0x2a2/0x380 ? path_init+0x2c1/0x3f0 path_parentat+0x3c/0x90 filename_parentat+0xc1/0x1d0 ? filename_lookup+0x138/0x1c0 filename_create+0x43/0x160 ocfs2_reflink_ioctl+0xe6/0x380 [ocfs2] ocfs2_ioctl+0x1ea/0x2c0 [ocfs2] ? do_sys_openat2+0x81/0x150 __x64_sys_ioctl+0x82/0xb0 do_syscall_64+0x61/0xb0 The deadlock is caused by multiple acquiring the destination directory inode dlm lock in ocfs2_reflink function, we should acquire this directory inode dlm lock at the beginning, and hold this dlm lock until end of the function. Link: https://lkml.kernel.org/r/20210729110230.18983-1-ghe@suse.com Signed-off-by: Gang He Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/namei.c | 32 +++++++++++++------------------- fs/ocfs2/namei.h | 2 ++ fs/ocfs2/refcounttree.c | 15 +++++++++++---- fs/ocfs2/xattr.c | 12 +----------- fs/ocfs2/xattr.h | 1 + 5 files changed, 28 insertions(+), 34 deletions(-) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 961d1cf54388e..adc8b81e59bd3 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -2490,6 +2490,7 @@ static int ocfs2_prep_new_orphaned_file(struct inode *dir, } int ocfs2_create_inode_in_orphan(struct inode *dir, + struct buffer_head **dir_bh, int mode, struct inode **new_inode) { @@ -2598,13 +2599,16 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, brelse(new_di_bh); - if (!status) - *new_inode = inode; - ocfs2_free_dir_lookup_result(&orphan_insert); - ocfs2_inode_unlock(dir, 1); - brelse(parent_di_bh); + if (!status) { + *new_inode = inode; + *dir_bh = parent_di_bh; + } else { + ocfs2_inode_unlock(dir, 1); + brelse(parent_di_bh); + } + return status; } @@ -2761,11 +2765,11 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, } int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, + struct buffer_head *dir_bh, struct inode *inode, struct dentry *dentry) { int status = 0; - struct buffer_head *parent_di_bh = NULL; handle_t *handle = NULL; struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); struct ocfs2_dinode *dir_di, *di; @@ -2779,14 +2783,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, (unsigned long long)OCFS2_I(dir)->ip_blkno, (unsigned long long)OCFS2_I(inode)->ip_blkno); - status = ocfs2_inode_lock(dir, &parent_di_bh, 1); - if (status < 0) { - if (status != -ENOENT) - mlog_errno(status); - return status; - } - - dir_di = (struct ocfs2_dinode *) parent_di_bh->b_data; + dir_di = (struct ocfs2_dinode *) dir_bh->b_data; if (!dir_di->i_links_count) { /* can't make a file in a deleted directory. */ status = -ENOENT; @@ -2799,7 +2796,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, goto leave; /* get a spot inside the dir. */ - status = ocfs2_prepare_dir_for_insert(osb, dir, parent_di_bh, + status = ocfs2_prepare_dir_for_insert(osb, dir, dir_bh, dentry->d_name.name, dentry->d_name.len, &lookup); if (status < 0) { @@ -2863,7 +2860,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, ocfs2_journal_dirty(handle, di_bh); status = ocfs2_add_entry(handle, dentry, inode, - OCFS2_I(inode)->ip_blkno, parent_di_bh, + OCFS2_I(inode)->ip_blkno, dir_bh, &lookup); if (status < 0) { mlog_errno(status); @@ -2887,10 +2884,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, iput(orphan_dir_inode); leave: - ocfs2_inode_unlock(dir, 1); - brelse(di_bh); - brelse(parent_di_bh); brelse(orphan_dir_bh); ocfs2_free_dir_lookup_result(&lookup); diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h index 9cc891eb874e0..03a2c526e2c1b 100644 --- a/fs/ocfs2/namei.h +++ b/fs/ocfs2/namei.h @@ -24,6 +24,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, struct buffer_head *orphan_dir_bh, bool dio); int ocfs2_create_inode_in_orphan(struct inode *dir, + struct buffer_head **dir_bh, int mode, struct inode **new_inode); int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, @@ -32,6 +33,7 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, struct inode *inode, struct buffer_head *di_bh, int update_isize, loff_t end); int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, + struct buffer_head *dir_bh, struct inode *new_inode, struct dentry *new_dentry); diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 1358981e80a36..b2ebaec86ddca 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4222,7 +4222,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, { int error, had_lock; struct inode *inode = d_inode(old_dentry); - struct buffer_head *old_bh = NULL; + struct buffer_head *old_bh = NULL, *dir_bh = NULL; struct inode *new_orphan_inode = NULL; struct ocfs2_lock_holder oh; @@ -4230,7 +4230,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, return -EOPNOTSUPP; - error = ocfs2_create_inode_in_orphan(dir, inode->i_mode, + error = ocfs2_create_inode_in_orphan(dir, &dir_bh, inode->i_mode, &new_orphan_inode); if (error) { mlog_errno(error); @@ -4276,13 +4276,15 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, /* If the security isn't preserved, we need to re-initialize them. */ if (!preserve) { - error = ocfs2_init_security_and_acl(dir, new_orphan_inode, + error = ocfs2_init_security_and_acl(dir, dir_bh, + new_orphan_inode, &new_dentry->d_name); if (error) mlog_errno(error); } if (!error) { - error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode, + error = ocfs2_mv_orphaned_inode_to_new(dir, dir_bh, + new_orphan_inode, new_dentry); if (error) mlog_errno(error); @@ -4300,6 +4302,11 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, iput(new_orphan_inode); } + if (dir_bh) { + ocfs2_inode_unlock(dir, 1); + brelse(dir_bh); + } + return error; } diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 95d0611c5fc7d..3f23e3a5018ce 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -7203,16 +7203,13 @@ int ocfs2_reflink_xattrs(struct inode *old_inode, /* * Initialize security and acl for a already created inode. * Used for reflink a non-preserve-security file. - * - * It uses common api like ocfs2_xattr_set, so the caller - * must not hold any lock expect i_rwsem. */ int ocfs2_init_security_and_acl(struct inode *dir, + struct buffer_head *dir_bh, struct inode *inode, const struct qstr *qstr) { int ret = 0; - struct buffer_head *dir_bh = NULL; ret = ocfs2_init_security_get(inode, dir, qstr, NULL); if (ret) { @@ -7220,17 +7217,10 @@ int ocfs2_init_security_and_acl(struct inode *dir, goto leave; } - ret = ocfs2_inode_lock(dir, &dir_bh, 0); - if (ret) { - mlog_errno(ret); - goto leave; - } ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL); if (ret) mlog_errno(ret); - ocfs2_inode_unlock(dir, 0); - brelse(dir_bh); leave: return ret; } diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index 00308b57f64f1..b27fd8ba00196 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -83,6 +83,7 @@ int ocfs2_reflink_xattrs(struct inode *old_inode, struct buffer_head *new_bh, bool preserve_security); int ocfs2_init_security_and_acl(struct inode *dir, + struct buffer_head *dir_bh, struct inode *inode, const struct qstr *qstr); #endif /* OCFS2_XATTR_H */ From 1b2855130cbd4e8b90f22cb55e5e6ed4d1ca9540 Mon Sep 17 00:00:00 2001 From: Wangyan Date: Mon, 26 Sep 2022 19:59:25 -0700 Subject: [PATCH 48/81] ocfs2: clear links count in ocfs2_mknod() if an error occurs In this condition, the inode can not be wiped when error happened. ocfs2_mkdir() ->ocfs2_mknod() ->ocfs2_mknod_locked() ->__ocfs2_mknod_locked() ->ocfs2_set_links_count() // i_links_count is 2 -> ... // an error accrue, goto roll_back or leave. ->ocfs2_commit_trans() ->iput(inode) ->evict() ->ocfs2_evict_inode() ->ocfs2_delete_inode() ->ocfs2_inode_lock() ->ocfs2_inode_lock_update() ->ocfs2_refresh_inode() ->set_nlink(); // inode->i_nlink is 2 now. /* if wipe is 0, it will goto bail_unlock_inode */ ->ocfs2_query_inode_wipe() ->if (inode->i_nlink) return; // wipe is 0. /* inode can not be wiped */ ->ocfs2_wipe_inode() So, we need clear links before the transaction committed. Link: http://lkml.kernel.org/r/d8147c41-fb2b-bdf7-b660-1f3c8448c33f@huawei.com Signed-off-by: Yan Wang Reviewed-by: Jun Piao Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Signed-off-by: Andrew Morton --- fs/ocfs2/namei.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index adc8b81e59bd3..5e2b04dec2e6a 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -454,8 +454,12 @@ static int ocfs2_mknod(struct user_namespace *mnt_userns, leave: if (status < 0 && did_quota_inode) dquot_free_inode(inode); - if (handle) + if (handle) { + if (status < 0 && new_fe_bh != NULL) + ocfs2_set_links_count((struct ocfs2_dinode *) + new_fe_bh->b_data, 0); ocfs2_commit_trans(osb, handle); + } ocfs2_inode_unlock(dir, 1); if (did_block_signals) @@ -599,6 +603,8 @@ static int __ocfs2_mknod_locked(struct inode *dir, leave: if (status < 0) { if (*new_fe_bh) { + if (fe) + ocfs2_set_links_count(fe, 0); brelse(*new_fe_bh); *new_fe_bh = NULL; } @@ -2028,8 +2034,12 @@ static int ocfs2_symlink(struct user_namespace *mnt_userns, ocfs2_clusters_to_bytes(osb->sb, 1)); if (status < 0 && did_quota_inode) dquot_free_inode(inode); - if (handle) + if (handle) { + if (status < 0 && new_fe_bh != NULL) + ocfs2_set_links_count((struct ocfs2_dinode *) + new_fe_bh->b_data, 0); ocfs2_commit_trans(osb, handle); + } ocfs2_inode_unlock(dir, 1); if (did_block_signals) From 43baeae1b8be30755f193fd2c359398858fada09 Mon Sep 17 00:00:00 2001 From: Wangyan Date: Mon, 26 Sep 2022 19:59:25 -0700 Subject: [PATCH 49/81] ocfs2: fix ocfs2 corrupt when iputting an inode In this condition, it will cause an bug on error. ocfs2_mkdir() ->ocfs2_mknod() ->ocfs2_mknod_locked() ->__ocfs2_mknod_locked() //Assume inode->i_generation is genN. ->inode->i_generation = osb->s_next_generation++; // The inode lockres has been initialized. ->ocfs2_populate_inode() ->ocfs2_create_new_inode_locks() ->An error happened, returned value is non-zero // free the start_bit x in bg_blkno ->ocfs2_free_suballoc_bits() ->... /* Another process execute mkdir success in this place, and it occupied the start_bit x in bg_blkno which has been freed before. Its inode->i_generation is genN + 1 */ ->iput(inode) ->evict() ->ocfs2_evict_inode() ->ocfs2_delete_inode() ->ocfs2_inode_lock() ->ocfs2_inode_lock_update() /* Bug on here, genN != genN + 1 */ ->mlog_bug_on_msg(inode->i_generation != le32_to_cpu(fe->i_generation)) So, we need not to reclaim the inode when the inode->ip_inode_lockres has been initialized. It will be freed in iput(). Link: http://lkml.kernel.org/r/ef080ca3-5d74-e276-17a1-d9e7c7e662c9@huawei.com Fixes: b1529a41f777 ("ocfs2: should reclaim the inode if '__ocfs2_mknod_locked' returns an error") Signed-off-by: Yan Wang Reviewed-by: Jun Piao Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Signed-off-by: Andrew Morton --- fs/ocfs2/namei.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 5e2b04dec2e6a..b98e1531dfcb5 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -641,7 +641,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, status = __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh, parent_fe_bh, handle, inode_ac, fe_blkno, suballoc_loc, suballoc_bit); - if (status < 0) { + if (status < 0 && !(OCFS2_I(inode)->ip_inode_lockres.l_flags & + OCFS2_LOCK_INITIALIZED)) { u64 bg_blkno = ocfs2_which_suballoc_group(fe_blkno, suballoc_bit); int tmp = ocfs2_free_suballoc_bits(handle, inode_ac->ac_inode, inode_ac->ac_bh, suballoc_bit, bg_blkno, 1); From 842ac8e86cb44a539768ead8ad8eb48ca3e71e1e Mon Sep 17 00:00:00 2001 From: Andrew Halaney Date: Mon, 26 Sep 2022 19:59:25 -0700 Subject: [PATCH 50/81] init/main.c: silence some -Wunused-parameter warnings There are a bunch of callbacks with unused arguments, go ahead and silence those so "make KCFLAGS=-W init/main.o" is a little quieter. Here's a little sample: init/main.c:182:43: warning: unused parameter 'str' [-Wunused-parameter] static int __init set_reset_devices(char *str) Link: https://lkml.kernel.org/r/20210519162341.1275452-1-ahalaney@redhat.com Signed-off-by: Andrew Halaney Cc: Rasmus Villemoes Signed-off-by: Andrew Morton --- init/main.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/init/main.c b/init/main.c index a45f9eca40af0..24f9badf78b6d 100644 --- a/init/main.c +++ b/init/main.c @@ -182,7 +182,7 @@ EXPORT_SYMBOL_GPL(static_key_initialized); unsigned int reset_devices; EXPORT_SYMBOL(reset_devices); -static int __init set_reset_devices(char *str) +static int __init set_reset_devices(char *str __always_unused) { reset_devices = 1; return 1; @@ -232,13 +232,13 @@ static bool __init obsolete_checksetup(char *line) unsigned long loops_per_jiffy = (1<<12); EXPORT_SYMBOL(loops_per_jiffy); -static int __init debug_kernel(char *str) +static int __init debug_kernel(char *str __always_unused) { console_loglevel = CONSOLE_LOGLEVEL_DEBUG; return 0; } -static int __init quiet_kernel(char *str) +static int __init quiet_kernel(char *str __always_unused) { console_loglevel = CONSOLE_LOGLEVEL_QUIET; return 0; @@ -475,7 +475,7 @@ static void __init setup_boot_config(void) get_boot_config_from_initrd(NULL); } -static int __init warn_bootconfig(char *str) +static int __init warn_bootconfig(char *str __always_unused) { pr_warn("WARNING: 'bootconfig' found on the kernel command line but CONFIG_BOOT_CONFIG is not set.\n"); return 0; @@ -504,7 +504,8 @@ static void __init repair_env_string(char *param, char *val) /* Anything after -- gets handed straight to init. */ static int __init set_init_arg(char *param, char *val, - const char *unused, void *arg) + const char *unused __always_unused, + void *arg __always_unused) { unsigned int i; @@ -529,7 +530,8 @@ static int __init set_init_arg(char *param, char *val, * unused parameters (modprobe will find them in /proc/cmdline). */ static int __init unknown_bootoption(char *param, char *val, - const char *unused, void *arg) + const char *unused __always_unused, + void *arg __always_unused) { size_t len = strlen(param); @@ -729,7 +731,8 @@ noinline void __ref rest_init(void) /* Check for early params. */ static int __init do_early_param(char *param, char *val, - const char *unused, void *arg) + const char *unused __always_unused, + void *arg __always_unused) { const struct obs_kernel_param *p; @@ -1348,8 +1351,10 @@ static const char *initcall_level_names[] __initdata = { "late", }; -static int __init ignore_unknown_bootoption(char *param, char *val, - const char *unused, void *arg) +static int __init ignore_unknown_bootoption(char *param __always_unused, + char *val __always_unused, + const char *unused __always_unused, + void *arg __always_unused) { return 0; } @@ -1500,7 +1505,7 @@ void __weak free_initmem(void) free_initmem_default(POISON_FREE_INITMEM); } -static int __ref kernel_init(void *unused) +static int __ref kernel_init(void *unused __always_unused) { int ret; From 78e26af38ee499fc53d2597a5e581a62e46a2c7d Mon Sep 17 00:00:00 2001 From: wuchi Date: Sat, 11 Jun 2022 21:06:34 +0800 Subject: [PATCH 51/81] lib/debugobjects: fix stat count and optimize debug_objects_mem_init. 1. Var debug_objects_allocated tracks valid kmem_cache_alloc calls, so track it in debug_objects_replace_static_objects. Do similar things in object_cpu_offline. 2. In debug_objects_mem_init, there is no need to call function cpuhp_setup_state_nocalls when debug_objects_enabled = 0 (out of memory). Link: https://lkml.kernel.org/r/20220611130634.99741-1-wuchi.zero@gmail.com Signed-off-by: wuchi Cc: Thomas Gleixner Cc: Christoph Hellwig Cc: Kees Cook Cc: Waiman Long Signed-off-by: Andrew Morton --- lib/debugobjects.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 337d797a71416..6f8e5dd1dcd0c 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -437,6 +437,7 @@ static int object_cpu_offline(unsigned int cpu) struct debug_percpu_free *percpu_pool; struct hlist_node *tmp; struct debug_obj *obj; + unsigned long flags; /* Remote access is safe as the CPU is dead already */ percpu_pool = per_cpu_ptr(&percpu_obj_pool, cpu); @@ -444,6 +445,12 @@ static int object_cpu_offline(unsigned int cpu) hlist_del(&obj->node); kmem_cache_free(obj_cache, obj); } + + raw_spin_lock_irqsave(&pool_lock, flags); + obj_pool_used -= percpu_pool->obj_free; + debug_objects_freed += percpu_pool->obj_free; + raw_spin_unlock_irqrestore(&pool_lock, flags); + percpu_pool->obj_free = 0; return 0; @@ -1318,6 +1325,8 @@ static int __init debug_objects_replace_static_objects(void) hlist_add_head(&obj->node, &objects); } + debug_objects_allocated += i; + /* * debug_objects_mem_init() is now called early that only one CPU is up * and interrupts have been disabled, so it is safe to replace the @@ -1386,6 +1395,7 @@ void __init debug_objects_mem_init(void) debug_objects_enabled = 0; kmem_cache_destroy(obj_cache); pr_warn("out of memory.\n"); + return; } else debug_objects_selftest(); From 3918f174df51161af1948c4101f639125d8ac37d Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 17 Aug 2022 08:03:29 +0000 Subject: [PATCH 52/81] fault-injection: allow stacktrace filter for x86-64 This patchset allow fault injection to run on x86_64 and makes stacktrace filter work as expected. With this, we can test a device driver module with fault injection more easily. This patch (of 4): FAULT_INJECTION_STACKTRACE_FILTER option was apparently disallowed on x86_64 because of problems with the stack unwinder: commit 6d690dcac92a84f98fd774862628ff871b713660 Author: Akinobu Mita Date: Sat May 12 10:36:53 2007 -0700 fault injection: disable stacktrace filter for x86-64 However, there is no problems whatsoever with this today. Let's allow it again. Link: https://lkml.kernel.org/r/20220817080332.1052710-1-weiyongjun1@huawei.com Link: https://lkml.kernel.org/r/20220817080332.1052710-2-weiyongjun1@huawei.com Signed-off-by: Wei Yongjun Cc: Akinobu Mita Cc: Nathan Chancellor Cc: Peter Zijlstra Cc: Kees Cook Cc: Nick Desaulniers Cc: Josh Poimboeuf Cc: Dan Williams Cc: Miguel Ojeda Cc: Isabella Basso Cc: Vlastimil Babka Cc: Rasmus Villemoes Signed-off-by: Andrew Morton --- lib/Kconfig.debug | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index bcbe60d6c80c1..e687336562a24 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1944,7 +1944,6 @@ config FAIL_SUNRPC config FAULT_INJECTION_STACKTRACE_FILTER bool "stacktrace filter for fault-injection capabilities" depends on FAULT_INJECTION_DEBUG_FS && STACKTRACE_SUPPORT - depends on !X86_64 select STACKTRACE depends on FRAME_POINTER || MIPS || PPC || S390 || MICROBLAZE || ARM || ARC || X86 help From df76fa47b1b5103af6ed8c1b8b7944d6753651d4 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 17 Aug 2022 08:03:30 +0000 Subject: [PATCH 53/81] fault-injection: skip stacktrace filtering by default If FAULT_INJECTION_STACKTRACE_FILTER is enabled, the depth is default to 32. This means fail_stacktrace() will iter each entry's stacktrace, even if filter is not configured. This patch changes to quick return from fail_stacktrace() if stacktrace filter is not set. Link: https://lkml.kernel.org/r/20220817080332.1052710-3-weiyongjun1@huawei.com Signed-off-by: Wei Yongjun Cc: Akinobu Mita Cc: Dan Williams Cc: Isabella Basso Cc: Josh Poimboeuf Cc: Kees Cook Cc: Miguel Ojeda Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Peter Zijlstra Cc: Rasmus Villemoes Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/fault-inject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/fault-inject.c b/lib/fault-inject.c index 423784d9c058e..515fc5aaf0323 100644 --- a/lib/fault-inject.c +++ b/lib/fault-inject.c @@ -74,7 +74,7 @@ static bool fail_stacktrace(struct fault_attr *attr) int n, nr_entries; bool found = (attr->require_start == 0 && attr->require_end == ULONG_MAX); - if (depth == 0) + if (depth == 0 || (found && !attr->reject_start && !attr->reject_end)) return found; nr_entries = stack_trace_save(entries, depth, 1); From 9dca55cf1ea186a58bfffc9e30300a433bd5c8b9 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 17 Aug 2022 08:03:31 +0000 Subject: [PATCH 54/81] fault-injection: make some stack filter attrs more readable Attributes of stack filter are show as unsigned decimal, such as 'require-start', 'require-end'. This patch change to show them as unsigned hexadecimal for more readable. Before: $ echo 0xffffffffc0257000 > /sys/kernel/debug/failslab/require-start $ cat /sys/kernel/debug/failslab/require-start 18446744072638263296 After: $ echo 0xffffffffc0257000 > /sys/kernel/debug/failslab/require-start $ cat /sys/kernel/debug/failslab/require-start 0xffffffffc0257000 Link: https://lkml.kernel.org/r/20220817080332.1052710-4-weiyongjun1@huawei.com Signed-off-by: Wei Yongjun Cc: Akinobu Mita Cc: Dan Williams Cc: Isabella Basso Cc: Josh Poimboeuf Cc: Kees Cook Cc: Miguel Ojeda Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Peter Zijlstra Cc: Rasmus Villemoes Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/fault-inject.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/lib/fault-inject.c b/lib/fault-inject.c index 515fc5aaf0323..deca05e7c9b30 100644 --- a/lib/fault-inject.c +++ b/lib/fault-inject.c @@ -179,6 +179,14 @@ static void debugfs_create_ul(const char *name, umode_t mode, #ifdef CONFIG_FAULT_INJECTION_STACKTRACE_FILTER +DEFINE_SIMPLE_ATTRIBUTE(fops_xl, debugfs_ul_get, debugfs_ul_set, "0x%llx\n"); + +static void debugfs_create_xl(const char *name, umode_t mode, + struct dentry *parent, unsigned long *value) +{ + debugfs_create_file(name, mode, parent, value, &fops_xl); +} + static int debugfs_stacktrace_depth_set(void *data, u64 val) { *(unsigned long *)data = @@ -223,10 +231,10 @@ struct dentry *fault_create_debugfs_attr(const char *name, #ifdef CONFIG_FAULT_INJECTION_STACKTRACE_FILTER debugfs_create_stacktrace_depth("stacktrace-depth", mode, dir, &attr->stacktrace_depth); - debugfs_create_ul("require-start", mode, dir, &attr->require_start); - debugfs_create_ul("require-end", mode, dir, &attr->require_end); - debugfs_create_ul("reject-start", mode, dir, &attr->reject_start); - debugfs_create_ul("reject-end", mode, dir, &attr->reject_end); + debugfs_create_xl("require-start", mode, dir, &attr->require_start); + debugfs_create_xl("require-end", mode, dir, &attr->require_end); + debugfs_create_xl("reject-start", mode, dir, &attr->reject_start); + debugfs_create_xl("reject-end", mode, dir, &attr->reject_end); #endif /* CONFIG_FAULT_INJECTION_STACKTRACE_FILTER */ attr->dname = dget(dir); From f5c0386f47bd73c05ebd8d3d5d3099676e9d4866 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 17 Aug 2022 08:03:32 +0000 Subject: [PATCH 55/81] fault-injection: make stacktrace filter works as expected stacktrace filter is checked after others, such as fail-nth, interval and probability. This make it doesn't work well as expected. Fix to running stacktrace filter before other filters. It will speed up fault inject testing for driver modules. Link: https://lkml.kernel.org/r/20220817080332.1052710-5-weiyongjun1@huawei.com Signed-off-by: Wei Yongjun Cc: Akinobu Mita Cc: Dan Williams Cc: Isabella Basso Cc: Josh Poimboeuf Cc: Kees Cook Cc: Miguel Ojeda Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Peter Zijlstra Cc: Rasmus Villemoes Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/fault-inject.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/lib/fault-inject.c b/lib/fault-inject.c index deca05e7c9b30..9dd1dd1d26105 100644 --- a/lib/fault-inject.c +++ b/lib/fault-inject.c @@ -105,10 +105,16 @@ static inline bool fail_stacktrace(struct fault_attr *attr) bool should_fail(struct fault_attr *attr, ssize_t size) { + bool stack_checked = false; + if (in_task()) { unsigned int fail_nth = READ_ONCE(current->fail_nth); if (fail_nth) { + if (!fail_stacktrace(attr)) + return false; + + stack_checked = true; fail_nth--; WRITE_ONCE(current->fail_nth, fail_nth); if (!fail_nth) @@ -128,6 +134,9 @@ bool should_fail(struct fault_attr *attr, ssize_t size) if (atomic_read(&attr->times) == 0) return false; + if (!stack_checked && !fail_stacktrace(attr)) + return false; + if (atomic_read(&attr->space) > size) { atomic_sub(size, &attr->space); return false; @@ -142,9 +151,6 @@ bool should_fail(struct fault_attr *attr, ssize_t size) if (attr->probability <= prandom_u32() % 100) return false; - if (!fail_stacktrace(attr)) - return false; - fail: fail_dump(attr); From 6da70287dc67726c305c90dbccc9b5ef64d6fd51 Mon Sep 17 00:00:00 2001 From: Dmitrii Bundin Date: Mon, 15 Aug 2022 04:33:17 +0300 Subject: [PATCH 56/81] kbuild: add debug level and macro defs options Add config options to control debug info level and producing of macro definitions for GCC/Clang. Option DEBUG_INFO_LEVEL is responsible for controlling debug info level. Before GCC 11 and Clang 12 -gsplit-dwarf implicitly uses -g2. To provide a way to override the setting with, e.g. -g1, DEBUG_INFO_LEVEL is set independently from DEBUG_INFO_SPLIT. Option DEBUG_MACRO_DEFINITIONS is responsible for controlling inclusion of macro definitions. Since Clang uses -fdebug-macro to control if macro definitions are produced which is different from GCC, provides a compiler-specific way of handling macro inclusion. The option is handled after DEBUG_INFO_LEVEL since -g3 -g2 implies -g2, but -g2 -g3 implies -g3 and GCC uses -g3 to produce macro definitions. Link: https://lkml.kernel.org/r/20220815013317.26121-1-dmitrii.bundin.a@gmail.com Signed-off-by: Dmitrii Bundin Cc: Dan Williams Cc: Eric Dumazet Cc: Fangrui Song Cc: Isabella Basso Cc: Josh Poimboeuf Cc: Kees Cook Cc: Masahiro Yamada Cc: Michal Marek Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Peter Zijlstra Cc: Rasmus Villemoes Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/Kconfig.debug | 20 ++++++++++++++++++++ scripts/Makefile.debug | 12 ++++++++++-- 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index e687336562a24..a6155053612e7 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -304,6 +304,26 @@ config DEBUG_INFO_REDUCED DEBUG_INFO build and compile times are reduced too. Only works with newer gcc versions. +config DEBUG_INFO_LEVEL + int "Debug info level" + range 1 3 + default "2" + help + Sets the level of how much debug information to generate (-glevel). + Level 1 produces minimal debug information without including information + about local variables. Level 3 includes extra information like macro + definitions. Setting up level 3 will require significantly more disk + space and increase built time. + +config DEBUG_MACRO_DEFINITIONS + bool "Add macro definitions to debug info" + default n + help + Generates macro definitions to provide a way to expand macros right + in the debugging session. This information can be used with macro expand, + info macro in gdb. This option is equivalent to setting -g3 in GCC and + -fdebug-macro in Clang. + config DEBUG_INFO_COMPRESSED bool "Compressed debugging information" depends on $(cc-option,-gz=zlib) diff --git a/scripts/Makefile.debug b/scripts/Makefile.debug index 9f39b0130551f..29cd04234e75e 100644 --- a/scripts/Makefile.debug +++ b/scripts/Makefile.debug @@ -2,8 +2,6 @@ DEBUG_CFLAGS := ifdef CONFIG_DEBUG_INFO_SPLIT DEBUG_CFLAGS += -gsplit-dwarf -else -DEBUG_CFLAGS += -g endif ifndef CONFIG_AS_IS_LLVM @@ -16,6 +14,16 @@ dwarf-version-$(CONFIG_DEBUG_INFO_DWARF5) := 5 DEBUG_CFLAGS += -gdwarf-$(dwarf-version-y) endif +DEBUG_CFLAGS += -g$(CONFIG_DEBUG_INFO_LEVEL) +ifdef CONFIG_DEBUG_MACRO_DEFINITIONS +ifdef CONFIG_CC_IS_GCC +DEBUG_CFLAGS += -g3 +endif +ifdef CONFIG_CC_IS_CLANG +DEBUG_CFLAGS += -fdebug-macro +endif +endif + ifdef CONFIG_DEBUG_INFO_REDUCED DEBUG_CFLAGS += -fno-var-tracking ifdef CONFIG_CC_IS_GCC From 941cd32443e0ed463936a5a67e49ea58a9a3e41e Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Sat, 3 Sep 2022 00:59:36 +0100 Subject: [PATCH 57/81] ocfs2: replace zero-length arrays with DECLARE_FLEX_ARRAY() helper Zero-length arrays are deprecated and we are moving towards adopting C99 flexible-array members, instead. So, replace zero-length array declarations in a couple of structures and unions with the new DECLARE_FLEX_ARRAY() helper macro. This helper allows for a flexible-array member in a union and as only member in a structure. Also, this addresses multiple warnings reported when building with Clang-15 and -Wzero-length-array. Lastly, this will also help memcpy (in a coming hardening update) execute proper bounds-checking on variable length object i_symlink at fs/ocfs2/namei.c:1973: fs/ocfs2/namei.c: 1973 memcpy((char *) fe->id2.i_symlink, symname, l); Link: https://github.com/KSPP/linux/issues/21 Link: https://github.com/KSPP/linux/issues/193 Link: https://github.com/KSPP/linux/issues/197 Link: https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html Link: https://lkml.kernel.org/r/YxKY6O2hmdwNh8r8@work Signed-off-by: Gustavo A. R. Silva Reviewed-by: Kees Cook Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/ocfs2_fs.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 638d875eccc7d..7aebdbf5cc0a5 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -527,7 +527,7 @@ struct ocfs2_extent_block * value -1 (0xFFFF) is OCFS2_INVALID_SLOT. This marks a slot empty. */ struct ocfs2_slot_map { -/*00*/ __le16 sm_slots[0]; +/*00*/ DECLARE_FLEX_ARRAY(__le16, sm_slots); /* * Actual on-disk size is one block. OCFS2_MAX_SLOTS is 255, * 255 * sizeof(__le16) == 512B, within the 512B block minimum blocksize. @@ -548,7 +548,7 @@ struct ocfs2_extended_slot { * i_size. */ struct ocfs2_slot_map_extended { -/*00*/ struct ocfs2_extended_slot se_slots[0]; +/*00*/ DECLARE_FLEX_ARRAY(struct ocfs2_extended_slot, se_slots); /* * Actual size is i_size of the slot_map system file. It should * match s_max_slots * sizeof(struct ocfs2_extended_slot) @@ -727,7 +727,7 @@ struct ocfs2_dinode { struct ocfs2_extent_list i_list; struct ocfs2_truncate_log i_dealloc; struct ocfs2_inline_data i_data; - __u8 i_symlink[0]; + DECLARE_FLEX_ARRAY(__u8, i_symlink); } id2; /* Actual on-disk size is one block */ }; @@ -892,7 +892,7 @@ struct ocfs2_group_desc /*30*/ struct ocfs2_block_check bg_check; /* Error checking */ __le64 bg_reserved2; /*40*/ union { - __u8 bg_bitmap[0]; + DECLARE_FLEX_ARRAY(__u8, bg_bitmap); struct { /* * Block groups may be discontiguous when From 0950831145a4f58905cf86d2cd3f6a2870022d38 Mon Sep 17 00:00:00 2001 From: Oleksandr Natalenko Date: Sat, 3 Sep 2022 08:43:30 +0200 Subject: [PATCH 58/81] core_pattern: add CPU specifier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Statistically, in a large deployment regular segfaults may indicate a CPU issue. Currently, it is not possible to find out what CPU the segfault happened on. There are at least two attempts to improve segfault logging with this regard, but they do not help in case the logs rotate. Hence, lets make sure it is possible to permanently record a CPU the task ran on using a new core_pattern specifier. Link: https://lkml.kernel.org/r/20220903064330.20772-1-oleksandr@redhat.com Signed-off-by: Oleksandr Natalenko Suggested-by: Renaud Métrich Reviewed-by: Oleg Nesterov Cc: Alexander Viro Cc: "Eric W . Biederman" Cc: Grzegorz Halat Cc: "Guilherme G. Piccoli" Cc: "Huang, Ying" Cc: Jason A. Donenfeld Cc: Joel Savitz Cc: Jonathan Corbet Cc: Kees Cook Cc: Laurent Dufour Cc: Luis Chamberlain Cc: Rob Herring Cc: Stephen Kitt Cc: Will Deacon Cc: Xiaoming Ni Signed-off-by: Andrew Morton --- Documentation/admin-guide/sysctl/kernel.rst | 1 + fs/coredump.c | 5 +++++ include/linux/coredump.h | 1 + 3 files changed, 7 insertions(+) diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index bbaa851946956..0dde37d7fce25 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -174,6 +174,7 @@ core_pattern %f executable filename %E executable path %c maximum size of core file by resource limit RLIMIT_CORE + %C CPU the task ran on % both are dropped ======== ========================================== diff --git a/fs/coredump.c b/fs/coredump.c index 9f4aae2021093..a5827c5349619 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -325,6 +325,10 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm, err = cn_printf(cn, "%lu", rlimit(RLIMIT_CORE)); break; + /* CPU the task ran on */ + case 'C': + err = cn_printf(cn, "%d", cprm->cpu); + break; default: break; } @@ -535,6 +539,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) */ .mm_flags = mm->flags, .vma_meta = NULL, + .cpu = raw_smp_processor_id(), }; audit_core_dumps(siginfo->si_signo); diff --git a/include/linux/coredump.h b/include/linux/coredump.h index 08a1d3e7e46d0..191dcf5af6cb9 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -22,6 +22,7 @@ struct coredump_params { struct file *file; unsigned long limit; unsigned long mm_flags; + int cpu; loff_t written; loff_t pos; loff_t to_skip; From 30da00eb2a3c68f59cbd07dcaef8dd43f4063b0c Mon Sep 17 00:00:00 2001 From: Jiangshan Yi Date: Mon, 5 Sep 2022 14:16:56 +0800 Subject: [PATCH 59/81] fs/ocfs2/suballoc.h: fix spelling typo in comment Fix spelling typo in comment. Link: https://lkml.kernel.org/r/20220905061656.1829179-1-13667453960@163.com Signed-off-by: Jiangshan Yi Reported-by: k2ci Acked-by: Joseph Qi Signed-off-by: Andrew Morton --- fs/ocfs2/suballoc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index 5805a03d100ba..9c74eace3adc1 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -106,7 +106,7 @@ int ocfs2_claim_clusters(handle_t *handle, u32 *cluster_start, u32 *num_clusters); /* - * Use this variant of ocfs2_claim_clusters to specify a maxiumum + * Use this variant of ocfs2_claim_clusters to specify a maximum * number of clusters smaller than the allocation reserved. */ int __ocfs2_claim_clusters(handle_t *handle, From 116993970b50b20b0e8e8b31e6c80f0a163ee945 Mon Sep 17 00:00:00 2001 From: Jiangshan Yi Date: Mon, 5 Sep 2022 10:10:34 +0800 Subject: [PATCH 60/81] init.h: fix spelling typo in comment Fix spelling typo in comment. Link: https://lkml.kernel.org/r/20220905021034.947701-1-13667453960@163.com Signed-off-by: Jiangshan Yi Reported-by: k2ci Signed-off-by: Andrew Morton --- include/linux/init.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/init.h b/include/linux/init.h index baf0b29a7010a..3254766ebbf23 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -134,7 +134,7 @@ static inline initcall_t initcall_from_entry(initcall_entry_t *entry) extern initcall_entry_t __con_initcall_start[], __con_initcall_end[]; -/* Used for contructor calls. */ +/* Used for constructor calls. */ typedef void (*ctor_fn_t)(void); struct file_system_type; From ca9b4719a3d53b7b302dc9549f7e8393c1e66d32 Mon Sep 17 00:00:00 2001 From: Jingyu Wang Date: Fri, 9 Sep 2022 02:54:52 +0800 Subject: [PATCH 61/81] ipc: mqueue: remove unnecessary conditionals iput() already handles null and non-null parameters, so there is no need to use if(). Link: https://lkml.kernel.org/r/20220908185452.76590-1-jingyuwang_vip@163.com Signed-off-by: Jingyu Wang Acked-by: Roman Gushchin Signed-off-by: Andrew Morton --- ipc/mqueue.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ipc/mqueue.c b/ipc/mqueue.c index f98de32aeea17..9834104a5a31d 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -986,8 +986,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name) out_unlock: inode_unlock(d_inode(mnt->mnt_root)); - if (inode) - iput(inode); + iput(inode); mnt_drop_write(mnt); out_name: putname(name); From d123f3fdb8f416e23586fa1f1a29cbeed4e6124f Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Fri, 9 Sep 2022 17:07:55 -0300 Subject: [PATCH 62/81] firmware: google: test spinlock on panic path to avoid lockups Currently the gsmi driver registers a panic notifier as well as reboot and die notifiers. The callbacks registered are called in atomic and very limited context - for instance, panic disables preemption and local IRQs, also all secondary CPUs (not executing the panic path) are shutdown. With that said, taking a spinlock in this scenario is a dangerous invitation for lockup scenarios. So, fix that by checking if the spinlock is free to acquire in the panic notifier callback - if not, bail-out and avoid a potential hang. Link: https://lkml.kernel.org/r/20220909200755.189679-1-gpiccoli@igalia.com Fixes: 74c5b31c6618 ("driver: Google EFI SMI") Signed-off-by: Guilherme G. Piccoli Reviewed-by: Evan Green Cc: Ard Biesheuvel Cc: David Gow Cc: Greg Kroah-Hartman Cc: Julius Werner Cc: Petr Mladek Signed-off-by: Andrew Morton --- drivers/firmware/google/gsmi.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/firmware/google/gsmi.c b/drivers/firmware/google/gsmi.c index adaa492c3d2df..4e2575dfeb908 100644 --- a/drivers/firmware/google/gsmi.c +++ b/drivers/firmware/google/gsmi.c @@ -681,6 +681,15 @@ static struct notifier_block gsmi_die_notifier = { static int gsmi_panic_callback(struct notifier_block *nb, unsigned long reason, void *arg) { + + /* + * Panic callbacks are executed with all other CPUs stopped, + * so we must not attempt to spin waiting for gsmi_dev.lock + * to be released. + */ + if (spin_is_locked(&gsmi_dev.lock)) + return NOTIFY_DONE; + gsmi_shutdown_reason(GSMI_SHUTDOWN_PANIC); return NOTIFY_DONE; } From 46aac981a582910851605e34d0ea37377f4fc6d1 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 9 Sep 2022 13:57:41 -0700 Subject: [PATCH 63/81] fs> uninline inode_maybe_inc_iversion() It has many callsites and is large. text data bss dec hex filename 91796 15984 512 108292 1a704 mm/shmem.o-before 91180 15984 512 107676 1a49c mm/shmem.o-after Acked-by: Jeff Layton Cc: Chuck Lever Cc: Alexander Viro Cc: Hugh Dickins Signed-off-by: Andrew Morton --- fs/libfs.c | 46 ++++++++++++++++++++++++++++++++++++++++ include/linux/iversion.h | 46 +--------------------------------------- 2 files changed, 47 insertions(+), 45 deletions(-) diff --git a/fs/libfs.c b/fs/libfs.c index 31b0ddf01c31d..682d56345a1cf 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include /* sync_mapping_buffers */ #include @@ -1520,3 +1521,48 @@ void generic_set_encrypted_ci_d_ops(struct dentry *dentry) #endif } EXPORT_SYMBOL(generic_set_encrypted_ci_d_ops); + +/** + * inode_maybe_inc_iversion - increments i_version + * @inode: inode with the i_version that should be updated + * @force: increment the counter even if it's not necessary? + * + * Every time the inode is modified, the i_version field must be seen to have + * changed by any observer. + * + * If "force" is set or the QUERIED flag is set, then ensure that we increment + * the value, and clear the queried flag. + * + * In the common case where neither is set, then we can return "false" without + * updating i_version. + * + * If this function returns false, and no other metadata has changed, then we + * can avoid logging the metadata. + */ +bool inode_maybe_inc_iversion(struct inode *inode, bool force) +{ + u64 cur, new; + + /* + * The i_version field is not strictly ordered with any other inode + * information, but the legacy inode_inc_iversion code used a spinlock + * to serialize increments. + * + * Here, we add full memory barriers to ensure that any de-facto + * ordering with other info is preserved. + * + * This barrier pairs with the barrier in inode_query_iversion() + */ + smp_mb(); + cur = inode_peek_iversion_raw(inode); + do { + /* If flag is clear then we needn't do anything */ + if (!force && !(cur & I_VERSION_QUERIED)) + return false; + + /* Since lowest bit is flag, add 2 to avoid it */ + new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT; + } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); + return true; +} +EXPORT_SYMBOL(inode_maybe_inc_iversion); diff --git a/include/linux/iversion.h b/include/linux/iversion.h index eb5a158101693..e27bd4f55d840 100644 --- a/include/linux/iversion.h +++ b/include/linux/iversion.h @@ -172,51 +172,7 @@ inode_set_iversion_queried(struct inode *inode, u64 val) I_VERSION_QUERIED); } -/** - * inode_maybe_inc_iversion - increments i_version - * @inode: inode with the i_version that should be updated - * @force: increment the counter even if it's not necessary? - * - * Every time the inode is modified, the i_version field must be seen to have - * changed by any observer. - * - * If "force" is set or the QUERIED flag is set, then ensure that we increment - * the value, and clear the queried flag. - * - * In the common case where neither is set, then we can return "false" without - * updating i_version. - * - * If this function returns false, and no other metadata has changed, then we - * can avoid logging the metadata. - */ -static inline bool -inode_maybe_inc_iversion(struct inode *inode, bool force) -{ - u64 cur, new; - - /* - * The i_version field is not strictly ordered with any other inode - * information, but the legacy inode_inc_iversion code used a spinlock - * to serialize increments. - * - * Here, we add full memory barriers to ensure that any de-facto - * ordering with other info is preserved. - * - * This barrier pairs with the barrier in inode_query_iversion() - */ - smp_mb(); - cur = inode_peek_iversion_raw(inode); - do { - /* If flag is clear then we needn't do anything */ - if (!force && !(cur & I_VERSION_QUERIED)) - return false; - - /* Since lowest bit is flag, add 2 to avoid it */ - new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT; - } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); - return true; -} - +bool inode_maybe_inc_iversion(struct inode *inode, bool force); /** * inode_inc_iversion - forcibly increment i_version From 3c07bfce92a500f9cb9eab40e3f5d99728f9991e Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Fri, 9 Sep 2022 14:25:29 +0200 Subject: [PATCH 64/81] proc: make config PROC_CHILDREN depend on PROC_FS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 2e13ba54a268 ("fs, proc: introduce CONFIG_PROC_CHILDREN") introduces the config PROC_CHILDREN to configure kernels to provide the /proc//task//children file. When one deselects PROC_FS for kernel builds without /proc/, the config PROC_CHILDREN has no effect anymore, but is still visible in menuconfig. Add the dependency on PROC_FS to make the PROC_CHILDREN option disappear for kernel builds without /proc/. Link: https://lkml.kernel.org/r/20220909122529.1941-1-lukas.bulwahn@gmail.com Fixes: 2e13ba54a268 ("fs, proc: introduce CONFIG_PROC_CHILDREN") Signed-off-by: Lukas Bulwahn Cc: Iago López Galeiras Cc: Alexey Dobriyan Signed-off-by: Andrew Morton --- fs/proc/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index c930001056f95..32b1116ae137c 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -92,6 +92,7 @@ config PROC_PAGE_MONITOR config PROC_CHILDREN bool "Include /proc//task//children file" + depends on PROC_FS default n help Provides a fast way to retrieve first level children pids of a task. See From 60584ad8e4d2507dc903c7ad15c2a42f70e3d599 Mon Sep 17 00:00:00 2001 From: wuchi Date: Fri, 9 Sep 2022 18:10:25 +0800 Subject: [PATCH 65/81] relay: use kvcalloc to alloc page array in relay_alloc_page_array kvcalloc() is safer because it will check the integer overflows, and using it will simple the logic of allocation size. Link: https://lkml.kernel.org/r/20220909101025.82955-1-wuchi.zero@gmail.com Signed-off-by: wuchi Cc: Christoph Hellwig Cc: Jens Axboe Signed-off-by: Andrew Morton --- kernel/relay.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/kernel/relay.c b/kernel/relay.c index 6a611e779e958..d7edc934c56d5 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -60,10 +60,7 @@ static const struct vm_operations_struct relay_file_mmap_ops = { */ static struct page **relay_alloc_page_array(unsigned int n_pages) { - const size_t pa_size = n_pages * sizeof(struct page *); - if (pa_size > PAGE_SIZE) - return vzalloc(pa_size); - return kzalloc(pa_size, GFP_KERNEL); + return kvcalloc(n_pages, sizeof(struct page *), GFP_KERNEL); } /* From 1e44a566100dea889e5a03bcf2f238050dc221b2 Mon Sep 17 00:00:00 2001 From: wangjianli Date: Thu, 8 Sep 2022 21:00:36 +0800 Subject: [PATCH 66/81] fs/ocfs2: fix repeated words in comments Delete the redundant word 'to'. Link: https://lkml.kernel.org/r/20220908130036.31149-1-wangjianli@cdjrlc.com Signed-off-by: wangjianli Acked-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/refcounttree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index b2ebaec86ddca..365025ab2726c 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -2614,7 +2614,7 @@ static inline unsigned int ocfs2_cow_align_length(struct super_block *sb, } /* - * Calculate out the start and number of virtual clusters we need to to CoW. + * Calculate out the start and number of virtual clusters we need to CoW. * * cpos is vitual start cluster position we want to do CoW in a * file and write_len is the cluster length. From e4a722848b90898f7100414074292d555c9a25ff Mon Sep 17 00:00:00 2001 From: Xu Panda Date: Mon, 12 Sep 2022 07:15:57 +0000 Subject: [PATCH 67/81] fork: remove duplicate included header files linux/sched/mm.h is included more than once. Link: https://lkml.kernel.org/r/20220912071556.16811-1-xu.panda@zte.com.cn Signed-off-by: Xu Panda Reported-by: Zeal Robot Cc: Andy Lutomirski Cc: Christian Brauner (Microsoft) Cc: "Eric W . Biederman" Cc: Fenghua Yu Cc: Liam Howlett Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- kernel/fork.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/fork.c b/kernel/fork.c index 90c85b17bf698..3b8784e3ce97d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -97,7 +97,6 @@ #include #include #include -#include #include #include From 23ae6a822cb44235cca2551c22263708554a6439 Mon Sep 17 00:00:00 2001 From: Jiebin Sun Date: Wed, 14 Sep 2022 03:25:37 +0800 Subject: [PATCH 68/81] percpu: Add percpu_counter_add_local and percpu_counter_sub_local Patch series "/msg: mitigate the lock contention in ipc/msg", v6. Here are two patches to mitigate the lock contention in ipc/msg. The 1st patch is to add the new interface percpu_counter_add_local and percpu_counter_sub_local. The batch size in percpu_counter_add_batch should be very large in heavy writing and rare reading case. Add the "_local" version, and mostly it will do local adding, reduce the global updating and mitigate lock contention in writing. The 2nd patch is to use percpu_counter instead of atomic update in ipc/msg. The msg_bytes and msg_hdrs atomic counters are frequently updated when IPC msg queue is in heavy use, causing heavy cache bounce and overhead. Change them to percpu_counter greatly improve the performance. Since there is one percpu struct per namespace, additional memory cost is minimal. Reading of the count done in msgctl call, which is infrequent. So the need to sum up the counts in each CPU is infrequent. This patch (of 2): The batch size in percpu_counter_add_batch should be very large in heavy writing and rare reading case. Add the "_local" version, and mostly it will do local adding, reduce the global updating and mitigate lock contention in writing. Link: https://lkml.kernel.org/r/20220913192538.3023708-1-jiebin.sun@intel.com Link: https://lkml.kernel.org/r/20220913192538.3023708-2-jiebin.sun@intel.com Signed-off-by: Jiebin Sun Reviewed-by: Tim Chen Cc: Alexander Mikhalitsyn Cc: Alexey Gladkov Cc: Christoph Lameter Cc: Dennis Zhou Cc: "Eric W . Biederman" Cc: Manfred Spraul Cc: Shakeel Butt Cc: Tejun Heo Cc: Vasily Averin Cc: Davidlohr Bueso Signed-off-by: Andrew Morton --- include/linux/percpu_counter.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h index 01861eebed79d..8ed5fba6d156f 100644 --- a/include/linux/percpu_counter.h +++ b/include/linux/percpu_counter.h @@ -15,6 +15,9 @@ #include #include +/* percpu_counter batch for local add or sub */ +#define PERCPU_COUNTER_LOCAL_BATCH INT_MAX + #ifdef CONFIG_SMP struct percpu_counter { @@ -56,6 +59,22 @@ static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount) percpu_counter_add_batch(fbc, amount, percpu_counter_batch); } +/* + * With percpu_counter_add_local() and percpu_counter_sub_local(), counts + * are accumulated in local per cpu counter and not in fbc->count until + * local count overflows PERCPU_COUNTER_LOCAL_BATCH. This makes counter + * write efficient. + * But percpu_counter_sum(), instead of percpu_counter_read(), needs to be + * used to add up the counts from each CPU to account for all the local + * counts. So percpu_counter_add_local() and percpu_counter_sub_local() + * should be used when a counter is updated frequently and read rarely. + */ +static inline void +percpu_counter_add_local(struct percpu_counter *fbc, s64 amount) +{ + percpu_counter_add_batch(fbc, amount, PERCPU_COUNTER_LOCAL_BATCH); +} + static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc) { s64 ret = __percpu_counter_sum(fbc); @@ -138,6 +157,13 @@ percpu_counter_add(struct percpu_counter *fbc, s64 amount) preempt_enable(); } +/* non-SMP percpu_counter_add_local is the same with percpu_counter_add */ +static inline void +percpu_counter_add_local(struct percpu_counter *fbc, s64 amount) +{ + percpu_counter_add(fbc, amount); +} + static inline void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch) { @@ -193,4 +219,10 @@ static inline void percpu_counter_sub(struct percpu_counter *fbc, s64 amount) percpu_counter_add(fbc, -amount); } +static inline void +percpu_counter_sub_local(struct percpu_counter *fbc, s64 amount) +{ + percpu_counter_add_local(fbc, -amount); +} + #endif /* _LINUX_PERCPU_COUNTER_H */ From 509d162dbafe2d03ed751e033c4df665e340151c Mon Sep 17 00:00:00 2001 From: Jiebin Sun Date: Wed, 14 Sep 2022 03:25:38 +0800 Subject: [PATCH 69/81] ipc/msg: mitigate the lock contention with percpu counter The msg_bytes and msg_hdrs atomic counters are frequently updated when IPC msg queue is in heavy use, causing heavy cache bounce and overhead. Change them to percpu_counter greatly improve the performance. Since there is one percpu struct per namespace, additional memory cost is minimal. Reading of the count done in msgctl call, which is infrequent. So the need to sum up the counts in each CPU is infrequent. Apply the patch and test the pts/stress-ng-1.4.0 -- system v message passing (160 threads). Score gain: 3.99x CPU: ICX 8380 x 2 sockets Core number: 40 x 2 physical cores Benchmark: pts/stress-ng-1.4.0 -- system v message passing (160 threads) Link: https://lkml.kernel.org/r/20220913192538.3023708-3-jiebin.sun@intel.com Signed-off-by: Jiebin Sun Reviewed-by: Tim Chen Cc: Alexander Mikhalitsyn Cc: Alexey Gladkov Cc: Christoph Lameter Cc: Davidlohr Bueso Cc: Dennis Zhou Cc: "Eric W . Biederman" Cc: Manfred Spraul Cc: Shakeel Butt Cc: Tejun Heo Cc: Vasily Averin Signed-off-by: Andrew Morton --- include/linux/ipc_namespace.h | 5 ++-- ipc/msg.c | 44 ++++++++++++++++++++++++----------- ipc/namespace.c | 5 +++- ipc/util.h | 4 ++-- 4 files changed, 39 insertions(+), 19 deletions(-) diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index e3e8c8662b490..e8240cf2611ad 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -11,6 +11,7 @@ #include #include #include +#include struct user_namespace; @@ -36,8 +37,8 @@ struct ipc_namespace { unsigned int msg_ctlmax; unsigned int msg_ctlmnb; unsigned int msg_ctlmni; - atomic_t msg_bytes; - atomic_t msg_hdrs; + struct percpu_counter percpu_msg_bytes; + struct percpu_counter percpu_msg_hdrs; size_t shm_ctlmax; size_t shm_ctlall; diff --git a/ipc/msg.c b/ipc/msg.c index a0d05775af2c5..f2bb4c193ecf3 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -285,10 +286,10 @@ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) rcu_read_unlock(); list_for_each_entry_safe(msg, t, &msq->q_messages, m_list) { - atomic_dec(&ns->msg_hdrs); + percpu_counter_sub_local(&ns->percpu_msg_hdrs, 1); free_msg(msg); } - atomic_sub(msq->q_cbytes, &ns->msg_bytes); + percpu_counter_sub_local(&ns->percpu_msg_bytes, msq->q_cbytes); ipc_update_pid(&msq->q_lspid, NULL); ipc_update_pid(&msq->q_lrpid, NULL); ipc_rcu_putref(&msq->q_perm, msg_rcu_free); @@ -495,17 +496,18 @@ static int msgctl_info(struct ipc_namespace *ns, int msqid, msginfo->msgssz = MSGSSZ; msginfo->msgseg = MSGSEG; down_read(&msg_ids(ns).rwsem); - if (cmd == MSG_INFO) { + if (cmd == MSG_INFO) msginfo->msgpool = msg_ids(ns).in_use; - msginfo->msgmap = atomic_read(&ns->msg_hdrs); - msginfo->msgtql = atomic_read(&ns->msg_bytes); + max_idx = ipc_get_maxidx(&msg_ids(ns)); + up_read(&msg_ids(ns).rwsem); + if (cmd == MSG_INFO) { + msginfo->msgmap = percpu_counter_sum(&ns->percpu_msg_hdrs); + msginfo->msgtql = percpu_counter_sum(&ns->percpu_msg_bytes); } else { msginfo->msgmap = MSGMAP; msginfo->msgpool = MSGPOOL; msginfo->msgtql = MSGTQL; } - max_idx = ipc_get_maxidx(&msg_ids(ns)); - up_read(&msg_ids(ns).rwsem); return (max_idx < 0) ? 0 : max_idx; } @@ -935,8 +937,8 @@ static long do_msgsnd(int msqid, long mtype, void __user *mtext, list_add_tail(&msg->m_list, &msq->q_messages); msq->q_cbytes += msgsz; msq->q_qnum++; - atomic_add(msgsz, &ns->msg_bytes); - atomic_inc(&ns->msg_hdrs); + percpu_counter_add_local(&ns->percpu_msg_bytes, msgsz); + percpu_counter_add_local(&ns->percpu_msg_hdrs, 1); } err = 0; @@ -1159,8 +1161,8 @@ static long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, in msq->q_rtime = ktime_get_real_seconds(); ipc_update_pid(&msq->q_lrpid, task_tgid(current)); msq->q_cbytes -= msg->m_ts; - atomic_sub(msg->m_ts, &ns->msg_bytes); - atomic_dec(&ns->msg_hdrs); + percpu_counter_sub_local(&ns->percpu_msg_bytes, msg->m_ts); + percpu_counter_sub_local(&ns->percpu_msg_hdrs, 1); ss_wakeup(msq, &wake_q, false); goto out_unlock0; @@ -1297,20 +1299,34 @@ COMPAT_SYSCALL_DEFINE5(msgrcv, int, msqid, compat_uptr_t, msgp, } #endif -void msg_init_ns(struct ipc_namespace *ns) +int msg_init_ns(struct ipc_namespace *ns) { + int ret; + ns->msg_ctlmax = MSGMAX; ns->msg_ctlmnb = MSGMNB; ns->msg_ctlmni = MSGMNI; - atomic_set(&ns->msg_bytes, 0); - atomic_set(&ns->msg_hdrs, 0); + ret = percpu_counter_init(&ns->percpu_msg_bytes, 0, GFP_KERNEL); + if (ret) + goto fail_msg_bytes; + ret = percpu_counter_init(&ns->percpu_msg_hdrs, 0, GFP_KERNEL); + if (ret) + goto fail_msg_hdrs; ipc_init_ids(&ns->ids[IPC_MSG_IDS]); + return 0; + + fail_msg_hdrs: + percpu_counter_destroy(&ns->percpu_msg_bytes); + fail_msg_bytes: + return ret; } #ifdef CONFIG_IPC_NS void msg_exit_ns(struct ipc_namespace *ns) { + percpu_counter_destroy(&ns->percpu_msg_bytes); + percpu_counter_destroy(&ns->percpu_msg_hdrs); free_ipcs(ns, &msg_ids(ns), freeque); idr_destroy(&ns->ids[IPC_MSG_IDS].ipcs_idr); rhashtable_destroy(&ns->ids[IPC_MSG_IDS].key_ht); diff --git a/ipc/namespace.c b/ipc/namespace.c index e1fcaedba4fae..8316ea5857333 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -66,8 +66,11 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, if (!setup_ipc_sysctls(ns)) goto fail_mq; + err = msg_init_ns(ns); + if (err) + goto fail_put; + sem_init_ns(ns); - msg_init_ns(ns); shm_init_ns(ns); return ns; diff --git a/ipc/util.h b/ipc/util.h index 2dd7ce0416d8e..1b0086c6346f2 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -64,7 +64,7 @@ static inline void mq_put_mnt(struct ipc_namespace *ns) { } #ifdef CONFIG_SYSVIPC void sem_init_ns(struct ipc_namespace *ns); -void msg_init_ns(struct ipc_namespace *ns); +int msg_init_ns(struct ipc_namespace *ns); void shm_init_ns(struct ipc_namespace *ns); void sem_exit_ns(struct ipc_namespace *ns); @@ -72,7 +72,7 @@ void msg_exit_ns(struct ipc_namespace *ns); void shm_exit_ns(struct ipc_namespace *ns); #else static inline void sem_init_ns(struct ipc_namespace *ns) { } -static inline void msg_init_ns(struct ipc_namespace *ns) { } +static inline int msg_init_ns(struct ipc_namespace *ns) { return 0;} static inline void shm_init_ns(struct ipc_namespace *ns) { } static inline void sem_exit_ns(struct ipc_namespace *ns) { } From 8adf62812d7ecfc71d96388dbffb9d84792401fc Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 13 Sep 2022 14:45:19 -0700 Subject: [PATCH 70/81] ipc-msg-mitigate-the-lock-contention-with-percpu-counter-checkpatch-fixes WARNING: labels should not be indented #158: FILE: ipc/msg.c:1319: + fail_msg_hdrs: WARNING: labels should not be indented #160: FILE: ipc/msg.c:1321: + fail_msg_bytes: ERROR: space required after that ';' (ctx:VxV) #203: FILE: ipc/util.h:75: +static inline int msg_init_ns(struct ipc_namespace *ns) { return 0;} ^ total: 1 errors, 2 warnings, 144 lines checked NOTE: For some of the reported defects, checkpatch may be able to mechanically convert to the typical style using --fix or --fix-inplace. ./patches/ipc-msg-mitigate-the-lock-contention-with-percpu-counter.patch has style problems, please review. NOTE: If any of the errors are false positives, please report them to the maintainer, see CHECKPATCH in MAINTAINERS. Please run checkpatch prior to sending patches Cc: Alexander Mikhalitsyn Cc: Alexey Gladkov Cc: Christoph Lameter Cc: Davidlohr Bueso Cc: Dennis Zhou Cc: "Eric W . Biederman" Cc: Jiebin Sun Cc: Manfred Spraul Cc: Shakeel Butt Cc: Tejun Heo Cc: Tim Chen Cc: Vasily Averin Signed-off-by: Andrew Morton --- ipc/msg.c | 8 ++++---- ipc/util.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ipc/msg.c b/ipc/msg.c index f2bb4c193ecf3..9fb58de557fba 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -1316,10 +1316,10 @@ int msg_init_ns(struct ipc_namespace *ns) ipc_init_ids(&ns->ids[IPC_MSG_IDS]); return 0; - fail_msg_hdrs: - percpu_counter_destroy(&ns->percpu_msg_bytes); - fail_msg_bytes: - return ret; +fail_msg_hdrs: + percpu_counter_destroy(&ns->percpu_msg_bytes); +fail_msg_bytes: + return ret; } #ifdef CONFIG_IPC_NS diff --git a/ipc/util.h b/ipc/util.h index 1b0086c6346f2..b2906e3665394 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -72,7 +72,7 @@ void msg_exit_ns(struct ipc_namespace *ns); void shm_exit_ns(struct ipc_namespace *ns); #else static inline void sem_init_ns(struct ipc_namespace *ns) { } -static inline int msg_init_ns(struct ipc_namespace *ns) { return 0;} +static inline int msg_init_ns(struct ipc_namespace *ns) { return 0; } static inline void shm_init_ns(struct ipc_namespace *ns) { } static inline void sem_exit_ns(struct ipc_namespace *ns) { } From 5dcabc54a52554f8014323ba3a8da14cb434d3d0 Mon Sep 17 00:00:00 2001 From: Jiebin Sun Date: Tue, 20 Sep 2022 23:08:09 +0800 Subject: [PATCH 71/81] ipc/msg: avoid negative value by overflow in msginfo The 32-bit value in msginfo struct could be negative if we get it from signed 64-bit. Clamping it to INT_MAX helps to avoid the negative value by overflow. Link: https://lkml.kernel.org/r/20220920150809.4014944-1-jiebin.sun@intel.com Signed-off-by: Jiebin Sun Reviewed-by: Manfred Spraul Reviewed-by: Tim Chen Signed-off-by: Andrew Morton --- ipc/msg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ipc/msg.c b/ipc/msg.c index 9fb58de557fba..6aedde33a22d6 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -501,8 +501,8 @@ static int msgctl_info(struct ipc_namespace *ns, int msqid, max_idx = ipc_get_maxidx(&msg_ids(ns)); up_read(&msg_ids(ns).rwsem); if (cmd == MSG_INFO) { - msginfo->msgmap = percpu_counter_sum(&ns->percpu_msg_hdrs); - msginfo->msgtql = percpu_counter_sum(&ns->percpu_msg_bytes); + msginfo->msgmap = min(percpu_counter_sum(&ns->percpu_msg_hdrs), INT_MAX); + msginfo->msgtql = min(percpu_counter_sum(&ns->percpu_msg_bytes), INT_MAX); } else { msginfo->msgmap = MSGMAP; msginfo->msgpool = MSGPOOL; From 2772cda98ced82c033b587755f15a1b34bf2cf96 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 21 Sep 2022 16:54:33 -0700 Subject: [PATCH 72/81] a In file included from ./include/linux/kernel.h:26, from ./arch/x86/include/asm/percpu.h:27, from ./arch/x86/include/asm/preempt.h:6, from ./include/linux/preempt.h:78, from ./include/linux/spinlock.h:55, from ./include/linux/ipc.h:5, from ./include/uapi/linux/msg.h:5, from ./include/linux/msg.h:6, from ipc/msg.c:27: ipc/msg.c: In function 'msgctl_info': ./include/linux/minmax.h:20:35: error: comparison of distinct pointer types lacks a cast [-Werror] 20 | (!!(sizeof((typeof(x) *)1 == (typeof(y) *)1))) | ^~ ./include/linux/minmax.h:26:18: note: in expansion of macro '__typecheck' 26 | (__typecheck(x, y) && __no_side_effects(x, y)) | ^~~~~~~~~~~ ./include/linux/minmax.h:36:31: note: in expansion of macro '__safe_cmp' 36 | __builtin_choose_expr(__safe_cmp(x, y), \ | ^~~~~~~~~~ ./include/linux/minmax.h:45:25: note: in expansion of macro '__careful_cmp' 45 | #define min(x, y) __careful_cmp(x, y, <) | ^~~~~~~~~~~~~ ipc/msg.c:504:35: note: in expansion of macro 'min' 504 | msginfo->msgmap = min(percpu_counter_sum(&ns->percpu_msg_hdrs), INT_MAX); | ^~~ ./include/linux/minmax.h:20:35: error: comparison of distinct pointer types lacks a cast [-Werror] 20 | (!!(sizeof((typeof(x) *)1 == (typeof(y) *)1))) | ^~ ./include/linux/minmax.h:26:18: note: in expansion of macro '__typecheck' 26 | (__typecheck(x, y) && __no_side_effects(x, y)) | ^~~~~~~~~~~ ./include/linux/minmax.h:36:31: note: in expansion of macro '__safe_cmp' 36 | __builtin_choose_expr(__safe_cmp(x, y), \ | ^~~~~~~~~~ ./include/linux/minmax.h:45:25: note: in expansion of macro '__careful_cmp' 45 | #define min(x, y) __careful_cmp(x, y, <) | ^~~~~~~~~~~~~ ipc/msg.c:505:35: note: in expansion of macro 'min' 505 | msginfo->msgtql = min(percpu_counter_sum(&ns->percpu_msg_bytes), INT_MAX); | ^~~ Signed-off-by: Andrew Morton --- ipc/msg.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ipc/msg.c b/ipc/msg.c index 6aedde33a22d6..e4e0990e08f75 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -501,8 +501,12 @@ static int msgctl_info(struct ipc_namespace *ns, int msqid, max_idx = ipc_get_maxidx(&msg_ids(ns)); up_read(&msg_ids(ns).rwsem); if (cmd == MSG_INFO) { - msginfo->msgmap = min(percpu_counter_sum(&ns->percpu_msg_hdrs), INT_MAX); - msginfo->msgtql = min(percpu_counter_sum(&ns->percpu_msg_bytes), INT_MAX); + msginfo->msgmap = min_t(int, + percpu_counter_sum(&ns->percpu_msg_hdrs), + INT_MAX); + msginfo->msgtql = min_t(int, + percpu_counter_sum(&ns->percpu_msg_bytes), + INT_MAX); } else { msginfo->msgmap = MSGMAP; msginfo->msgpool = MSGPOOL; From 756b8640c689f4a08e7fa332422fdbabfa105bd7 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 20 Sep 2022 02:24:16 +0900 Subject: [PATCH 73/81] libfs: add DEFINE_SIMPLE_ATTRIBUTE_SIGNED for signed value Patch series "fix error when writing negative value to simple attribute files". The simple attribute files do not accept a negative value since the commit 488dac0c9237 ("libfs: fix error cast of negative value in simple_attr_write()"), but some attribute files want to accept a negative value. This patch (of 3): The simple attribute files do not accept a negative value since the commit 488dac0c9237 ("libfs: fix error cast of negative value in simple_attr_write()"), so we have to use a 64-bit value to write a negative value. This adds DEFINE_SIMPLE_ATTRIBUTE_SIGNED for a signed value. Link: https://lkml.kernel.org/r/20220919172418.45257-1-akinobu.mita@gmail.com Link: https://lkml.kernel.org/r/20220919172418.45257-2-akinobu.mita@gmail.com Fixes: 488dac0c9237 ("libfs: fix error cast of negative value in simple_attr_write()") Signed-off-by: Akinobu Mita Reported-by: Zhao Gongyi Reviewed-by: David Hildenbrand Reviewed-by: Greg Kroah-Hartman Cc: Alexander Viro Cc: Jonathan Corbet Cc: Oscar Salvador Cc: Rafael J. Wysocki Cc: Shuah Khan Cc: Wei Yongjun Cc: Yicong Yang Signed-off-by: Andrew Morton --- fs/libfs.c | 22 +++++++++++++++++++--- include/linux/fs.h | 12 ++++++++++-- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/fs/libfs.c b/fs/libfs.c index 682d56345a1cf..aada4e7c87132 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -995,8 +995,8 @@ ssize_t simple_attr_read(struct file *file, char __user *buf, EXPORT_SYMBOL_GPL(simple_attr_read); /* interpret the buffer as a number to call the set function with */ -ssize_t simple_attr_write(struct file *file, const char __user *buf, - size_t len, loff_t *ppos) +static ssize_t simple_attr_write_xsigned(struct file *file, const char __user *buf, + size_t len, loff_t *ppos, bool is_signed) { struct simple_attr *attr; unsigned long long val; @@ -1017,7 +1017,10 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf, goto out; attr->set_buf[size] = '\0'; - ret = kstrtoull(attr->set_buf, 0, &val); + if (is_signed) + ret = kstrtoll(attr->set_buf, 0, &val); + else + ret = kstrtoull(attr->set_buf, 0, &val); if (ret) goto out; ret = attr->set(attr->data, val); @@ -1027,8 +1030,21 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf, mutex_unlock(&attr->mutex); return ret; } + +ssize_t simple_attr_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + return simple_attr_write_xsigned(file, buf, len, ppos, false); +} EXPORT_SYMBOL_GPL(simple_attr_write); +ssize_t simple_attr_write_signed(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + return simple_attr_write_xsigned(file, buf, len, ppos, true); +} +EXPORT_SYMBOL_GPL(simple_attr_write_signed); + /** * generic_fh_to_dentry - generic helper for the fh_to_dentry export operation * @sb: filesystem to do the file handle conversion on diff --git a/include/linux/fs.h b/include/linux/fs.h index 9eced4cc286ee..c791388189223 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3471,7 +3471,7 @@ void simple_transaction_set(struct file *file, size_t n); * All attributes contain a text representation of a numeric value * that are accessed with the get() and set() functions. */ -#define DEFINE_SIMPLE_ATTRIBUTE(__fops, __get, __set, __fmt) \ +#define DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, __is_signed) \ static int __fops ## _open(struct inode *inode, struct file *file) \ { \ __simple_attr_check_format(__fmt, 0ull); \ @@ -3482,10 +3482,16 @@ static const struct file_operations __fops = { \ .open = __fops ## _open, \ .release = simple_attr_release, \ .read = simple_attr_read, \ - .write = simple_attr_write, \ + .write = (__is_signed) ? simple_attr_write_signed : simple_attr_write, \ .llseek = generic_file_llseek, \ } +#define DEFINE_SIMPLE_ATTRIBUTE(__fops, __get, __set, __fmt) \ + DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, false) + +#define DEFINE_SIMPLE_ATTRIBUTE_SIGNED(__fops, __get, __set, __fmt) \ + DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, true) + static inline __printf(1, 2) void __simple_attr_check_format(const char *fmt, ...) { @@ -3500,6 +3506,8 @@ ssize_t simple_attr_read(struct file *file, char __user *buf, size_t len, loff_t *ppos); ssize_t simple_attr_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos); +ssize_t simple_attr_write_signed(struct file *file, const char __user *buf, + size_t len, loff_t *ppos); struct ctl_table; int __init list_bdev_fs_names(char *buf, size_t size); From 7f8cb45b40118d90c54f681628da28b14119913d Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 20 Sep 2022 02:24:17 +0900 Subject: [PATCH 74/81] lib/notifier-error-inject: fix error when writing -errno to debugfs file The simple attribute files do not accept a negative value since the commit 488dac0c9237 ("libfs: fix error cast of negative value in simple_attr_write()"). This restores the previous behaviour by using newly introduced DEFINE_SIMPLE_ATTRIBUTE_SIGNED instead of DEFINE_SIMPLE_ATTRIBUTE. Link: https://lkml.kernel.org/r/20220919172418.45257-3-akinobu.mita@gmail.com Fixes: 488dac0c9237 ("libfs: fix error cast of negative value in simple_attr_write()") Signed-off-by: Akinobu Mita Reported-by: Zhao Gongyi Reviewed-by: David Hildenbrand Reviewed-by: Greg Kroah-Hartman Cc: Alexander Viro Cc: Jonathan Corbet Cc: Oscar Salvador Cc: Rafael J. Wysocki Cc: Shuah Khan Cc: Wei Yongjun Cc: Yicong Yang Signed-off-by: Andrew Morton --- lib/notifier-error-inject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/notifier-error-inject.c b/lib/notifier-error-inject.c index 21016b32d3131..2b24ea6c94979 100644 --- a/lib/notifier-error-inject.c +++ b/lib/notifier-error-inject.c @@ -15,7 +15,7 @@ static int debugfs_errno_get(void *data, u64 *val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(fops_errno, debugfs_errno_get, debugfs_errno_set, +DEFINE_SIMPLE_ATTRIBUTE_SIGNED(fops_errno, debugfs_errno_get, debugfs_errno_set, "%lld\n"); static struct dentry *debugfs_create_errno(const char *name, umode_t mode, From e28b020aa9d5f374bd0bac71b99c0b452a199905 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 20 Sep 2022 02:24:18 +0900 Subject: [PATCH 75/81] debugfs: fix error when writing negative value to atomic_t debugfs file The simple attribute files do not accept a negative value since the commit 488dac0c9237 ("libfs: fix error cast of negative value in simple_attr_write()"), so we have to use a 64-bit value to write a negative value for a debugfs file created by debugfs_create_atomic_t(). This restores the previous behaviour by introducing DEFINE_DEBUGFS_ATTRIBUTE_SIGNED for a signed value. Link: https://lkml.kernel.org/r/20220919172418.45257-4-akinobu.mita@gmail.com Fixes: 488dac0c9237 ("libfs: fix error cast of negative value in simple_attr_write()") Signed-off-by: Akinobu Mita Reported-by: Zhao Gongyi Reviewed-by: David Hildenbrand Reviewed-by: Greg Kroah-Hartman Cc: Alexander Viro Cc: Jonathan Corbet Cc: Oscar Salvador Cc: Rafael J. Wysocki Cc: Shuah Khan Cc: Wei Yongjun Cc: Yicong Yang Signed-off-by: Andrew Morton --- .../fault-injection/fault-injection.rst | 10 +++---- fs/debugfs/file.c | 28 +++++++++++++++---- include/linux/debugfs.h | 19 +++++++++++-- 3 files changed, 43 insertions(+), 14 deletions(-) diff --git a/Documentation/fault-injection/fault-injection.rst b/Documentation/fault-injection/fault-injection.rst index 17779a2772e51..5f6454b9dbd4d 100644 --- a/Documentation/fault-injection/fault-injection.rst +++ b/Documentation/fault-injection/fault-injection.rst @@ -83,9 +83,7 @@ configuration of fault-injection capabilities. - /sys/kernel/debug/fail*/times: specifies how many times failures may happen at most. A value of -1 - means "no limit". Note, though, that this file only accepts unsigned - values. So, if you want to specify -1, you better use 'printf' instead - of 'echo', e.g.: $ printf %#x -1 > times + means "no limit". - /sys/kernel/debug/fail*/space: @@ -284,7 +282,7 @@ Application Examples echo Y > /sys/kernel/debug/$FAILTYPE/task-filter echo 10 > /sys/kernel/debug/$FAILTYPE/probability echo 100 > /sys/kernel/debug/$FAILTYPE/interval - printf %#x -1 > /sys/kernel/debug/$FAILTYPE/times + echo -1 > /sys/kernel/debug/$FAILTYPE/times echo 0 > /sys/kernel/debug/$FAILTYPE/space echo 2 > /sys/kernel/debug/$FAILTYPE/verbose echo Y > /sys/kernel/debug/$FAILTYPE/ignore-gfp-wait @@ -338,7 +336,7 @@ Application Examples echo N > /sys/kernel/debug/$FAILTYPE/task-filter echo 10 > /sys/kernel/debug/$FAILTYPE/probability echo 100 > /sys/kernel/debug/$FAILTYPE/interval - printf %#x -1 > /sys/kernel/debug/$FAILTYPE/times + echo -1 > /sys/kernel/debug/$FAILTYPE/times echo 0 > /sys/kernel/debug/$FAILTYPE/space echo 2 > /sys/kernel/debug/$FAILTYPE/verbose echo Y > /sys/kernel/debug/$FAILTYPE/ignore-gfp-wait @@ -369,7 +367,7 @@ Application Examples echo N > /sys/kernel/debug/$FAILTYPE/task-filter echo 100 > /sys/kernel/debug/$FAILTYPE/probability echo 0 > /sys/kernel/debug/$FAILTYPE/interval - printf %#x -1 > /sys/kernel/debug/$FAILTYPE/times + echo -1 > /sys/kernel/debug/$FAILTYPE/times echo 0 > /sys/kernel/debug/$FAILTYPE/space echo 1 > /sys/kernel/debug/$FAILTYPE/verbose diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 950c63fa4d0b2..38930d9b0bb73 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -378,8 +378,8 @@ ssize_t debugfs_attr_read(struct file *file, char __user *buf, } EXPORT_SYMBOL_GPL(debugfs_attr_read); -ssize_t debugfs_attr_write(struct file *file, const char __user *buf, - size_t len, loff_t *ppos) +static ssize_t debugfs_attr_write_xsigned(struct file *file, const char __user *buf, + size_t len, loff_t *ppos, bool is_signed) { struct dentry *dentry = F_DENTRY(file); ssize_t ret; @@ -387,12 +387,28 @@ ssize_t debugfs_attr_write(struct file *file, const char __user *buf, ret = debugfs_file_get(dentry); if (unlikely(ret)) return ret; - ret = simple_attr_write(file, buf, len, ppos); + if (is_signed) + ret = simple_attr_write_signed(file, buf, len, ppos); + else + ret = simple_attr_write(file, buf, len, ppos); debugfs_file_put(dentry); return ret; } + +ssize_t debugfs_attr_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + return debugfs_attr_write_xsigned(file, buf, len, ppos, false); +} EXPORT_SYMBOL_GPL(debugfs_attr_write); +ssize_t debugfs_attr_write_signed(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + return debugfs_attr_write_xsigned(file, buf, len, ppos, true); +} +EXPORT_SYMBOL_GPL(debugfs_attr_write_signed); + static struct dentry *debugfs_create_mode_unsafe(const char *name, umode_t mode, struct dentry *parent, void *value, const struct file_operations *fops, @@ -738,11 +754,11 @@ static int debugfs_atomic_t_get(void *data, u64 *val) *val = atomic_read((atomic_t *)data); return 0; } -DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t, debugfs_atomic_t_get, +DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t, debugfs_atomic_t_get, debugfs_atomic_t_set, "%lld\n"); -DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t_ro, debugfs_atomic_t_get, NULL, +DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t_ro, debugfs_atomic_t_get, NULL, "%lld\n"); -DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, +DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, "%lld\n"); /** diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index c869f1e73d755..6db34e9c50ec9 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -45,7 +45,7 @@ struct debugfs_u32_array { extern struct dentry *arch_debugfs_dir; -#define DEFINE_DEBUGFS_ATTRIBUTE(__fops, __get, __set, __fmt) \ +#define DEFINE_DEBUGFS_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, __is_signed) \ static int __fops ## _open(struct inode *inode, struct file *file) \ { \ __simple_attr_check_format(__fmt, 0ull); \ @@ -56,10 +56,16 @@ static const struct file_operations __fops = { \ .open = __fops ## _open, \ .release = simple_attr_release, \ .read = debugfs_attr_read, \ - .write = debugfs_attr_write, \ + .write = (__is_signed) ? debugfs_attr_write_signed : debugfs_attr_write, \ .llseek = no_llseek, \ } +#define DEFINE_DEBUGFS_ATTRIBUTE(__fops, __get, __set, __fmt) \ + DEFINE_DEBUGFS_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, false) + +#define DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(__fops, __get, __set, __fmt) \ + DEFINE_DEBUGFS_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, true) + typedef struct vfsmount *(*debugfs_automount_t)(struct dentry *, void *); #if defined(CONFIG_DEBUG_FS) @@ -100,6 +106,8 @@ ssize_t debugfs_attr_read(struct file *file, char __user *buf, size_t len, loff_t *ppos); ssize_t debugfs_attr_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos); +ssize_t debugfs_attr_write_signed(struct file *file, const char __user *buf, + size_t len, loff_t *ppos); struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, struct dentry *new_dir, const char *new_name); @@ -248,6 +256,13 @@ static inline ssize_t debugfs_attr_write(struct file *file, return -ENODEV; } +static inline ssize_t debugfs_attr_write_signed(struct file *file, + const char __user *buf, + size_t len, loff_t *ppos) +{ + return -ENODEV; +} + static inline struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, struct dentry *new_dir, char *new_name) { From 2fe614af589075d4bc6899b040a2eaa88cecd5e7 Mon Sep 17 00:00:00 2001 From: Li zeming Date: Mon, 19 Sep 2022 09:44:06 +0800 Subject: [PATCH 76/81] usr/gen_init_cpio.c: remove unnecessary -1 values from int file The file variable is assigned first, it does not need to be initialized. Link: https://lkml.kernel.org/r/20220919014406.3242-1-zeming@nfschina.com Signed-off-by: Li zeming Cc: Li zeming Cc: Masahiro Yamada Cc: Nicolas Schier Signed-off-by: Andrew Morton --- usr/gen_init_cpio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/usr/gen_init_cpio.c b/usr/gen_init_cpio.c index dc838e26a5b9a..ee01e40e8bc65 100644 --- a/usr/gen_init_cpio.c +++ b/usr/gen_init_cpio.c @@ -326,7 +326,7 @@ static int cpio_mkfile(const char *name, const char *location, char s[256]; struct stat buf; unsigned long size; - int file = -1; + int file; int retval; int rc = -1; int namesize; From 3b0e73b2b82e95089aa5ab008f4cf5363f3c7d05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20S=C3=B6derlund?= Date: Wed, 14 Sep 2022 12:02:55 +0200 Subject: [PATCH 77/81] checkpatch: warn for non-standard fixes tag style MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a warning for fixes tags that does not follow community conventions. Link: https://lkml.kernel.org/r/20220914100255.1048460-1-niklas.soderlund@corigine.com Signed-off-by: Niklas Söderlund Reviewed-by: Simon Horman Reviewed-by: Louis Peens Reviewed-by: Philippe Schenker Acked-by: Dwaipayan Ray Reviewed-by: Lukas Bulwahn Acked-by: Lukas Bulwahn Acked-by: Joe Perches Signed-off-by: Andrew Morton --- Documentation/dev-tools/checkpatch.rst | 7 ++++ scripts/checkpatch.pl | 44 ++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) diff --git a/Documentation/dev-tools/checkpatch.rst b/Documentation/dev-tools/checkpatch.rst index b52452bc29636..c3389c6f38381 100644 --- a/Documentation/dev-tools/checkpatch.rst +++ b/Documentation/dev-tools/checkpatch.rst @@ -612,6 +612,13 @@ Commit message See: https://www.kernel.org/doc/html/latest/process/submitting-patches.html#describe-your-changes + **BAD_FIXES_TAG** + The Fixes: tag is malformed or does not follow the community conventions. + This can occur if the tag have been split into multiple lines (e.g., when + pasted in an email program with word wrapping enabled). + + See: https://www.kernel.org/doc/html/latest/process/submitting-patches.html#describe-your-changes + Comparison style ---------------- diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 18effbe1fe908..e8e0542f29f02 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3146,6 +3146,50 @@ sub process { } } +# Check Fixes: styles is correct + if (!$in_header_lines && + $line =~ /^\s*fixes:?\s*(?:commit\s*)?[0-9a-f]{5,}\b/i) { + my $orig_commit = ""; + my $id = "0123456789ab"; + my $title = "commit title"; + my $tag_case = 1; + my $tag_space = 1; + my $id_length = 1; + my $id_case = 1; + my $title_has_quotes = 0; + + if ($line =~ /(\s*fixes:?)\s+([0-9a-f]{5,})\s+($balanced_parens)/i) { + my $tag = $1; + $orig_commit = $2; + $title = $3; + + $tag_case = 0 if $tag eq "Fixes:"; + $tag_space = 0 if ($line =~ /^fixes:? [0-9a-f]{5,} ($balanced_parens)/i); + + $id_length = 0 if ($orig_commit =~ /^[0-9a-f]{12}$/i); + $id_case = 0 if ($orig_commit !~ /[A-F]/); + + # Always strip leading/trailing parens then double quotes if existing + $title = substr($title, 1, -1); + if ($title =~ /^".*"$/) { + $title = substr($title, 1, -1); + $title_has_quotes = 1; + } + } + + my ($cid, $ctitle) = git_commit_info($orig_commit, $id, + $title); + + if ($ctitle ne $title || $tag_case || $tag_space || + $id_length || $id_case || !$title_has_quotes) { + if (WARN("BAD_FIXES_TAG", + "Please use correct Fixes: style 'Fixes: <12 chars of sha1> (\"\")' - ie: 'Fixes: $cid (\"$ctitle\")'\n" . $herecurr) && + $fix) { + $fixed[$fixlinenr] = "Fixes: $cid (\"$ctitle\")"; + } + } + } + # Check email subject for common tools that don't need to be mentioned if ($in_header_lines && $line =~ /^Subject:.*\b(?:checkpatch|sparse|smatch)\b[^:]/i) { From 29ff8d7471c45a75a436682eb82090ef621086ad Mon Sep 17 00:00:00 2001 From: Minghao Chi <chi.minghao@zte.com.cn> Date: Wed, 21 Sep 2022 12:48:02 +0900 Subject: [PATCH 78/81] nilfs2: delete unnecessary checks before brelse() Patch series "nilfs2 minor amendments". This patch (of 2): The brelse() inline function tests whether its argument is NULL and then returns immediately. Thus remove the tests which are not needed around the shown calls. Link: https://lkml.kernel.org/r/20220921034803.2476-1-konishi.ryusuke@gmail.com Link: https://lkml.kernel.org/r/20220819081700.96279-1-chi.minghao@zte.com.cn Link: https://lkml.kernel.org/r/20220921034803.2476-2-konishi.ryusuke@gmail.com Reported-by: Zeal Robot <zealci@zte.com.cn> Signed-off-by: Minghao Chi <chi.minghao@zte.com.cn> Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com> Cc: ye xingchen <ye.xingchen@zte.com.cn> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- fs/nilfs2/btree.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 9f4d9432d38a1..b9d15c3df3cc1 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -1668,8 +1668,7 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *btree, __u64 key) maxkey = nilfs_btree_node_get_key(node, nchildren - 1); nextmaxkey = (nchildren > 1) ? nilfs_btree_node_get_key(node, nchildren - 2) : 0; - if (bh != NULL) - brelse(bh); + brelse(bh); return (maxkey == key) && (nextmaxkey < NILFS_BMAP_LARGE_LOW); } @@ -1717,8 +1716,7 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *btree, ptrs[i] = le64_to_cpu(dptrs[i]); } - if (bh != NULL) - brelse(bh); + brelse(bh); return nitems; } From e87d0692e8004863eecee13ad7352e4dfb140322 Mon Sep 17 00:00:00 2001 From: ye xingchen <ye.xingchen@zte.com.cn> Date: Wed, 21 Sep 2022 12:48:03 +0900 Subject: [PATCH 79/81] nilfs2: Remove the unneeded result variable Return the value nilfs_segctor_sync() directly instead of storing it in another redundant variable. Link: https://lkml.kernel.org/r/20220831033403.302184-1-ye.xingchen@zte.com.cn Link: https://lkml.kernel.org/r/20220921034803.2476-3-konishi.ryusuke@gmail.com Reported-by: Zeal Robot <zealci@zte.com.cn> Signed-off-by: ye xingchen <ye.xingchen@zte.com.cn> Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com> Cc: Minghao Chi <chi.minghao@zte.com.cn> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- fs/nilfs2/segment.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 0afe0832c7547..9abae2c9120ed 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2235,7 +2235,6 @@ int nilfs_construct_segment(struct super_block *sb) struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_sc_info *sci = nilfs->ns_writer; struct nilfs_transaction_info *ti; - int err; if (!sci) return -EROFS; @@ -2243,8 +2242,7 @@ int nilfs_construct_segment(struct super_block *sb) /* A call inside transactions causes a deadlock. */ BUG_ON((ti = current->journal_info) && ti->ti_magic == NILFS_TI_MAGIC); - err = nilfs_segctor_sync(sci); - return err; + return nilfs_segctor_sync(sci); } /** From 8928c99cb2674aeffe3acf53c8e9925f9da87911 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan <adobriyan@gmail.com> Date: Tue, 20 Sep 2022 20:35:23 +0300 Subject: [PATCH 80/81] proc: mark more files as permanent Mark /proc/devices /proc/kpagecount /proc/kpageflags /proc/kpagecgroup /proc/loadavg /proc/meminfo /proc/softirqs /proc/uptime /proc/version as permanent /proc entries, saving alloc/free and some list/spinlock ops per use. These files are never removed by the kernel so it is OK to mark them. Link: https://lkml.kernel.org/r/Yyn527DzDMa+r0Yj@localhost.localdomain Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- fs/proc/devices.c | 6 +++++- fs/proc/internal.h | 5 +++++ fs/proc/loadavg.c | 6 +++++- fs/proc/meminfo.c | 5 ++++- fs/proc/page.c | 3 +++ fs/proc/softirqs.c | 6 +++++- fs/proc/uptime.c | 6 +++++- fs/proc/version.c | 6 +++++- 8 files changed, 37 insertions(+), 6 deletions(-) diff --git a/fs/proc/devices.c b/fs/proc/devices.c index 837971e741097..fe7bfcb7d0494 100644 --- a/fs/proc/devices.c +++ b/fs/proc/devices.c @@ -4,6 +4,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/blkdev.h> +#include "internal.h" static int devinfo_show(struct seq_file *f, void *v) { @@ -54,7 +55,10 @@ static const struct seq_operations devinfo_ops = { static int __init proc_devices_init(void) { - proc_create_seq("devices", 0, NULL, &devinfo_ops); + struct proc_dir_entry *pde; + + pde = proc_create_seq("devices", 0, NULL, &devinfo_ops); + pde_make_permanent(pde); return 0; } fs_initcall(proc_devices_init); diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 06a80f78433d8..af277184b8073 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -79,6 +79,11 @@ static inline bool pde_is_permanent(const struct proc_dir_entry *pde) return pde->flags & PROC_ENTRY_PERMANENT; } +static inline void pde_make_permanent(struct proc_dir_entry *pde) +{ + pde->flags |= PROC_ENTRY_PERMANENT; +} + extern struct kmem_cache *proc_dir_entry_cache; void pde_free(struct proc_dir_entry *pde); diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index f32878d9a39f3..817981e57223e 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c @@ -9,6 +9,7 @@ #include <linux/seq_file.h> #include <linux/seqlock.h> #include <linux/time.h> +#include "internal.h" static int loadavg_proc_show(struct seq_file *m, void *v) { @@ -27,7 +28,10 @@ static int loadavg_proc_show(struct seq_file *m, void *v) static int __init proc_loadavg_init(void) { - proc_create_single("loadavg", 0, NULL, loadavg_proc_show); + struct proc_dir_entry *pde; + + pde = proc_create_single("loadavg", 0, NULL, loadavg_proc_show); + pde_make_permanent(pde); return 0; } fs_initcall(proc_loadavg_init); diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 6e89f0e2fd20f..70e5294052d52 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -162,7 +162,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v) static int __init proc_meminfo_init(void) { - proc_create_single("meminfo", 0, NULL, meminfo_proc_show); + struct proc_dir_entry *pde; + + pde = proc_create_single("meminfo", 0, NULL, meminfo_proc_show); + pde_make_permanent(pde); return 0; } fs_initcall(proc_meminfo_init); diff --git a/fs/proc/page.c b/fs/proc/page.c index a2873a617ae86..f2273b164535b 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -91,6 +91,7 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, } static const struct proc_ops kpagecount_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_lseek = mem_lseek, .proc_read = kpagecount_read, }; @@ -268,6 +269,7 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, } static const struct proc_ops kpageflags_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_lseek = mem_lseek, .proc_read = kpageflags_read, }; @@ -322,6 +324,7 @@ static ssize_t kpagecgroup_read(struct file *file, char __user *buf, } static const struct proc_ops kpagecgroup_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_lseek = mem_lseek, .proc_read = kpagecgroup_read, }; diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c index 12901dcf57e2b..f4616083faef3 100644 --- a/fs/proc/softirqs.c +++ b/fs/proc/softirqs.c @@ -3,6 +3,7 @@ #include <linux/kernel_stat.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include "internal.h" /* * /proc/softirqs ... display the number of softirqs @@ -27,7 +28,10 @@ static int show_softirqs(struct seq_file *p, void *v) static int __init proc_softirqs_init(void) { - proc_create_single("softirqs", 0, NULL, show_softirqs); + struct proc_dir_entry *pde; + + pde = proc_create_single("softirqs", 0, NULL, show_softirqs); + pde_make_permanent(pde); return 0; } fs_initcall(proc_softirqs_init); diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c index deb99bc9b7e6b..b5343d209381a 100644 --- a/fs/proc/uptime.c +++ b/fs/proc/uptime.c @@ -7,6 +7,7 @@ #include <linux/time.h> #include <linux/time_namespace.h> #include <linux/kernel_stat.h> +#include "internal.h" static int uptime_proc_show(struct seq_file *m, void *v) { @@ -39,7 +40,10 @@ static int uptime_proc_show(struct seq_file *m, void *v) static int __init proc_uptime_init(void) { - proc_create_single("uptime", 0, NULL, uptime_proc_show); + struct proc_dir_entry *pde; + + pde = proc_create_single("uptime", 0, NULL, uptime_proc_show); + pde_make_permanent(pde); return 0; } fs_initcall(proc_uptime_init); diff --git a/fs/proc/version.c b/fs/proc/version.c index b449f186577f8..02e3c3cd4a9af 100644 --- a/fs/proc/version.c +++ b/fs/proc/version.c @@ -5,6 +5,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/utsname.h> +#include "internal.h" static int version_proc_show(struct seq_file *m, void *v) { @@ -17,7 +18,10 @@ static int version_proc_show(struct seq_file *m, void *v) static int __init proc_version_init(void) { - proc_create_single("version", 0, NULL, version_proc_show); + struct proc_dir_entry *pde; + + pde = proc_create_single("version", 0, NULL, version_proc_show); + pde_make_permanent(pde); return 0; } fs_initcall(proc_version_init); From 344f44d098da8f9694db36316b7a42d874796df7 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp> Date: Wed, 21 Sep 2022 00:59:27 +0900 Subject: [PATCH 81/81] fs/ntfs3: fix negative shift size in true_sectors_per_clst() syzbot is reporting shift-out-of-bounds in true_sectors_per_clst() [1], for commit a3b774342fa752a5 ("fs/ntfs3: validate BOOT sectors_per_clusters") did not address that (0 - boot->sectors_per_clusters) < 0 because "u8" was chosen for type of boot->sectors_per_clusters because 0x80 needs to be positive in order to support 64K clusters. Use "s8" cast in order to make sure that (0 - (s8) boot->sectors_per_clusters) > 0. Link: https://syzkaller.appspot.com/bug?extid=1631f09646bc214d2e76 [1] Link: https://lkml.kernel.org/r/4b37f037-3b10-b4e4-0644-73441c8fa0af@I-love.SAKURA.ne.jp Fixes: a3b774342fa752a5 ("fs/ntfs3: validate BOOT sectors_per_clusters") Reported-by: syzbot <syzbot+1631f09646bc214d2e76@syzkaller.appspotmail.com> Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp> Tested-by: syzbot <syzbot+1631f09646bc214d2e76@syzkaller.appspotmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- fs/ntfs3/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index 47012c9bf505e..c7ffd21fb255d 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -672,7 +672,7 @@ static u32 true_sectors_per_clst(const struct NTFS_BOOT *boot) if (boot->sectors_per_clusters <= 0x80) return boot->sectors_per_clusters; if (boot->sectors_per_clusters >= 0xf4) /* limit shift to 2MB max */ - return 1U << (0 - boot->sectors_per_clusters); + return 1U << (0 - (s8) boot->sectors_per_clusters); return -EINVAL; }