From a705b11b358dee677aad80630e7608b2d5f56691 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 8 Jul 2023 17:17:27 +0800 Subject: [PATCH 01/10] md/raid5-cache: fix a deadlock in r5l_exit_log() Commit b13015af94cf ("md/raid5-cache: Clear conf->log after finishing work") introduce a new problem: // caller hold reconfig_mutex r5l_exit_log flush_work(&log->disable_writeback_work) r5c_disable_writeback_async wait_event /* * conf->log is not NULL, and mddev_trylock() * will fail, wait_event() can never pass. */ conf->log = NULL Fix this problem by setting 'config->log' to NULL before wake_up() as it used to be, so that wait_event() from r5c_disable_writeback_async() can exist. In the meantime, move forward md_unregister_thread() so that null-ptr-deref this commit fixed can still be fixed. Fixes: b13015af94cf ("md/raid5-cache: Clear conf->log after finishing work") Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20230708091727.1417894-1-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/raid5-cache.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 47ba7d9e81e18..2eac4a50d99bd 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -3168,12 +3168,15 @@ void r5l_exit_log(struct r5conf *conf) { struct r5l_log *log = conf->log; - /* Ensure disable_writeback_work wakes up and exits */ - wake_up(&conf->mddev->sb_wait); - flush_work(&log->disable_writeback_work); md_unregister_thread(&log->reclaim_thread); + /* + * 'reconfig_mutex' is held by caller, set 'confg->log' to NULL to + * ensure disable_writeback_work wakes up and exits. + */ conf->log = NULL; + wake_up(&conf->mddev->sb_wait); + flush_work(&log->disable_writeback_work); mempool_exit(&log->meta_pool); bioset_exit(&log->bs); From 8b0472b50bcf0f19a5119b00a53b63579c8e1e4d Mon Sep 17 00:00:00 2001 From: Zhang Shurong Date: Sat, 22 Jul 2023 15:53:53 +0800 Subject: [PATCH 02/10] md: raid1: fix potential OOB in raid1_remove_disk() If rddev->raid_disk is greater than mddev->raid_disks, there will be an out-of-bounds in raid1_remove_disk(). We have already found similar reports as follows: 1) commit d17f744e883b ("md-raid10: fix KASAN warning") 2) commit 1ebc2cec0b7d ("dm raid: fix KASAN warning in raid5_remove_disk") Fix this bug by checking whether the "number" variable is valid. Signed-off-by: Zhang Shurong Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/tencent_0D24426FAC6A21B69AC0C03CE4143A508F09@qq.com Signed-off-by: Song Liu --- drivers/md/raid1.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 23d211969565e..b920f92780139 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1837,6 +1837,10 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev) struct r1conf *conf = mddev->private; int err = 0; int number = rdev->raid_disk; + + if (unlikely(number >= conf->raid_disks)) + goto abort; + struct raid1_info *p = conf->mirrors + number; if (rdev != p->rdev) From 892da88d1cd93426e9c6d7717876ca705fe2b9fa Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 31 Jul 2023 10:28:00 +0800 Subject: [PATCH 03/10] md/raid10: fix a 'conf->barrier' leakage in raid10_takeover() After commit b39f35ebe86d ("md: don't quiesce in mddev_suspend()"), 'conf->barrier' will be leaked in the case that raid10 takeover raid0: level_store pers->takeover -> raid10_takeover raid10_takeover_raid0 WRITE_ONCE(conf->barrier, 1) mddev_suspend // still raid0 mddev->pers = pers // switch to raid10 mddev_resume // resume without suspend After the above commit, mddev_resume() will not decrease 'conf->barrier' that is set in raid10_takeover_raid0(). Fix this problem by not setting 'conf->barrier' in raid10_takeover_raid0(). By the way, this problem is found while I'm trying to make mddev_suspend/resume() to be independent from raid personalities. raid10 is the only personality to use reference count in the quiesce() callback and this problem is only related to raid10. Fixes: b39f35ebe86d ("md: don't quiesce in mddev_suspend()") Signed-off-by: Yu Kuai Reviewed-by: Paul Menzel Link: https://lore.kernel.org/r/20230731022800.1424902-1-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/raid10.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 16aa9d735880a..7704a4c7f4695 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4417,7 +4417,6 @@ static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs) rdev->new_raid_disk = rdev->raid_disk * 2; rdev->sectors = size; } - WRITE_ONCE(conf->barrier, 1); } return conf; From 7eb8ff02c1df279bf7f7f29b866beb655a9eebe9 Mon Sep 17 00:00:00 2001 From: Li Lingfeng Date: Thu, 3 Aug 2023 15:17:11 +0800 Subject: [PATCH 04/10] md: Hold mddev->reconfig_mutex when trying to get mddev->sync_thread Commit ba9d9f1a707f ("Revert "md: unlock mddev before reap sync_thread in action_store"") removed the scenario of calling md_unregister_thread() without holding mddev->reconfig_mutex, so add a lock holding check before acquiring mddev->sync_thread by passing mdev to md_unregister_thread(). Signed-off-by: Li Lingfeng Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/20230803071711.2546560-1-lilingfeng@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md-cluster.c | 8 ++++---- drivers/md/md.c | 9 +++++---- drivers/md/md.h | 2 +- drivers/md/raid1.c | 4 ++-- drivers/md/raid10.c | 2 +- drivers/md/raid5-cache.c | 2 +- drivers/md/raid5.c | 2 +- 7 files changed, 15 insertions(+), 14 deletions(-) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 3d9fd74233dfd..1e26eb2233495 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -952,8 +952,8 @@ static int join(struct mddev *mddev, int nodes) return 0; err: set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); - md_unregister_thread(&cinfo->recovery_thread); - md_unregister_thread(&cinfo->recv_thread); + md_unregister_thread(mddev, &cinfo->recovery_thread); + md_unregister_thread(mddev, &cinfo->recv_thread); lockres_free(cinfo->message_lockres); lockres_free(cinfo->token_lockres); lockres_free(cinfo->ack_lockres); @@ -1015,8 +1015,8 @@ static int leave(struct mddev *mddev) resync_bitmap(mddev); set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); - md_unregister_thread(&cinfo->recovery_thread); - md_unregister_thread(&cinfo->recv_thread); + md_unregister_thread(mddev, &cinfo->recovery_thread); + md_unregister_thread(mddev, &cinfo->recv_thread); lockres_free(cinfo->message_lockres); lockres_free(cinfo->token_lockres); lockres_free(cinfo->ack_lockres); diff --git a/drivers/md/md.c b/drivers/md/md.c index a3d98273b295c..5c3c19b8d5099 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6258,7 +6258,7 @@ static void mddev_detach(struct mddev *mddev) mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); } - md_unregister_thread(&mddev->thread); + md_unregister_thread(mddev, &mddev->thread); if (mddev->queue) blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ } @@ -7990,9 +7990,10 @@ struct md_thread *md_register_thread(void (*run) (struct md_thread *), } EXPORT_SYMBOL(md_register_thread); -void md_unregister_thread(struct md_thread __rcu **threadp) +void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp) { - struct md_thread *thread = rcu_dereference_protected(*threadp, true); + struct md_thread *thread = rcu_dereference_protected(*threadp, + lockdep_is_held(&mddev->reconfig_mutex)); if (!thread) return; @@ -9484,7 +9485,7 @@ void md_reap_sync_thread(struct mddev *mddev) bool is_reshaped = false; /* resync has finished, collect result */ - md_unregister_thread(&mddev->sync_thread); + md_unregister_thread(mddev, &mddev->sync_thread); atomic_inc(&mddev->sync_seq); if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && diff --git a/drivers/md/md.h b/drivers/md/md.h index 8ae9574809763..9bcb77bca9639 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -761,7 +761,7 @@ extern struct md_thread *md_register_thread( void (*run)(struct md_thread *thread), struct mddev *mddev, const char *name); -extern void md_unregister_thread(struct md_thread __rcu **threadp); +extern void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp); extern void md_wakeup_thread(struct md_thread __rcu *thread); extern void md_check_recovery(struct mddev *mddev); extern void md_reap_sync_thread(struct mddev *mddev); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index b920f92780139..c18b7c096c8df 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -3156,7 +3156,7 @@ static int raid1_run(struct mddev *mddev) * RAID1 needs at least one disk in active */ if (conf->raid_disks - mddev->degraded < 1) { - md_unregister_thread(&conf->thread); + md_unregister_thread(mddev, &conf->thread); ret = -EINVAL; goto abort; } @@ -3183,7 +3183,7 @@ static int raid1_run(struct mddev *mddev) ret = md_integrity_register(mddev); if (ret) { - md_unregister_thread(&mddev->thread); + md_unregister_thread(mddev, &mddev->thread); goto abort; } return 0; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 7704a4c7f4695..0234131208516 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4320,7 +4320,7 @@ static int raid10_run(struct mddev *mddev) return 0; out_free_conf: - md_unregister_thread(&mddev->thread); + md_unregister_thread(mddev, &mddev->thread); raid10_free_conf(conf); mddev->private = NULL; out: diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 2eac4a50d99bd..a29b9650260a0 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -3168,7 +3168,7 @@ void r5l_exit_log(struct r5conf *conf) { struct r5l_log *log = conf->log; - md_unregister_thread(&log->reclaim_thread); + md_unregister_thread(conf->mddev, &log->reclaim_thread); /* * 'reconfig_mutex' is held by caller, set 'confg->log' to NULL to diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 32a87193bad73..4cb9c608ee191 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -8107,7 +8107,7 @@ static int raid5_run(struct mddev *mddev) return 0; abort: - md_unregister_thread(&mddev->thread); + md_unregister_thread(mddev, &mddev->thread); print_raid5_conf(conf); free_conf(conf); mddev->private = NULL; From 5afcf28d07dee91e48d1c809ebd19c3bfc403765 Mon Sep 17 00:00:00 2001 From: WANG Xuerui Date: Mon, 31 Jul 2023 18:49:07 +0800 Subject: [PATCH 05/10] raid6: remove the include from recov.c There is no exported symbol left in recov.c, so the include is now unnecessary, and breaks the raid6test build. Remove it. Signed-off-by: WANG Xuerui Link: https://lore.kernel.org/r/20230731104911.411964-2-kernel@xen0n.name Signed-off-by: Song Liu --- lib/raid6/recov.c | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/raid6/recov.c b/lib/raid6/recov.c index e49d519de6cbe..a7c1b2bbe40d8 100644 --- a/lib/raid6/recov.c +++ b/lib/raid6/recov.c @@ -13,7 +13,6 @@ * the syndrome.) */ -#include #include /* Recover two failed data blocks. */ From 9dd6e1da811ffad95a79b2690110ef1bbaf4dda4 Mon Sep 17 00:00:00 2001 From: WANG Xuerui Date: Mon, 31 Jul 2023 18:49:08 +0800 Subject: [PATCH 06/10] raid6: guard the tables.c include of with __KERNEL__ The export directives for the tables are already emitted with __KERNEL__ guards, but the include is not, causing errors when building the raid6test program. Guard this include too to fix the raid6test build. Signed-off-by: WANG Xuerui Link: https://lore.kernel.org/r/20230731104911.411964-3-kernel@xen0n.name Signed-off-by: Song Liu --- lib/raid6/mktables.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/raid6/mktables.c b/lib/raid6/mktables.c index f02e10fa62381..3be03793237c2 100644 --- a/lib/raid6/mktables.c +++ b/lib/raid6/mktables.c @@ -56,7 +56,9 @@ int main(int argc, char *argv[]) uint8_t v; uint8_t exptbl[256], invtbl[256]; + printf("#ifdef __KERNEL__\n"); printf("#include \n"); + printf("#endif\n"); printf("#include \n"); /* Compute multiplication table */ From 2008d89fb6435a3a900b72b856a18e0cc0d2c057 Mon Sep 17 00:00:00 2001 From: WANG Xuerui Date: Mon, 31 Jul 2023 18:49:09 +0800 Subject: [PATCH 07/10] raid6: test: cosmetic cleanups for the test Makefile Use tabs/spaces consistently: hard tabs for marking recipe lines only, spaces for everything else. Also, the OPTFLAGS declaration actually included the tabs preceding the line comment, making compiler invocation lines unnecessarily long. As the entire block of declarations are meant for ad-hoc customization (otherwise they would probably make use of `?=` instead of `=`), move the "Adjust as desired" comment above the block too to fix the long invocation lines. Signed-off-by: WANG Xuerui Link: https://lore.kernel.org/r/20230731104911.411964-4-kernel@xen0n.name Signed-off-by: Song Liu --- lib/raid6/test/Makefile | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile index 4fb7700a741bd..143cda60faa12 100644 --- a/lib/raid6/test/Makefile +++ b/lib/raid6/test/Makefile @@ -6,14 +6,15 @@ pound := \# -CC = gcc -OPTFLAGS = -O2 # Adjust as desired -CFLAGS = -I.. -I ../../../include -g $(OPTFLAGS) -LD = ld -AWK = awk -f -AR = ar -RANLIB = ranlib -OBJS = int1.o int2.o int4.o int8.o int16.o int32.o recov.o algos.o tables.o +# Adjust as desired +CC = gcc +OPTFLAGS = -O2 +CFLAGS = -I.. -I ../../../include -g $(OPTFLAGS) +LD = ld +AWK = awk -f +AR = ar +RANLIB = ranlib +OBJS = int1.o int2.o int4.o int8.o int16.o int32.o recov.o algos.o tables.o ARCH := $(shell uname -m 2>/dev/null | sed -e /s/i.86/i386/) ifeq ($(ARCH),i386) @@ -37,9 +38,9 @@ endif ifeq ($(IS_X86),yes) OBJS += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o avx512.o recov_avx512.o CFLAGS += -DCONFIG_X86 - CFLAGS += $(shell echo "vpmovm2b %k1, %zmm5" | \ - gcc -c -x assembler - >/dev/null 2>&1 && \ - rm ./-.o && echo -DCONFIG_AS_AVX512=1) + CFLAGS += $(shell echo "vpmovm2b %k1, %zmm5" | \ + gcc -c -x assembler - >/dev/null 2>&1 && \ + rm ./-.o && echo -DCONFIG_AS_AVX512=1) else ifeq ($(HAS_NEON),yes) OBJS += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o CFLAGS += -DCONFIG_KERNEL_MODE_NEON=1 @@ -63,12 +64,12 @@ endif %.uc: ../%.uc cp -f $< $@ -all: raid6.a raid6test +all: raid6.a raid6test raid6.a: $(OBJS) - rm -f $@ - $(AR) cq $@ $^ - $(RANLIB) $@ + rm -f $@ + $(AR) cq $@ $^ + $(RANLIB) $@ raid6test: test.c raid6.a $(CC) $(CFLAGS) -o raid6test $^ From 6601f5e122e5fdcea0fa5eaa54b88b02dbc9ec07 Mon Sep 17 00:00:00 2001 From: WANG Xuerui Date: Mon, 31 Jul 2023 18:49:10 +0800 Subject: [PATCH 08/10] raid6: test: make sure all intermediate and artifact files are .gitignored Currently when the raid6test utility is built, the resulting binary and an int.uc file are not being ignored, which can get inadvertently committed as a result when one works on the raid6 code. Ignore them to make `git status` clean at all times. Signed-off-by: WANG Xuerui Link: https://lore.kernel.org/r/20230731104911.411964-5-kernel@xen0n.name Signed-off-by: Song Liu --- lib/raid6/test/.gitignore | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 lib/raid6/test/.gitignore diff --git a/lib/raid6/test/.gitignore b/lib/raid6/test/.gitignore new file mode 100644 index 0000000000000..1b68a77f348f6 --- /dev/null +++ b/lib/raid6/test/.gitignore @@ -0,0 +1,3 @@ +/int.uc +/neon.uc +/raid6test From 7b3c70c43c13ad8e59f5561b154663d6bdb77021 Mon Sep 17 00:00:00 2001 From: WANG Xuerui Date: Mon, 31 Jul 2023 18:49:11 +0800 Subject: [PATCH 09/10] raid6: test: only check for Altivec if building on powerpc hosts Altivec is only available for powerpc hosts, so only check for its availability when the host is powerpc, to avoid error messages being shown on architectures other than x86, arm or powerpc. Signed-off-by: WANG Xuerui Link: https://lore.kernel.org/r/20230731104911.411964-6-kernel@xen0n.name Signed-off-by: Song Liu --- lib/raid6/test/Makefile | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile index 143cda60faa12..1f693ea3b980c 100644 --- a/lib/raid6/test/Makefile +++ b/lib/raid6/test/Makefile @@ -35,6 +35,12 @@ ifeq ($(ARCH),aarch64) HAS_NEON = yes endif +ifeq ($(findstring ppc,$(ARCH)),ppc) + CFLAGS += -I../../../arch/powerpc/include + HAS_ALTIVEC := $(shell printf '$(pound)include \nvector int a;\n' |\ + gcc -c -x c - >/dev/null && rm ./-.o && echo yes) +endif + ifeq ($(IS_X86),yes) OBJS += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o avx512.o recov_avx512.o CFLAGS += -DCONFIG_X86 @@ -44,15 +50,10 @@ ifeq ($(IS_X86),yes) else ifeq ($(HAS_NEON),yes) OBJS += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o CFLAGS += -DCONFIG_KERNEL_MODE_NEON=1 -else - HAS_ALTIVEC := $(shell printf '$(pound)include \nvector int a;\n' |\ - gcc -c -x c - >/dev/null && rm ./-.o && echo yes) - ifeq ($(HAS_ALTIVEC),yes) - CFLAGS += -I../../../arch/powerpc/include - CFLAGS += -DCONFIG_ALTIVEC - OBJS += altivec1.o altivec2.o altivec4.o altivec8.o \ - vpermxor1.o vpermxor2.o vpermxor4.o vpermxor8.o - endif +else ifeq ($(HAS_ALTIVEC),yes) + CFLAGS += -DCONFIG_ALTIVEC + OBJS += altivec1.o altivec2.o altivec4.o altivec8.o \ + vpermxor1.o vpermxor2.o vpermxor4.o vpermxor8.o endif .c.o: From 0d0bd28c500173bfca78aa840f8f36d261ef1765 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 8 Aug 2023 18:49:12 +0800 Subject: [PATCH 10/10] md/raid5-cache: fix null-ptr-deref for r5l_flush_stripe_to_raid() r5l_flush_stripe_to_raid() will check if the list 'flushing_ios' is empty, and then submit 'flush_bio', however, r5l_log_flush_endio() is clearing the list first and then clear the bio, which will cause null-ptr-deref: T1: submit flush io raid5d handle_active_stripes r5l_flush_stripe_to_raid // list is empty // add 'io_end_ios' to the list bio_init submit_bio // io1 T2: io1 is done r5l_log_flush_endio list_splice_tail_init // clear the list T3: submit new flush io ... r5l_flush_stripe_to_raid // list is empty // add 'io_end_ios' to the list bio_init bio_uninit // clear bio->bi_blkg submit_bio // null-ptr-deref Fix this problem by clearing bio before clearing the list in r5l_log_flush_endio(). Fixes: 0dd00cba99c3 ("raid5-cache: fully initialize flush_bio when needed") Reported-and-tested-by: Corey Hickey Closes: https://lore.kernel.org/all/cddd7213-3dfd-4ab7-a3ac-edd54d74a626@fatooh.org/ Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Signed-off-by: Song Liu --- drivers/md/raid5-cache.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index a29b9650260a0..518b7cfa78b9d 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -1260,14 +1260,13 @@ static void r5l_log_flush_endio(struct bio *bio) if (bio->bi_status) md_error(log->rdev->mddev, log->rdev); + bio_uninit(bio); spin_lock_irqsave(&log->io_list_lock, flags); list_for_each_entry(io, &log->flushing_ios, log_sibling) r5l_io_run_stripes(io); list_splice_tail_init(&log->flushing_ios, &log->finished_ios); spin_unlock_irqrestore(&log->io_list_lock, flags); - - bio_uninit(bio); } /*