From d798bc6f3c174c61837862cb9778d73cccd92a8e Mon Sep 17 00:00:00 2001 From: James Clark Date: Fri, 22 Nov 2024 16:46:35 +0000 Subject: [PATCH 001/330] arm64: Fix usage of new shifted MDCR_EL2 values Since the linked fixes commit, these masks are already shifted so remove the shifts. One issue that this fixes is SPE and TRBE not being available anymore: arm_spe_pmu arm,spe-v1: profiling buffer owned by higher exception level Fixes: 641630313e9c ("arm64: sysreg: Migrate MDCR_EL2 definition to table") Signed-off-by: James Clark Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20241122164636.2944180-1-james.clark@linaro.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/el2_setup.h | 4 ++-- arch/arm64/kernel/hyp-stub.S | 4 ++-- arch/arm64/kvm/hyp/nvhe/pkvm.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index 4cd41464be3f2..f134907d3c087 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -79,7 +79,7 @@ 1 << PMSCR_EL2_PA_SHIFT) msr_s SYS_PMSCR_EL2, x0 // addresses and physical counter .Lskip_spe_el2_\@: - mov x0, #(MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT) + mov x0, #MDCR_EL2_E2PB_MASK orr x2, x2, x0 // If we don't have VHE, then // use EL1&0 translation. @@ -92,7 +92,7 @@ and x0, x0, TRBIDR_EL1_P cbnz x0, .Lskip_trace_\@ // If TRBE is available at EL2 - mov x0, #(MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT) + mov x0, #MDCR_EL2_E2TB_MASK orr x2, x2, x0 // allow the EL1&0 translation // to own it. diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index 65f76064c86b2..ae990da1eae5a 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -114,8 +114,8 @@ SYM_CODE_START_LOCAL(__finalise_el2) // Use EL2 translations for SPE & TRBE and disable access from EL1 mrs x0, mdcr_el2 - bic x0, x0, #(MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT) - bic x0, x0, #(MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT) + bic x0, x0, #MDCR_EL2_E2PB_MASK + bic x0, x0, #MDCR_EL2_E2TB_MASK msr mdcr_el2, x0 // Transfer the MM state from EL1 to EL2 diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 01616c39a8107..071993c16de81 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -126,7 +126,7 @@ static void pvm_init_traps_aa64dfr0(struct kvm_vcpu *vcpu) /* Trap SPE */ if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMSVer), feature_ids)) { mdcr_set |= MDCR_EL2_TPMS; - mdcr_clear |= MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT; + mdcr_clear |= MDCR_EL2_E2PB_MASK; } /* Trap Trace Filter */ @@ -143,7 +143,7 @@ static void pvm_init_traps_aa64dfr0(struct kvm_vcpu *vcpu) /* Trap External Trace */ if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_ExtTrcBuff), feature_ids)) - mdcr_clear |= MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT; + mdcr_clear |= MDCR_EL2_E2TB_MASK; vcpu->arch.mdcr_el2 |= mdcr_set; vcpu->arch.mdcr_el2 &= ~mdcr_clear; From 6fc3a49f23856fdf155ab35f2244295f7870bf83 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 25 Nov 2024 09:47:56 +0000 Subject: [PATCH 002/330] KVM: arm64: Fix S1/S2 combination when FWB==1 and S2 has Device memory type The G.a revision of the ARM ARM had it pretty clear that HCR_EL2.FWB had no influence on "The way that stage 1 memory types and attributes are combined with stage 2 Device type and attributes." (D5.5.5). However, this wording was lost in further revisions of the architecture. Restore the intended behaviour, which is to take the strongest memory type of S1 and S2 in this case, as if FWB was 0. The specification is being fixed accordingly. Fixes: be04cebf3e788 ("KVM: arm64: nv: Add emulation of AT S12E{0,1}{R,W}") Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20241125094756.609590-1-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/at.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index 8c5d7990e5b31..3d7eb395e33dd 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -739,8 +739,15 @@ static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par, final_attr = s1_parattr; break; default: - /* MemAttr[2]=0, Device from S2 */ - final_attr = s2_memattr & GENMASK(1,0) << 2; + /* + * MemAttr[2]=0, Device from S2. + * + * FWB does not influence the way that stage 1 + * memory types and attributes are combined + * with stage 2 Device type and attributes. + */ + final_attr = min(s2_memattr_to_attr(s2_memattr), + s1_parattr); } } else { /* Combination of R_HMNDG, R_TNHFM and R_GQFSF */ From 23c44f6c83257923b179461694edcf62749bedd5 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Nov 2024 19:13:31 -0800 Subject: [PATCH 003/330] perf tools: Fix build-id event recording The build-id events written at the end of the record session are broken due to unexpected data. The write_buildid() writes the fixed length event first and then variable length filename. But a recent change made it write more data in the padding area accidentally. So readers of the event see zero-filled data for the next entry and treat it incorrectly. This resulted in wrong kernel symbols because the kernel DSO loaded a random vmlinux image in the path as it didn't have a valid build-id. Fixes: ae39ba16554e ("perf inject: Fix build ID injection") Reported-by: Linus Torvalds Tested-by: Arnaldo Carvalho de Melo Reviewed-by: Ian Rogers Link: https://lore.kernel.org/r/Z0aRFFW9xMh3mqKB@google.com Signed-off-by: Namhyung Kim --- tools/perf/util/build-id.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c index 8982f68e7230c..e763e8d99a436 100644 --- a/tools/perf/util/build-id.c +++ b/tools/perf/util/build-id.c @@ -277,7 +277,7 @@ static int write_buildid(const char *name, size_t name_len, struct build_id *bid struct perf_record_header_build_id b; size_t len; - len = sizeof(b) + name_len + 1; + len = name_len + 1; len = PERF_ALIGN(len, sizeof(u64)); memset(&b, 0, sizeof(b)); @@ -286,7 +286,7 @@ static int write_buildid(const char *name, size_t name_len, struct build_id *bid misc |= PERF_RECORD_MISC_BUILD_ID_SIZE; b.pid = pid; b.header.misc = misc; - b.header.size = len; + b.header.size = sizeof(b) + len; err = do_write(fd, &b, sizeof(b)); if (err < 0) From f54cd8f43f55c0274c5b51509aff39675639c2e1 Mon Sep 17 00:00:00 2001 From: James Clark Date: Fri, 29 Nov 2024 15:19:48 +0000 Subject: [PATCH 004/330] perf test: Don't signal all processes on system when interrupting tests This signal handler loops over all tests on ctrl-C, but it's active while the test list is being constructed. process.pid is 0, then -1, then finally set to the child pid on fork. If the Ctrl-C is received during this point a kill(-1, SIGINT) can be sent which affects all processes. Make sure the child has forked first before forwarding the signal. This can be reproduced with ctrl-C immediately after launching perf test which terminates the ssh connection. Fixes: 553d5efeb341 ("perf test: Add a signal handler to kill forked child processes") Signed-off-by: James Clark Reviewed-by: Ian Rogers Link: https://lore.kernel.org/r/20241129151948.3199732-1-james.clark@linaro.org Signed-off-by: Namhyung Kim --- tools/perf/tests/builtin-test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c index 8dcf74d3c0a39..4751dd3c6f672 100644 --- a/tools/perf/tests/builtin-test.c +++ b/tools/perf/tests/builtin-test.c @@ -508,7 +508,7 @@ static int __cmd_test(struct test_suite **suites, int argc, const char *argv[], for (size_t x = 0; x < num_tests; x++) { struct child_test *child_test = child_tests[x]; - if (!child_test) + if (!child_test || child_test->process.pid <= 0) continue; pr_debug3("Killing %d pid %d\n", From 514b2262ade48a0503ac6aa03c3bfb8c5be69b21 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sat, 16 Nov 2024 00:05:18 +0100 Subject: [PATCH 005/330] firmware: arm_scmi: Fix i.MX build dependency The newly added SCMI vendor driver references functions in the protocol driver but needs a Kconfig dependency to ensure it can link, essentially the Kconfig dependency needs to be reversed to match the link time dependency: | arm-linux-gnueabi-ld: sound/soc/fsl/fsl_mqs.o: in function `fsl_mqs_sm_write': | fsl_mqs.c:(.text+0x1aa): undefined reference to `scmi_imx_misc_ctrl_set' | arm-linux-gnueabi-ld: sound/soc/fsl/fsl_mqs.o: in function `fsl_mqs_sm_read': | fsl_mqs.c:(.text+0x1ee): undefined reference to `scmi_imx_misc_ctrl_get' This however only works after changing the dependency in the SND_SOC_FSL_MQS driver as well, which uses 'select IMX_SCMI_MISC_DRV' to turn on a driver it depends on. This is generally a bad idea, so the best solution is to change that into a dependency. To allow the ASoC driver to keep building with the SCMI support, this needs to be an optional dependency that enforces the link-time dependency if IMX_SCMI_MISC_DRV is a loadable module but not depend on it if that is disabled. Fixes: 61c9f03e22fc ("firmware: arm_scmi: Add initial support for i.MX MISC protocol") Fixes: 101c9023594a ("ASoC: fsl_mqs: Support accessing registers by scmi interface") Signed-off-by: Arnd Bergmann Acked-by: Mark Brown Acked-by: Shengjiu Wang Message-Id: <20241115230555.2435004-1-arnd@kernel.org> Signed-off-by: Sudeep Holla --- drivers/firmware/arm_scmi/vendors/imx/Kconfig | 1 + drivers/firmware/imx/Kconfig | 1 - sound/soc/fsl/Kconfig | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/firmware/arm_scmi/vendors/imx/Kconfig b/drivers/firmware/arm_scmi/vendors/imx/Kconfig index 2883ed24a84d6..a01bf5e47301d 100644 --- a/drivers/firmware/arm_scmi/vendors/imx/Kconfig +++ b/drivers/firmware/arm_scmi/vendors/imx/Kconfig @@ -15,6 +15,7 @@ config IMX_SCMI_BBM_EXT config IMX_SCMI_MISC_EXT tristate "i.MX SCMI MISC EXTENSION" depends on ARM_SCMI_PROTOCOL || (COMPILE_TEST && OF) + depends on IMX_SCMI_MISC_DRV default y if ARCH_MXC help This enables i.MX System MISC control logic such as gpio expander diff --git a/drivers/firmware/imx/Kconfig b/drivers/firmware/imx/Kconfig index 477d3f32d99a6..907cd149c40a8 100644 --- a/drivers/firmware/imx/Kconfig +++ b/drivers/firmware/imx/Kconfig @@ -25,7 +25,6 @@ config IMX_SCU config IMX_SCMI_MISC_DRV tristate "IMX SCMI MISC Protocol driver" - depends on IMX_SCMI_MISC_EXT || COMPILE_TEST default y if ARCH_MXC help The System Controller Management Interface firmware (SCMI FW) is diff --git a/sound/soc/fsl/Kconfig b/sound/soc/fsl/Kconfig index 8e88830e8e577..678540b782805 100644 --- a/sound/soc/fsl/Kconfig +++ b/sound/soc/fsl/Kconfig @@ -29,8 +29,8 @@ config SND_SOC_FSL_SAI config SND_SOC_FSL_MQS tristate "Medium Quality Sound (MQS) module support" depends on SND_SOC_FSL_SAI + depends on IMX_SCMI_MISC_DRV || !IMX_SCMI_MISC_DRV select REGMAP_MMIO - select IMX_SCMI_MISC_DRV if IMX_SCMI_MISC_EXT !=n help Say Y if you want to add Medium Quality Sound (MQS) support for the Freescale CPUs. From 88a6e2f67cc94f751a74409ab4c21e5fc8ea6757 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 26 Nov 2024 11:47:25 -0300 Subject: [PATCH 006/330] perf machine: Initialize machine->env to address a segfault Its used from trace__run(), for the 'perf trace' live mode, i.e. its strace-like, non-perf.data file processing mode, the most common one. The trace__run() function will set trace->host using machine__new_host() that is supposed to give a machine instance representing the running machine, and since we'll use perf_env__arch_strerrno() to get the right errno -> string table, we need to use machine->env, so initialize it in machine__new_host(). Before the patch: (gdb) run trace --errno-summary -a sleep 1 Summary of events: gvfs-afc-volume (3187), 2 events, 0.0% syscall calls errors total min avg max stddev (msec) (msec) (msec) (msec) (%) --------------- -------- ------ -------- --------- --------- --------- ------ pselect6 1 0 0.000 0.000 0.000 0.000 0.00% GUsbEventThread (3519), 2 events, 0.0% syscall calls errors total min avg max stddev (msec) (msec) (msec) (msec) (%) --------------- -------- ------ -------- --------- --------- --------- ------ poll 1 0 0.000 0.000 0.000 0.000 0.00% Program received signal SIGSEGV, Segmentation fault. 0x00000000005caba0 in perf_env__arch_strerrno (env=0x0, err=110) at util/env.c:478 478 if (env->arch_strerrno == NULL) (gdb) bt #0 0x00000000005caba0 in perf_env__arch_strerrno (env=0x0, err=110) at util/env.c:478 #1 0x00000000004b75d2 in thread__dump_stats (ttrace=0x14f58f0, trace=0x7fffffffa5b0, fp=0x7ffff6ff74e0 <_IO_2_1_stderr_>) at builtin-trace.c:4673 #2 0x00000000004b78bf in trace__fprintf_thread (fp=0x7ffff6ff74e0 <_IO_2_1_stderr_>, thread=0x10fa0b0, trace=0x7fffffffa5b0) at builtin-trace.c:4708 #3 0x00000000004b7ad9 in trace__fprintf_thread_summary (trace=0x7fffffffa5b0, fp=0x7ffff6ff74e0 <_IO_2_1_stderr_>) at builtin-trace.c:4747 #4 0x00000000004b656e in trace__run (trace=0x7fffffffa5b0, argc=2, argv=0x7fffffffde60) at builtin-trace.c:4456 #5 0x00000000004ba43e in cmd_trace (argc=2, argv=0x7fffffffde60) at builtin-trace.c:5487 #6 0x00000000004c0414 in run_builtin (p=0xec3068 , argc=5, argv=0x7fffffffde60) at perf.c:351 #7 0x00000000004c06bb in handle_internal_command (argc=5, argv=0x7fffffffde60) at perf.c:404 #8 0x00000000004c0814 in run_argv (argcp=0x7fffffffdc4c, argv=0x7fffffffdc40) at perf.c:448 #9 0x00000000004c0b5d in main (argc=5, argv=0x7fffffffde60) at perf.c:560 (gdb) After: root@number:~# perf trace -a --errno-summary sleep 1 pw-data-loop (2685), 1410 events, 16.0% syscall calls errors total min avg max stddev (msec) (msec) (msec) (msec) (%) --------------- -------- ------ -------- --------- --------- --------- ------ epoll_wait 188 0 983.428 0.000 5.231 15.595 8.68% ioctl 94 0 0.811 0.004 0.009 0.016 2.82% read 188 0 0.322 0.001 0.002 0.006 5.15% write 141 0 0.280 0.001 0.002 0.018 8.39% timerfd_settime 94 0 0.138 0.001 0.001 0.007 6.47% gnome-control-c (179406), 1848 events, 20.9% syscall calls errors total min avg max stddev (msec) (msec) (msec) (msec) (%) --------------- -------- ------ -------- --------- --------- --------- ------ poll 222 0 959.577 0.000 4.322 21.414 11.40% recvmsg 150 0 0.539 0.001 0.004 0.013 5.12% write 300 0 0.442 0.001 0.001 0.007 3.29% read 150 0 0.183 0.001 0.001 0.009 5.53% getpid 102 0 0.101 0.000 0.001 0.008 7.82% root@number:~# Fixes: 54373b5d53c1f6aa ("perf env: Introduce perf_env__arch_strerrno()") Reported-by: Veronika Molnarova Signed-off-by: Arnaldo Carvalho de Melo Acked-by: Veronika Molnarova Acked-by: Michael Petlan Tested-by: Michael Petlan Link: https://lore.kernel.org/r/Z0XffUgNSv_9OjOi@x1 Signed-off-by: Namhyung Kim --- tools/perf/util/machine.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 4f0ac998b0ccf..27d5345d2b307 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -134,6 +134,8 @@ struct machine *machine__new_host(void) if (machine__create_kernel_maps(machine) < 0) goto out_delete; + + machine->env = &perf_env; } return machine; From 03c7527e97f73081633d773f9f8c2373f9854b25 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 3 Dec 2024 19:02:36 +0000 Subject: [PATCH 007/330] KVM: arm64: Do not allow ID_AA64MMFR0_EL1.ASIDbits to be overridden Catalin reports that a hypervisor lying to a guest about the size of the ASID field may result in unexpected issues: - if the underlying HW does only supports 8 bit ASIDs, the ASID field in a TLBI VAE1* operation is only 8 bits, and the HW will ignore the other 8 bits - if on the contrary the HW is 16 bit capable, the ASID field in the same TLBI operation is always 16 bits, irrespective of the value of TCR_ELx.AS. This could lead to missed invalidations if the guest was lead to assume that the HW had 8 bit ASIDs while they really are 16 bit wide. In order to avoid any potential disaster that would be hard to debug, prenent the migration between a host with 8 bit ASIDs to one with wider ASIDs (the converse was obviously always forbidden). This is also consistent with what we already do for VMIDs. If it becomes absolutely mandatory to support such a migration path in the future, we will have to trap and emulate all TLBIs, something that nobody should look forward to. Fixes: d5a32b60dc18 ("KVM: arm64: Allow userspace to change ID_AA64MMFR{0-2}_EL1") Reported-by: Catalin Marinas Signed-off-by: Marc Zyngier Cc: stable@vger.kernel.org Cc: Will Deacon Cc: Mark Rutland Cc: Marc Zyngier Cc: James Morse Cc: Oliver Upton Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20241203190236.505759-1-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/sys_regs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 83c6b4a07ef56..e2a5c2918d9e5 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -2618,7 +2618,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_WRITABLE(ID_AA64MMFR0_EL1, ~(ID_AA64MMFR0_EL1_RES0 | ID_AA64MMFR0_EL1_TGRAN4_2 | ID_AA64MMFR0_EL1_TGRAN64_2 | - ID_AA64MMFR0_EL1_TGRAN16_2)), + ID_AA64MMFR0_EL1_TGRAN16_2 | + ID_AA64MMFR0_EL1_ASIDBITS)), ID_WRITABLE(ID_AA64MMFR1_EL1, ~(ID_AA64MMFR1_EL1_RES0 | ID_AA64MMFR1_EL1_HCX | ID_AA64MMFR1_EL1_TWED | From be7e611274224b23776469d7f7ce50e25ac53142 Mon Sep 17 00:00:00 2001 From: Keisuke Nishimura Date: Sat, 30 Nov 2024 15:49:53 +0100 Subject: [PATCH 008/330] KVM: arm64: vgic-its: Add error handling in vgic_its_cache_translation The return value of xa_store() needs to be checked. This fix adds an error handling path that resolves the kref inconsistency on failure. As suggested by Oliver Upton, this function does not return the error code intentionally because the translation cache is best effort. Fixes: 8201d1028caa ("KVM: arm64: vgic-its: Maintain a translation cache per ITS") Signed-off-by: Keisuke Nishimura Suggested-by: Oliver Upton Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20241130144952.23729-1-keisuke.nishimura@inria.fr Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-its.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index f4c4494645c34..fb96802799c6f 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -608,12 +608,22 @@ static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its, lockdep_assert_held(&its->its_lock); vgic_get_irq_kref(irq); + old = xa_store(&its->translation_cache, cache_key, irq, GFP_KERNEL_ACCOUNT); + + /* + * Put the reference taken on @irq if the store fails. Intentionally do + * not return the error as the translation cache is best effort. + */ + if (xa_is_err(old)) { + vgic_put_irq(kvm, irq); + return; + } + /* * We could have raced with another CPU caching the same * translation behind our back, ensure we don't leak a * reference if that is the case. */ - old = xa_store(&its->translation_cache, cache_key, irq, GFP_KERNEL_ACCOUNT); if (old) vgic_put_irq(kvm, old); } From 6fe437cfe2cdc797b03f63b338a13fac96ed6a08 Mon Sep 17 00:00:00 2001 From: Levi Yun Date: Tue, 3 Dec 2024 14:31:08 +0000 Subject: [PATCH 009/330] firmware: arm_ffa: Fix the race around setting ffa_dev->properties Currently, ffa_dev->properties is set after the ffa_device_register() call return in ffa_setup_partitions(). This could potentially result in a race where the partition's properties is accessed while probing struct ffa_device before it is set. Update the ffa_device_register() to receive ffa_partition_info so all the data from the partition information received from the firmware can be updated into the struct ffa_device before the calling device_register() in ffa_device_register(). Fixes: e781858488b9 ("firmware: arm_ffa: Add initial FFA bus support for device enumeration") Signed-off-by: Levi Yun Message-Id: <20241203143109.1030514-2-yeoreum.yun@arm.com> Signed-off-by: Sudeep Holla --- drivers/firmware/arm_ffa/bus.c | 15 +++++++++++---- drivers/firmware/arm_ffa/driver.c | 7 +------ include/linux/arm_ffa.h | 13 ++++++++----- 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/drivers/firmware/arm_ffa/bus.c b/drivers/firmware/arm_ffa/bus.c index eb17d03b66fec..dfda5ffc14db7 100644 --- a/drivers/firmware/arm_ffa/bus.c +++ b/drivers/firmware/arm_ffa/bus.c @@ -187,13 +187,18 @@ bool ffa_device_is_valid(struct ffa_device *ffa_dev) return valid; } -struct ffa_device *ffa_device_register(const uuid_t *uuid, int vm_id, - const struct ffa_ops *ops) +struct ffa_device * +ffa_device_register(const struct ffa_partition_info *part_info, + const struct ffa_ops *ops) { int id, ret; + uuid_t uuid; struct device *dev; struct ffa_device *ffa_dev; + if (!part_info) + return NULL; + id = ida_alloc_min(&ffa_bus_id, 1, GFP_KERNEL); if (id < 0) return NULL; @@ -210,9 +215,11 @@ struct ffa_device *ffa_device_register(const uuid_t *uuid, int vm_id, dev_set_name(&ffa_dev->dev, "arm-ffa-%d", id); ffa_dev->id = id; - ffa_dev->vm_id = vm_id; + ffa_dev->vm_id = part_info->id; + ffa_dev->properties = part_info->properties; ffa_dev->ops = ops; - uuid_copy(&ffa_dev->uuid, uuid); + import_uuid(&uuid, (u8 *)part_info->uuid); + uuid_copy(&ffa_dev->uuid, &uuid); ret = device_register(&ffa_dev->dev); if (ret) { diff --git a/drivers/firmware/arm_ffa/driver.c b/drivers/firmware/arm_ffa/driver.c index b14cbdae94e82..2c2ec3c35f156 100644 --- a/drivers/firmware/arm_ffa/driver.c +++ b/drivers/firmware/arm_ffa/driver.c @@ -1387,7 +1387,6 @@ static struct notifier_block ffa_bus_nb = { static int ffa_setup_partitions(void) { int count, idx, ret; - uuid_t uuid; struct ffa_device *ffa_dev; struct ffa_dev_part_info *info; struct ffa_partition_info *pbuf, *tpbuf; @@ -1406,23 +1405,19 @@ static int ffa_setup_partitions(void) xa_init(&drv_info->partition_info); for (idx = 0, tpbuf = pbuf; idx < count; idx++, tpbuf++) { - import_uuid(&uuid, (u8 *)tpbuf->uuid); - /* Note that if the UUID will be uuid_null, that will require * ffa_bus_notifier() to find the UUID of this partition id * with help of ffa_device_match_uuid(). FF-A v1.1 and above * provides UUID here for each partition as part of the * discovery API and the same is passed. */ - ffa_dev = ffa_device_register(&uuid, tpbuf->id, &ffa_drv_ops); + ffa_dev = ffa_device_register(tpbuf, &ffa_drv_ops); if (!ffa_dev) { pr_err("%s: failed to register partition ID 0x%x\n", __func__, tpbuf->id); continue; } - ffa_dev->properties = tpbuf->properties; - if (drv_info->version > FFA_VERSION_1_0 && !(tpbuf->properties & FFA_PARTITION_AARCH64_EXEC)) ffa_mode_32bit_set(ffa_dev); diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index a28e2a6a13d05..74169dd0f6594 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -166,9 +166,12 @@ static inline void *ffa_dev_get_drvdata(struct ffa_device *fdev) return dev_get_drvdata(&fdev->dev); } +struct ffa_partition_info; + #if IS_REACHABLE(CONFIG_ARM_FFA_TRANSPORT) -struct ffa_device *ffa_device_register(const uuid_t *uuid, int vm_id, - const struct ffa_ops *ops); +struct ffa_device * +ffa_device_register(const struct ffa_partition_info *part_info, + const struct ffa_ops *ops); void ffa_device_unregister(struct ffa_device *ffa_dev); int ffa_driver_register(struct ffa_driver *driver, struct module *owner, const char *mod_name); @@ -176,9 +179,9 @@ void ffa_driver_unregister(struct ffa_driver *driver); bool ffa_device_is_valid(struct ffa_device *ffa_dev); #else -static inline -struct ffa_device *ffa_device_register(const uuid_t *uuid, int vm_id, - const struct ffa_ops *ops) +static inline struct ffa_device * +ffa_device_register(const struct ffa_partition_info *part_info, + const struct ffa_ops *ops) { return NULL; } From 70327137eb3eb64f346191dcd0ee5140d5ab34d8 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 3 Dec 2024 17:23:16 +0100 Subject: [PATCH 010/330] gpio: GPIO_MVEBU should not default to y when compile-testing Merely enabling compile-testing should not enable additional functionality. Fixes: 956ee0c5c969c9ca ("gpio: mvebu: allow building the module with COMPILE_TEST=y") Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/6b9e55dbf544297d5acf743f6fa473791ab10644.1733242798.git.geert+renesas@glider.be Signed-off-by: Bartosz Golaszewski --- drivers/gpio/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index 56fee58e281e7..93ee3aa092f81 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -482,8 +482,9 @@ config GPIO_MT7621 Say yes here to support the Mediatek MT7621 SoC GPIO device. config GPIO_MVEBU - def_bool y + bool "Marvell Orion and EBU GPIO support" if COMPILE_TEST depends on PLAT_ORION || ARCH_MVEBU || COMPILE_TEST + default PLAT_ORION || ARCH_MVEBU select GENERIC_IRQ_CHIP select REGMAP_MMIO From 97264eaaba0122a5b7e8ddd7bf4ff3ac57c2b170 Mon Sep 17 00:00:00 2001 From: Vitalii Mordan Date: Thu, 21 Nov 2024 14:47:00 +0300 Subject: [PATCH 011/330] usb: ehci-hcd: fix call balance of clocks handling routines If the clocks priv->iclk and priv->fclk were not enabled in ehci_hcd_sh_probe, they should not be disabled in any path. Conversely, if they was enabled in ehci_hcd_sh_probe, they must be disabled in all error paths to ensure proper cleanup. Found by Linux Verification Center (linuxtesting.org) with Klever. Fixes: 63c845522263 ("usb: ehci-hcd: Add support for SuperH EHCI.") Cc: stable@vger.kernel.org # ff30bd6a6618: sh: clk: Fix clk_enable() to return 0 on NULL clk Signed-off-by: Vitalii Mordan Reviewed-by: Alan Stern Link: https://lore.kernel.org/r/20241121114700.2100520-1-mordan@ispras.ru Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/ehci-sh.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/usb/host/ehci-sh.c b/drivers/usb/host/ehci-sh.c index 5d0d972fb7b1b..2d23690d72c50 100644 --- a/drivers/usb/host/ehci-sh.c +++ b/drivers/usb/host/ehci-sh.c @@ -119,8 +119,12 @@ static int ehci_hcd_sh_probe(struct platform_device *pdev) if (IS_ERR(priv->iclk)) priv->iclk = NULL; - clk_enable(priv->fclk); - clk_enable(priv->iclk); + ret = clk_enable(priv->fclk); + if (ret) + goto fail_request_resource; + ret = clk_enable(priv->iclk); + if (ret) + goto fail_iclk; ret = usb_add_hcd(hcd, irq, IRQF_SHARED); if (ret != 0) { @@ -136,6 +140,7 @@ static int ehci_hcd_sh_probe(struct platform_device *pdev) fail_add_hcd: clk_disable(priv->iclk); +fail_iclk: clk_disable(priv->fclk); fail_request_resource: From 282615d1334494063ff8dc70f35168d3f6b9c4c9 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Tue, 19 Nov 2024 18:50:17 +0800 Subject: [PATCH 012/330] dt-bindings: phy: imx8mq-usb: correct reference to usb-switch.yaml The i.MX95 usb-phy can work with or without orientation-switch. With current setting, if usb-phy works without orientation-switch, the dt-schema check will show below error: phy@4c1f0040: 'oneOf' conditional failed, one must be fixed: 'port' is a required property 'ports' is a required property from schema $id: http://devicetree.org/schemas/phy/fsl,imx8mq-usb-phy.yaml# This will correct the behavior of referring to usb-switch.yaml. Signed-off-by: Xu Yang Acked-by: Rob Herring (Arm) Link: https://lore.kernel.org/r/20241119105017.917833-1-xu.yang_2@nxp.com Signed-off-by: Greg Kroah-Hartman --- .../devicetree/bindings/phy/fsl,imx8mq-usb-phy.yaml | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/Documentation/devicetree/bindings/phy/fsl,imx8mq-usb-phy.yaml b/Documentation/devicetree/bindings/phy/fsl,imx8mq-usb-phy.yaml index 6d6d211883aee..daee0c0fc9153 100644 --- a/Documentation/devicetree/bindings/phy/fsl,imx8mq-usb-phy.yaml +++ b/Documentation/devicetree/bindings/phy/fsl,imx8mq-usb-phy.yaml @@ -113,11 +113,8 @@ allOf: maxItems: 1 - if: - properties: - compatible: - contains: - enum: - - fsl,imx95-usb-phy + required: + - orientation-switch then: $ref: /schemas/usb/usb-switch.yaml# From 0d2ada05227881f3d0722ca2364e3f7a860a301f Mon Sep 17 00:00:00 2001 From: Mark Tomlinson Date: Mon, 25 Nov 2024 11:14:30 +1300 Subject: [PATCH 013/330] usb: host: max3421-hcd: Correctly abort a USB request. If the current USB request was aborted, the spi thread would not respond to any further requests. This is because the "curr_urb" pointer would not become NULL, so no further requests would be taken off the queue. The solution here is to set the "urb_done" flag, as this will cause the correct handling of the URB. Also clear interrupts that should only be expected if an URB is in progress. Fixes: 2d53139f3162 ("Add support for using a MAX3421E chip as a host driver.") Cc: stable Signed-off-by: Mark Tomlinson Link: https://lore.kernel.org/r/20241124221430.1106080-1-mark.tomlinson@alliedtelesis.co.nz Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/max3421-hcd.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/usb/host/max3421-hcd.c b/drivers/usb/host/max3421-hcd.c index 9fe4f48b18980..0881fdd1823e0 100644 --- a/drivers/usb/host/max3421-hcd.c +++ b/drivers/usb/host/max3421-hcd.c @@ -779,11 +779,17 @@ max3421_check_unlink(struct usb_hcd *hcd) retval = 1; dev_dbg(&spi->dev, "%s: URB %p unlinked=%d", __func__, urb, urb->unlinked); - usb_hcd_unlink_urb_from_ep(hcd, urb); - spin_unlock_irqrestore(&max3421_hcd->lock, - flags); - usb_hcd_giveback_urb(hcd, urb, 0); - spin_lock_irqsave(&max3421_hcd->lock, flags); + if (urb == max3421_hcd->curr_urb) { + max3421_hcd->urb_done = 1; + max3421_hcd->hien &= ~(BIT(MAX3421_HI_HXFRDN_BIT) | + BIT(MAX3421_HI_RCVDAV_BIT)); + } else { + usb_hcd_unlink_urb_from_ep(hcd, urb); + spin_unlock_irqrestore(&max3421_hcd->lock, + flags); + usb_hcd_giveback_urb(hcd, urb, 0); + spin_lock_irqsave(&max3421_hcd->lock, flags); + } } } } From 645d56e4cc74e953284809d096532c1955918a28 Mon Sep 17 00:00:00 2001 From: Joe Hattori Date: Thu, 21 Nov 2024 11:34:29 +0900 Subject: [PATCH 014/330] usb: typec: anx7411: fix fwnode_handle reference leak An fwnode_handle and usb_role_switch are obtained with an incremented refcount in anx7411_typec_port_probe(), however the refcounts are not decremented in the error path. The fwnode_handle is also not decremented in the .remove() function. Therefore, call fwnode_handle_put() and usb_role_switch_put() accordingly. Fixes: fe6d8a9c8e64 ("usb: typec: anx7411: Add Analogix PD ANX7411 support") Cc: stable@vger.kernel.org Signed-off-by: Joe Hattori Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20241121023429.962848-1-joe@pf.is.s.u-tokyo.ac.jp Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/anx7411.c | 47 +++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/drivers/usb/typec/anx7411.c b/drivers/usb/typec/anx7411.c index d1e7c487ddfbb..95607efb9f7e9 100644 --- a/drivers/usb/typec/anx7411.c +++ b/drivers/usb/typec/anx7411.c @@ -1021,6 +1021,16 @@ static void anx7411_port_unregister_altmodes(struct typec_altmode **adev) } } +static void anx7411_port_unregister(struct typec_params *typecp) +{ + fwnode_handle_put(typecp->caps.fwnode); + anx7411_port_unregister_altmodes(typecp->port_amode); + if (typecp->port) + typec_unregister_port(typecp->port); + if (typecp->role_sw) + usb_role_switch_put(typecp->role_sw); +} + static int anx7411_usb_mux_set(struct typec_mux_dev *mux, struct typec_mux_state *state) { @@ -1154,34 +1164,34 @@ static int anx7411_typec_port_probe(struct anx7411_data *ctx, ret = fwnode_property_read_string(fwnode, "power-role", &buf); if (ret) { dev_err(dev, "power-role not found: %d\n", ret); - return ret; + goto put_fwnode; } ret = typec_find_port_power_role(buf); if (ret < 0) - return ret; + goto put_fwnode; cap->type = ret; ret = fwnode_property_read_string(fwnode, "data-role", &buf); if (ret) { dev_err(dev, "data-role not found: %d\n", ret); - return ret; + goto put_fwnode; } ret = typec_find_port_data_role(buf); if (ret < 0) - return ret; + goto put_fwnode; cap->data = ret; ret = fwnode_property_read_string(fwnode, "try-power-role", &buf); if (ret) { dev_err(dev, "try-power-role not found: %d\n", ret); - return ret; + goto put_fwnode; } ret = typec_find_power_role(buf); if (ret < 0) - return ret; + goto put_fwnode; cap->prefer_role = ret; /* Get source pdos */ @@ -1193,7 +1203,7 @@ static int anx7411_typec_port_probe(struct anx7411_data *ctx, typecp->src_pdo_nr); if (ret < 0) { dev_err(dev, "source cap validate failed: %d\n", ret); - return -EINVAL; + goto put_fwnode; } typecp->caps_flags |= HAS_SOURCE_CAP; @@ -1207,7 +1217,7 @@ static int anx7411_typec_port_probe(struct anx7411_data *ctx, typecp->sink_pdo_nr); if (ret < 0) { dev_err(dev, "sink cap validate failed: %d\n", ret); - return -EINVAL; + goto put_fwnode; } for (i = 0; i < typecp->sink_pdo_nr; i++) { @@ -1251,13 +1261,21 @@ static int anx7411_typec_port_probe(struct anx7411_data *ctx, ret = PTR_ERR(ctx->typec.port); ctx->typec.port = NULL; dev_err(dev, "Failed to register type c port %d\n", ret); - return ret; + goto put_usb_role_switch; } typec_port_register_altmodes(ctx->typec.port, NULL, ctx, ctx->typec.port_amode, MAX_ALTMODE); return 0; + +put_usb_role_switch: + if (ctx->typec.role_sw) + usb_role_switch_put(ctx->typec.role_sw); +put_fwnode: + fwnode_handle_put(fwnode); + + return ret; } static int anx7411_typec_check_connection(struct anx7411_data *ctx) @@ -1523,8 +1541,7 @@ static int anx7411_i2c_probe(struct i2c_client *client) destroy_workqueue(plat->workqueue); free_typec_port: - typec_unregister_port(plat->typec.port); - anx7411_port_unregister_altmodes(plat->typec.port_amode); + anx7411_port_unregister(&plat->typec); free_typec_switch: anx7411_unregister_switch(plat); @@ -1548,17 +1565,11 @@ static void anx7411_i2c_remove(struct i2c_client *client) i2c_unregister_device(plat->spi_client); - if (plat->typec.role_sw) - usb_role_switch_put(plat->typec.role_sw); - anx7411_unregister_mux(plat); anx7411_unregister_switch(plat); - if (plat->typec.port) - typec_unregister_port(plat->typec.port); - - anx7411_port_unregister_altmodes(plat->typec.port_amode); + anx7411_port_unregister(&plat->typec); } static const struct i2c_device_id anx7411_id[] = { From ef42b906df5c57d0719b69419df9dfd25f25c161 Mon Sep 17 00:00:00 2001 From: Joe Hattori Date: Tue, 26 Nov 2024 10:49:09 +0900 Subject: [PATCH 015/330] usb: typec: anx7411: fix OF node reference leaks in anx7411_typec_switch_probe() The refcounts of the OF nodes obtained by of_get_child_by_name() calls in anx7411_typec_switch_probe() are not decremented. Replace them with device_get_named_child_node() calls and store the return values to the newly created fwnode_handle fields in anx7411_data, and call fwnode_handle_put() on them in the error path and in the unregister functions. Fixes: e45d7337dc0e ("usb: typec: anx7411: Use of_get_child_by_name() instead of of_find_node_by_name()") Cc: stable@vger.kernel.org Signed-off-by: Joe Hattori Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20241126014909.3687917-1-joe@pf.is.s.u-tokyo.ac.jp Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/anx7411.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/usb/typec/anx7411.c b/drivers/usb/typec/anx7411.c index 95607efb9f7e9..0ae0a5ee3fae0 100644 --- a/drivers/usb/typec/anx7411.c +++ b/drivers/usb/typec/anx7411.c @@ -290,6 +290,8 @@ struct anx7411_data { struct power_supply *psy; struct power_supply_desc psy_desc; struct device *dev; + struct fwnode_handle *switch_node; + struct fwnode_handle *mux_node; }; static u8 snk_identity[] = { @@ -1099,6 +1101,7 @@ static void anx7411_unregister_mux(struct anx7411_data *ctx) if (ctx->typec.typec_mux) { typec_mux_unregister(ctx->typec.typec_mux); ctx->typec.typec_mux = NULL; + fwnode_handle_put(ctx->mux_node); } } @@ -1107,6 +1110,7 @@ static void anx7411_unregister_switch(struct anx7411_data *ctx) if (ctx->typec.typec_switch) { typec_switch_unregister(ctx->typec.typec_switch); ctx->typec.typec_switch = NULL; + fwnode_handle_put(ctx->switch_node); } } @@ -1114,28 +1118,29 @@ static int anx7411_typec_switch_probe(struct anx7411_data *ctx, struct device *dev) { int ret; - struct device_node *node; - node = of_get_child_by_name(dev->of_node, "orientation_switch"); - if (!node) + ctx->switch_node = device_get_named_child_node(dev, "orientation_switch"); + if (!ctx->switch_node) return 0; - ret = anx7411_register_switch(ctx, dev, &node->fwnode); + ret = anx7411_register_switch(ctx, dev, ctx->switch_node); if (ret) { dev_err(dev, "failed register switch"); + fwnode_handle_put(ctx->switch_node); return ret; } - node = of_get_child_by_name(dev->of_node, "mode_switch"); - if (!node) { + ctx->mux_node = device_get_named_child_node(dev, "mode_switch"); + if (!ctx->mux_node) { dev_err(dev, "no typec mux exist"); ret = -ENODEV; goto unregister_switch; } - ret = anx7411_register_mux(ctx, dev, &node->fwnode); + ret = anx7411_register_mux(ctx, dev, ctx->mux_node); if (ret) { dev_err(dev, "failed register mode switch"); + fwnode_handle_put(ctx->mux_node); ret = -ENODEV; goto unregister_switch; } From a4faee01179a4d9cbad9ba6be2da8637c68c1438 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Tue, 26 Nov 2024 11:28:41 +0800 Subject: [PATCH 016/330] usb: dwc3: imx8mp: fix software node kernel dump When unbind and bind the device again, kernel will dump below warning: [ 173.972130] sysfs: cannot create duplicate filename '/devices/platform/soc/4c010010.usb/software_node' [ 173.981564] CPU: 2 UID: 0 PID: 536 Comm: sh Not tainted 6.12.0-rc6-06344-g2aed7c4a5c56 #144 [ 173.989923] Hardware name: NXP i.MX95 15X15 board (DT) [ 173.995062] Call trace: [ 173.997509] dump_backtrace+0x90/0xe8 [ 174.001196] show_stack+0x18/0x24 [ 174.004524] dump_stack_lvl+0x74/0x8c [ 174.008198] dump_stack+0x18/0x24 [ 174.011526] sysfs_warn_dup+0x64/0x80 [ 174.015201] sysfs_do_create_link_sd+0xf0/0xf8 [ 174.019656] sysfs_create_link+0x20/0x40 [ 174.023590] software_node_notify+0x90/0x100 [ 174.027872] device_create_managed_software_node+0xec/0x108 ... The '4c010010.usb' device is a platform device created during the initcall and is never removed, which causes its associated software node to persist indefinitely. The existing device_create_managed_software_node() does not provide a corresponding removal function. Replace device_create_managed_software_node() with the device_add_software_node() and device_remove_software_node() pair to ensure proper addition and removal of software nodes, addressing this issue. Fixes: a9400f1979a0 ("usb: dwc3: imx8mp: add 2 software managed quirk properties for host mode") Cc: stable@vger.kernel.org Reviewed-by: Frank Li Signed-off-by: Xu Yang Acked-by: Thinh Nguyen Link: https://lore.kernel.org/r/20241126032841.2458338-1-xu.yang_2@nxp.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/dwc3-imx8mp.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/drivers/usb/dwc3/dwc3-imx8mp.c b/drivers/usb/dwc3/dwc3-imx8mp.c index 356812cbcd884..3edc5aca76f97 100644 --- a/drivers/usb/dwc3/dwc3-imx8mp.c +++ b/drivers/usb/dwc3/dwc3-imx8mp.c @@ -129,6 +129,16 @@ static void dwc3_imx8mp_wakeup_disable(struct dwc3_imx8mp *dwc3_imx) writel(val, dwc3_imx->hsio_blk_base + USB_WAKEUP_CTRL); } +static const struct property_entry dwc3_imx8mp_properties[] = { + PROPERTY_ENTRY_BOOL("xhci-missing-cas-quirk"), + PROPERTY_ENTRY_BOOL("xhci-skip-phy-init-quirk"), + {}, +}; + +static const struct software_node dwc3_imx8mp_swnode = { + .properties = dwc3_imx8mp_properties, +}; + static irqreturn_t dwc3_imx8mp_interrupt(int irq, void *_dwc3_imx) { struct dwc3_imx8mp *dwc3_imx = _dwc3_imx; @@ -148,17 +158,6 @@ static irqreturn_t dwc3_imx8mp_interrupt(int irq, void *_dwc3_imx) return IRQ_HANDLED; } -static int dwc3_imx8mp_set_software_node(struct device *dev) -{ - struct property_entry props[3] = { 0 }; - int prop_idx = 0; - - props[prop_idx++] = PROPERTY_ENTRY_BOOL("xhci-missing-cas-quirk"); - props[prop_idx++] = PROPERTY_ENTRY_BOOL("xhci-skip-phy-init-quirk"); - - return device_create_managed_software_node(dev, props, NULL); -} - static int dwc3_imx8mp_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; @@ -221,17 +220,17 @@ static int dwc3_imx8mp_probe(struct platform_device *pdev) if (err < 0) goto disable_rpm; - err = dwc3_imx8mp_set_software_node(dev); + err = device_add_software_node(dev, &dwc3_imx8mp_swnode); if (err) { err = -ENODEV; - dev_err(dev, "failed to create software node\n"); + dev_err(dev, "failed to add software node\n"); goto disable_rpm; } err = of_platform_populate(node, NULL, NULL, dev); if (err) { dev_err(&pdev->dev, "failed to create dwc3 core\n"); - goto disable_rpm; + goto remove_swnode; } dwc3_imx->dwc3 = of_find_device_by_node(dwc3_np); @@ -255,6 +254,8 @@ static int dwc3_imx8mp_probe(struct platform_device *pdev) depopulate: of_platform_depopulate(dev); +remove_swnode: + device_remove_software_node(dev); disable_rpm: pm_runtime_disable(dev); pm_runtime_put_noidle(dev); @@ -268,6 +269,7 @@ static void dwc3_imx8mp_remove(struct platform_device *pdev) pm_runtime_get_sync(dev); of_platform_depopulate(dev); + device_remove_software_node(dev); pm_runtime_disable(dev); pm_runtime_put_noidle(dev); From 82937056967da052cbc04b4435c13db84192dc52 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 27 Nov 2024 08:02:11 +0100 Subject: [PATCH 017/330] usb: gadget: midi2: Fix interpretation of is_midi1 bits The UMP Function Block info m1.0 field (represented by is_midi1 sysfs entry) is an enumeration from 0 to 2, while the midi2 gadget driver incorrectly copies it to the corresponding snd_ump_block_info.flags bits as-is. This made the wrong bit flags set when m1.0 = 2. This patch corrects the wrong interpretation of is_midi1 bits. Fixes: 29ee7a4dddd5 ("usb: gadget: midi2: Add configfs support") Cc: stable@vger.kernel.org Signed-off-by: Takashi Iwai Link: https://lore.kernel.org/r/20241127070213.8232-1-tiwai@suse.de Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/f_midi2.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/usb/gadget/function/f_midi2.c b/drivers/usb/gadget/function/f_midi2.c index ee3d9e3578f79..12e866fb311d6 100644 --- a/drivers/usb/gadget/function/f_midi2.c +++ b/drivers/usb/gadget/function/f_midi2.c @@ -1591,7 +1591,11 @@ static int f_midi2_create_card(struct f_midi2 *midi2) fb->info.midi_ci_version = b->midi_ci_version; fb->info.ui_hint = reverse_dir(b->ui_hint); fb->info.sysex8_streams = b->sysex8_streams; - fb->info.flags |= b->is_midi1; + if (b->is_midi1 < 2) + fb->info.flags |= b->is_midi1; + else + fb->info.flags |= SNDRV_UMP_BLOCK_IS_MIDI1 | + SNDRV_UMP_BLOCK_IS_LOWSPEED; strscpy(fb->info.name, ump_fb_name(b), sizeof(fb->info.name)); } From d2ec94fbc431cc77ed53d4480bdc856669c2b5aa Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Tue, 5 Nov 2024 17:01:20 +0800 Subject: [PATCH 018/330] usb: core: hcd: only check primary hcd skip_phy_initialization Before commit 53a2d95df836 ("usb: core: add phy notify connect and disconnect"), phy initialization will be skipped even when shared hcd doesn't set skip_phy_initialization flag. However, the situation is changed after the commit. The hcd.c will initialize phy when add shared hcd. This behavior is unexpected for some platforms which will handle phy initialization by themselves. To avoid the issue, this will only check skip_phy_initialization flag of primary hcd since shared hcd normally follow primary hcd setting. Fixes: 53a2d95df836 ("usb: core: add phy notify connect and disconnect") Cc: stable@vger.kernel.org Signed-off-by: Xu Yang Link: https://lore.kernel.org/r/20241105090120.2438366-1-xu.yang_2@nxp.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hcd.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index 500dc35e64774..0b2490347b9fe 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -2794,8 +2794,14 @@ int usb_add_hcd(struct usb_hcd *hcd, int retval; struct usb_device *rhdev; struct usb_hcd *shared_hcd; + int skip_phy_initialization; - if (!hcd->skip_phy_initialization) { + if (usb_hcd_is_primary_hcd(hcd)) + skip_phy_initialization = hcd->skip_phy_initialization; + else + skip_phy_initialization = hcd->primary_hcd->skip_phy_initialization; + + if (!skip_phy_initialization) { if (usb_hcd_is_primary_hcd(hcd)) { hcd->phy_roothub = usb_phy_roothub_alloc(hcd->self.sysdev); if (IS_ERR(hcd->phy_roothub)) From a48f744bef9ee74814a9eccb030b02223e48c76c Mon Sep 17 00:00:00 2001 From: Neal Frager Date: Mon, 2 Dec 2024 23:41:51 +0530 Subject: [PATCH 019/330] usb: dwc3: xilinx: make sure pipe clock is deselected in usb2 only mode When the USB3 PHY is not defined in the Linux device tree, there could still be a case where there is a USB3 PHY active on the board and enabled by the first stage bootloader. If serdes clock is being used then the USB will fail to enumerate devices in 2.0 only mode. To solve this, make sure that the PIPE clock is deselected whenever the USB3 PHY is not defined and guarantees that the USB2 only mode will work in all cases. Fixes: 9678f3361afc ("usb: dwc3: xilinx: Skip resets and USB3 register settings for USB2.0 mode") Cc: stable@vger.kernel.org Signed-off-by: Neal Frager Signed-off-by: Radhey Shyam Pandey Acked-by: Peter Korsgaard Link: https://lore.kernel.org/r/1733163111-1414816-1-git-send-email-radhey.shyam.pandey@amd.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/dwc3-xilinx.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/usb/dwc3/dwc3-xilinx.c b/drivers/usb/dwc3/dwc3-xilinx.c index e3738e1610db2..a33a42ba0249a 100644 --- a/drivers/usb/dwc3/dwc3-xilinx.c +++ b/drivers/usb/dwc3/dwc3-xilinx.c @@ -121,8 +121,11 @@ static int dwc3_xlnx_init_zynqmp(struct dwc3_xlnx *priv_data) * in use but the usb3-phy entry is missing from the device tree. * Therefore, skip these operations in this case. */ - if (!priv_data->usb3_phy) + if (!priv_data->usb3_phy) { + /* Deselect the PIPE Clock Select bit in FPD PIPE Clock register */ + writel(PIPE_CLK_DESELECT, priv_data->regs + XLNX_USB_FPD_PIPE_CLK); goto skip_usb3_phy; + } crst = devm_reset_control_get_exclusive(dev, "usb_crst"); if (IS_ERR(crst)) { From ce15d6b3d5c3c6f78290066be0f0a4fd89cdeb5b Mon Sep 17 00:00:00 2001 From: Radhey Shyam Pandey Date: Tue, 3 Dec 2024 00:18:22 +0530 Subject: [PATCH 020/330] usb: misc: onboard_usb_dev: skip suspend/resume sequence for USB5744 SMBus support USB5744 SMBus initialization is done once in probe() and doing it in resume is not supported so avoid going into suspend and reset the HUB. There is a sysfs property 'always_powered_in_suspend' to implement this feature but since default state should be set to a working configuration so override this property value. It fixes the suspend/resume testcase on Kria KR260 Robotics Starter Kit. Fixes: 6782311d04df ("usb: misc: onboard_usb_dev: add Microchip usb5744 SMBus programming support") Cc: stable@vger.kernel.org Signed-off-by: Radhey Shyam Pandey Link: https://lore.kernel.org/r/1733165302-1694891-1-git-send-email-radhey.shyam.pandey@amd.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/misc/onboard_usb_dev.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/usb/misc/onboard_usb_dev.c b/drivers/usb/misc/onboard_usb_dev.c index 36b11127280f3..75ac3c6aa92d0 100644 --- a/drivers/usb/misc/onboard_usb_dev.c +++ b/drivers/usb/misc/onboard_usb_dev.c @@ -407,8 +407,10 @@ static int onboard_dev_probe(struct platform_device *pdev) } if (of_device_is_compatible(pdev->dev.of_node, "usb424,2744") || - of_device_is_compatible(pdev->dev.of_node, "usb424,5744")) + of_device_is_compatible(pdev->dev.of_node, "usb424,5744")) { err = onboard_dev_5744_i2c_init(client); + onboard_dev->always_powered_in_suspend = true; + } put_device(&client->dev); if (err < 0) From 4cfbca86f6a8b801f3254e0e3c8f2b1d2d64be2b Mon Sep 17 00:00:00 2001 From: Lianqin Hu Date: Tue, 3 Dec 2024 12:14:16 +0000 Subject: [PATCH 021/330] usb: gadget: u_serial: Fix the issue that gs_start_io crashed due to accessing null pointer Considering that in some extreme cases, when u_serial driver is accessed by multiple threads, Thread A is executing the open operation and calling the gs_open, Thread B is executing the disconnect operation and calling the gserial_disconnect function,The port->port_usb pointer will be set to NULL. E.g. Thread A Thread B gs_open() gadget_unbind_driver() gs_start_io() composite_disconnect() gs_start_rx() gserial_disconnect() ... ... spin_unlock(&port->port_lock) status = usb_ep_queue() spin_lock(&port->port_lock) spin_lock(&port->port_lock) port->port_usb = NULL gs_free_requests(port->port_usb->in) spin_unlock(&port->port_lock) Crash This causes thread A to access a null pointer (port->port_usb is null) when calling the gs_free_requests function, causing a crash. If port_usb is NULL, the release request will be skipped as it will be done by gserial_disconnect. So add a null pointer check to gs_start_io before attempting to access the value of the pointer port->port_usb. Call trace: gs_start_io+0x164/0x25c gs_open+0x108/0x13c tty_open+0x314/0x638 chrdev_open+0x1b8/0x258 do_dentry_open+0x2c4/0x700 vfs_open+0x2c/0x3c path_openat+0xa64/0xc60 do_filp_open+0xb8/0x164 do_sys_openat2+0x84/0xf0 __arm64_sys_openat+0x70/0x9c invoke_syscall+0x58/0x114 el0_svc_common+0x80/0xe0 do_el0_svc+0x1c/0x28 el0_svc+0x38/0x68 Fixes: c1dca562be8a ("usb gadget: split out serial core") Cc: stable@vger.kernel.org Suggested-by: Prashanth K Signed-off-by: Lianqin Hu Acked-by: Prashanth K Link: https://lore.kernel.org/r/TYUPR06MB62178DC3473F9E1A537DCD02D2362@TYUPR06MB6217.apcprd06.prod.outlook.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/u_serial.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/usb/gadget/function/u_serial.c b/drivers/usb/gadget/function/u_serial.c index 0a8c05b2746b4..53d9fc41acc52 100644 --- a/drivers/usb/gadget/function/u_serial.c +++ b/drivers/usb/gadget/function/u_serial.c @@ -579,9 +579,12 @@ static int gs_start_io(struct gs_port *port) * we didn't in gs_start_tx() */ tty_wakeup(port->port.tty); } else { - gs_free_requests(ep, head, &port->read_allocated); - gs_free_requests(port->port_usb->in, &port->write_pool, - &port->write_allocated); + /* Free reqs only if we are still connected */ + if (port->port_usb) { + gs_free_requests(ep, head, &port->read_allocated); + gs_free_requests(port->port_usb->in, &port->write_pool, + &port->write_allocated); + } status = -EIO; } From 336f72d3cbf5cc17df2947bbbd2ba6e2509f17e8 Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Mon, 2 Dec 2024 01:16:29 +0100 Subject: [PATCH 022/330] usb: dwc2: Fix HCD resume The Raspberry Pi can suffer on interrupt storms on HCD resume. The dwc2 driver sometimes misses to enable HCD_FLAG_HW_ACCESSIBLE before re-enabling the interrupts. This causes a situation where both handler ignore a incoming port interrupt and force the upper layers to disable the dwc2 interrupt line. This leaves the USB interface in a unusable state: irq 66: nobody cared (try booting with the "irqpoll" option) CPU: 0 PID: 0 Comm: swapper/0 Tainted: G W 6.10.0-rc3 Hardware name: BCM2835 Call trace: unwind_backtrace from show_stack+0x10/0x14 show_stack from dump_stack_lvl+0x50/0x64 dump_stack_lvl from __report_bad_irq+0x38/0xc0 __report_bad_irq from note_interrupt+0x2ac/0x2f4 note_interrupt from handle_irq_event+0x88/0x8c handle_irq_event from handle_level_irq+0xb4/0x1ac handle_level_irq from generic_handle_domain_irq+0x24/0x34 generic_handle_domain_irq from bcm2836_chained_handle_irq+0x24/0x28 bcm2836_chained_handle_irq from generic_handle_domain_irq+0x24/0x34 generic_handle_domain_irq from generic_handle_arch_irq+0x34/0x44 generic_handle_arch_irq from __irq_svc+0x88/0xb0 Exception stack(0xc1b01f20 to 0xc1b01f68) 1f20: 0005c0d4 00000001 00000000 00000000 c1b09780 c1d6b32c c1b04e54 c1a5eae8 1f40: c1b04e90 00000000 00000000 00000000 c1d6a8a0 c1b01f70 c11d2da8 c11d4160 1f60: 60000013 ffffffff __irq_svc from default_idle_call+0x1c/0xb0 default_idle_call from do_idle+0x21c/0x284 do_idle from cpu_startup_entry+0x28/0x2c cpu_startup_entry from kernel_init+0x0/0x12c handlers: [] dwc2_handle_common_intr [<75cd278b>] usb_hcd_irq Disabling IRQ #66 So enable the HCD_FLAG_HW_ACCESSIBLE flag in case there is a port connection. Fixes: c74c26f6e398 ("usb: dwc2: Fix partial power down exiting by system resume") Closes: https://lore.kernel.org/linux-usb/3fd0c2fb-4752-45b3-94eb-42352703e1fd@gmx.net/T/ Link: https://lore.kernel.org/all/5e8cbce0-3260-2971-484f-fc73a3b2bd28@synopsys.com/ Signed-off-by: Stefan Wahren Link: https://lore.kernel.org/r/20241202001631.75473-2-wahrenst@gmx.net Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc2/hcd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/dwc2/hcd.c b/drivers/usb/dwc2/hcd.c index cb54390e7de48..26a320c1979aa 100644 --- a/drivers/usb/dwc2/hcd.c +++ b/drivers/usb/dwc2/hcd.c @@ -4431,6 +4431,7 @@ static int _dwc2_hcd_resume(struct usb_hcd *hcd) * Power Down mode. */ if (hprt0 & HPRT0_CONNSTS) { + set_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags); hsotg->lx_state = DWC2_L0; goto unlock; } From a8d3e4a734599c7d0f6735f8db8a812e503395dd Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Mon, 2 Dec 2024 01:16:30 +0100 Subject: [PATCH 023/330] usb: dwc2: hcd: Fix GetPortStatus & SetPortFeature On Rasperry Pis without onboard USB hub the power cycle during power connect init only disable the port but never enabled it again: usb usb1-port1: attempt power cycle The port relevant part in dwc2_hcd_hub_control() is skipped in case port_connect_status = 0 under the assumption the core is or will be soon in device mode. But this assumption is wrong, because after ClearPortFeature USB_PORT_FEAT_POWER the port_connect_status will also be 0 and SetPortFeature (incl. USB_PORT_FEAT_POWER) will be a no-op. Fix the behavior of dwc2_hcd_hub_control() by replacing the port_connect_status check with dwc2_is_device_mode(). Link: https://github.com/raspberrypi/linux/issues/6247 Fixes: 7359d482eb4d ("staging: HCD files for the DWC2 driver") Signed-off-by: Stefan Wahren Link: https://lore.kernel.org/r/20241202001631.75473-3-wahrenst@gmx.net Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc2/hcd.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/drivers/usb/dwc2/hcd.c b/drivers/usb/dwc2/hcd.c index 26a320c1979aa..fb4fbd8c43df6 100644 --- a/drivers/usb/dwc2/hcd.c +++ b/drivers/usb/dwc2/hcd.c @@ -3546,11 +3546,9 @@ static int dwc2_hcd_hub_control(struct dwc2_hsotg *hsotg, u16 typereq, port_status |= USB_PORT_STAT_C_OVERCURRENT << 16; } - if (!hsotg->flags.b.port_connect_status) { + if (dwc2_is_device_mode(hsotg)) { /* - * The port is disconnected, which means the core is - * either in device mode or it soon will be. Just - * return 0's for the remainder of the port status + * Just return 0's for the remainder of the port status * since the port register can't be read if the core * is in device mode. */ @@ -3620,13 +3618,11 @@ static int dwc2_hcd_hub_control(struct dwc2_hsotg *hsotg, u16 typereq, if (wvalue != USB_PORT_FEAT_TEST && (!windex || windex > 1)) goto error; - if (!hsotg->flags.b.port_connect_status) { + if (dwc2_is_device_mode(hsotg)) { /* - * The port is disconnected, which means the core is - * either in device mode or it soon will be. Just - * return without doing anything since the port - * register can't be written if the core is in device - * mode. + * Just return 0's for the remainder of the port status + * since the port register can't be read if the core + * is in device mode. */ break; } From 1cf1bd88f129f3bd647fead4dca270a5894274bb Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Mon, 2 Dec 2024 01:16:31 +0100 Subject: [PATCH 024/330] usb: dwc2: Fix HCD port connection race On Raspberry Pis without onboard USB hub frequent device reconnects can trigger a interrupt storm after DWC2 entered host clock gating. This is caused by a race between _dwc2_hcd_suspend() and the port interrupt, which sets port_connect_status. The issue occurs if port_connect_status is still 1, but there is no connection anymore: usb 1-1: USB disconnect, device number 25 dwc2 3f980000.usb: _dwc2_hcd_suspend: port_connect_status: 1 dwc2 3f980000.usb: Entering host clock gating. Disabling IRQ #66 irq 66: nobody cared (try booting with the "irqpoll" option) CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted 6.12.0-gc1bb81b13202-dirty #322 Hardware name: BCM2835 Call trace: unwind_backtrace from show_stack+0x10/0x14 show_stack from dump_stack_lvl+0x50/0x64 dump_stack_lvl from __report_bad_irq+0x38/0xc0 __report_bad_irq from note_interrupt+0x2ac/0x2f4 note_interrupt from handle_irq_event+0x88/0x8c handle_irq_event from handle_level_irq+0xb4/0x1ac handle_level_irq from generic_handle_domain_irq+0x24/0x34 generic_handle_domain_irq from bcm2836_chained_handle_irq+0x24/0x28 bcm2836_chained_handle_irq from generic_handle_domain_irq+0x24/0x34 generic_handle_domain_irq from generic_handle_arch_irq+0x34/0x44 generic_handle_arch_irq from __irq_svc+0x88/0xb0 Exception stack(0xc1d01f20 to 0xc1d01f68) 1f20: 0004ef3c 00000001 00000000 00000000 c1d09780 c1f6bb5c c1d04e54 c1c60ca8 1f40: c1d04e94 00000000 00000000 c1d092a8 c1f6af20 c1d01f70 c1211b98 c1212f40 1f60: 60000013 ffffffff __irq_svc from default_idle_call+0x1c/0xb0 default_idle_call from do_idle+0x21c/0x284 do_idle from cpu_startup_entry+0x28/0x2c cpu_startup_entry from kernel_init+0x0/0x12c handlers: [] dwc2_handle_common_intr [<58bf98a3>] usb_hcd_irq Disabling IRQ #66 So avoid this by reading the connection status directly. Fixes: 113f86d0c302 ("usb: dwc2: Update partial power down entering by system suspend") Signed-off-by: Stefan Wahren Link: https://lore.kernel.org/r/20241202001631.75473-4-wahrenst@gmx.net Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc2/hcd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/dwc2/hcd.c b/drivers/usb/dwc2/hcd.c index fb4fbd8c43df6..8c3941ecaaf5d 100644 --- a/drivers/usb/dwc2/hcd.c +++ b/drivers/usb/dwc2/hcd.c @@ -4345,7 +4345,7 @@ static int _dwc2_hcd_suspend(struct usb_hcd *hcd) if (hsotg->bus_suspended) goto skip_power_saving; - if (hsotg->flags.b.port_connect_status == 0) + if (!(dwc2_read_hprt0(hsotg) & HPRT0_CONNSTS)) goto skip_power_saving; switch (hsotg->params.power_down) { From e37b383df91ba9bde9c6a31bf3ea9072561c5126 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Bartosik?= Date: Tue, 3 Dec 2024 10:23:18 +0000 Subject: [PATCH 025/330] usb: typec: ucsi: Fix completion notifications MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit OPM PPM LPM | 1.send cmd | | |-------------------------->| | | |-- | | | | 2.set busy bit in CCI | | |<- | | 3.notify the OPM | | |<--------------------------| | | | 4.send cmd to be executed | | |-------------------------->| | | | | | 5.cmd completed | | |<--------------------------| | | | | |-- | | | | 6.set cmd completed | | |<- bit in CCI | | | | | 7.notify the OPM | | |<--------------------------| | | | | | 8.handle notification | | | from point 3, read CCI | | |<--------------------------| | | | | When the PPM receives command from the OPM (p.1) it sets the busy bit in the CCI (p.2), sends notification to the OPM (p.3) and forwards the command to be executed by the LPM (p.4). When the PPM receives command completion from the LPM (p.5) it sets command completion bit in the CCI (p.6) and sends notification to the OPM (p.7). If command execution by the LPM is fast enough then when the OPM starts handling the notification from p.3 in p.8 and reads the CCI value it will see command completion bit set and will call complete(). Then complete() might be called again when the OPM handles notification from p.7. This fix replaces test_bit() with test_and_clear_bit() in ucsi_notify_common() in order to call complete() only once per request. This fix also reinitializes completion variable in ucsi_sync_control_common() before a command is sent. Fixes: 584e8df58942 ("usb: typec: ucsi: extract common code for command handling") Cc: stable@vger.kernel.org Signed-off-by: Łukasz Bartosik Reviewed-by: Dmitry Baryshkov Reviewed-by: Heikki Krogerus Reviewed-by: Benson Leung Link: https://lore.kernel.org/r/20241203102318.3386345-1-ukaszb@chromium.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/ucsi/ucsi.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/usb/typec/ucsi/ucsi.c b/drivers/usb/typec/ucsi/ucsi.c index c435c0835744a..7a65a7672e183 100644 --- a/drivers/usb/typec/ucsi/ucsi.c +++ b/drivers/usb/typec/ucsi/ucsi.c @@ -46,11 +46,11 @@ void ucsi_notify_common(struct ucsi *ucsi, u32 cci) ucsi_connector_change(ucsi, UCSI_CCI_CONNECTOR(cci)); if (cci & UCSI_CCI_ACK_COMPLETE && - test_bit(ACK_PENDING, &ucsi->flags)) + test_and_clear_bit(ACK_PENDING, &ucsi->flags)) complete(&ucsi->complete); if (cci & UCSI_CCI_COMMAND_COMPLETE && - test_bit(COMMAND_PENDING, &ucsi->flags)) + test_and_clear_bit(COMMAND_PENDING, &ucsi->flags)) complete(&ucsi->complete); } EXPORT_SYMBOL_GPL(ucsi_notify_common); @@ -65,6 +65,8 @@ int ucsi_sync_control_common(struct ucsi *ucsi, u64 command) else set_bit(COMMAND_PENDING, &ucsi->flags); + reinit_completion(&ucsi->complete); + ret = ucsi->ops->async_control(ucsi, command); if (ret) goto out_clear_bit; From 33ead7e538183b1348ba60af90027240a10de751 Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Tue, 3 Dec 2024 12:00:10 -0800 Subject: [PATCH 026/330] usb: typec: ucsi: Fix connector status writing past buffer size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Similar to commit 65c4c9447bfc ("usb: typec: ucsi: Fix a missing bits to bytes conversion in ucsi_init()"), there was a missing conversion from bits to bytes. Here the outcome is worse though: since the value is lower than UCSI_MAX_DATA_LENGTH, instead of bailing out with an error, it writes past the buffer size. The error is then seen in other places like below: Oops: general protection fault, probably for non-canonical address 0x891e812cd0ed968: 0000 [#1] PREEMPT SMP NOPTI CPU: 3 UID: 110 PID: 906 Comm: prometheus-node Not tainted 6.13.0-rc1-xe #1 Hardware name: Intel Corporation Lunar Lake Client Platform/LNL-M LP5 RVP1, BIOS LNLMFWI1.R00.3222.D84.2410171025 10/17/2024 RIP: 0010:power_supply_get_property+0x3e/0xe0 Code: 85 c0 7e 4f 4c 8b 07 89 f3 49 89 d4 49 8b 48 20 48 85 c9 74 72 49 8b 70 18 31 d2 31 c0 eb 0b 83 c2 01 48 63 c2 48 39 c8 73 5d <3b> 1c 86 75 f0 49 8b 40 28 4c 89 e2 89 de ff d0 0f 1f 00 5b 41 5c RSP: 0018:ffffc900017dfa50 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000011 RCX: c963b02c06092008 RDX: 0000000000000000 RSI: 0891e812cd0ed968 RDI: ffff888121dd6800 RBP: ffffc900017dfa68 R08: ffff88810a4024b8 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffffc900017dfa78 R13: ffff888121dd6800 R14: ffff888138ad2c00 R15: ffff88810c57c528 FS: 00007713a2ffd6c0(0000) GS:ffff88846f380000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000c0004b1000 CR3: 0000000121ce8003 CR4: 0000000000f72ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff07f0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: ? show_regs+0x6c/0x80 ? die_addr+0x37/0xa0 ? exc_general_protection+0x1c1/0x440 ? asm_exc_general_protection+0x27/0x30 ? power_supply_get_property+0x3e/0xe0 power_supply_hwmon_read+0x50/0xe0 hwmon_attr_show+0x46/0x170 dev_attr_show+0x1a/0x70 sysfs_kf_seq_show+0xaa/0x120 kernfs_seq_show+0x41/0x60 Just use the buffer size as argument to fix it. Fixes: 226ff2e681d0 ("usb: typec: ucsi: Convert connector specific commands to bitmaps") Signed-off-by: Lucas De Marchi Reviewed-by: Thomas Weißschuh Reviewed-by: Sebastian Reichel Reported-by: Chaitanya Kumar Borah Closes: https://lore.kernel.org/all/SJ1PR11MB6129CCD82CD78D8EE6E27EF4B9362@SJ1PR11MB6129.namprd11.prod.outlook.com/ Suggested-by: Thomas Weißschuh Reviewed-by: Heikki Krogerus Tested-by: Chaitanya Kumar Borah Link: https://lore.kernel.org/r/20241203200010.2821132-1-lucas.demarchi@intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/ucsi/ucsi.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/typec/ucsi/ucsi.c b/drivers/usb/typec/ucsi/ucsi.c index 7a65a7672e183..fcf499cc9458c 100644 --- a/drivers/usb/typec/ucsi/ucsi.c +++ b/drivers/usb/typec/ucsi/ucsi.c @@ -653,7 +653,8 @@ static void ucsi_unregister_altmodes(struct ucsi_connector *con, u8 recipient) static int ucsi_get_connector_status(struct ucsi_connector *con, bool conn_ack) { u64 command = UCSI_GET_CONNECTOR_STATUS | UCSI_CONNECTOR_NUMBER(con->num); - size_t size = min(UCSI_GET_CONNECTOR_STATUS_SIZE, UCSI_MAX_DATA_LENGTH(con->ucsi)); + size_t size = min(sizeof(con->status), + UCSI_MAX_DATA_LENGTH(con->ucsi)); int ret; ret = ucsi_send_command_common(con->ucsi, command, &con->status, size, conn_ack); From 7cc0e0a43a91052477c2921f924a37d9c3891f0c Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Mon, 25 Nov 2024 13:58:56 +0200 Subject: [PATCH 027/330] serial: sh-sci: Check if TX data was written to device in .tx_empty() On the Renesas RZ/G3S, when doing suspend to RAM, the uart_suspend_port() is called. The uart_suspend_port() calls 3 times the struct uart_port::ops::tx_empty() before shutting down the port. According to the documentation, the struct uart_port::ops::tx_empty() API tests whether the transmitter FIFO and shifter for the port is empty. The Renesas RZ/G3S SCIFA IP reports the number of data units stored in the transmit FIFO through the FDR (FIFO Data Count Register). The data units in the FIFOs are written in the shift register and transmitted from there. The TEND bit in the Serial Status Register reports if the data was transmitted from the shift register. In the previous code, in the tx_empty() API implemented by the sh-sci driver, it is considered that the TX is empty if the hardware reports the TEND bit set and the number of data units in the FIFO is zero. According to the HW manual, the TEND bit has the following meaning: 0: Transmission is in the waiting state or in progress. 1: Transmission is completed. It has been noticed that when opening the serial device w/o using it and then switch to a power saving mode, the tx_empty() call in the uart_port_suspend() function fails, leading to the "Unable to drain transmitter" message being printed on the console. This is because the TEND=0 if nothing has been transmitted and the FIFOs are empty. As the TEND=0 has double meaning (waiting state, in progress) we can't determined the scenario described above. Add a software workaround for this. This sets a variable if any data has been sent on the serial console (when using PIO) or if the DMA callback has been called (meaning something has been transmitted). In the tx_empty() API the status of the DMA transaction is also checked and if it is completed or in progress the code falls back in checking the hardware registers instead of relying on the software variable. Fixes: 73a19e4c0301 ("serial: sh-sci: Add DMA support.") Cc: stable@vger.kernel.org Signed-off-by: Claudiu Beznea Link: https://lore.kernel.org/r/20241125115856.513642-1-claudiu.beznea.uj@bp.renesas.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/sh-sci.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/drivers/tty/serial/sh-sci.c b/drivers/tty/serial/sh-sci.c index df523c7444230..924b803af4400 100644 --- a/drivers/tty/serial/sh-sci.c +++ b/drivers/tty/serial/sh-sci.c @@ -157,6 +157,7 @@ struct sci_port { bool has_rtscts; bool autorts; + bool tx_occurred; }; #define SCI_NPORTS CONFIG_SERIAL_SH_SCI_NR_UARTS @@ -850,6 +851,7 @@ static void sci_transmit_chars(struct uart_port *port) { struct tty_port *tport = &port->state->port; unsigned int stopped = uart_tx_stopped(port); + struct sci_port *s = to_sci_port(port); unsigned short status; unsigned short ctrl; int count; @@ -885,6 +887,7 @@ static void sci_transmit_chars(struct uart_port *port) } sci_serial_out(port, SCxTDR, c); + s->tx_occurred = true; port->icount.tx++; } while (--count > 0); @@ -1241,6 +1244,8 @@ static void sci_dma_tx_complete(void *arg) if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS) uart_write_wakeup(port); + s->tx_occurred = true; + if (!kfifo_is_empty(&tport->xmit_fifo)) { s->cookie_tx = 0; schedule_work(&s->work_tx); @@ -1731,6 +1736,19 @@ static void sci_flush_buffer(struct uart_port *port) s->cookie_tx = -EINVAL; } } + +static void sci_dma_check_tx_occurred(struct sci_port *s) +{ + struct dma_tx_state state; + enum dma_status status; + + if (!s->chan_tx) + return; + + status = dmaengine_tx_status(s->chan_tx, s->cookie_tx, &state); + if (status == DMA_COMPLETE || status == DMA_IN_PROGRESS) + s->tx_occurred = true; +} #else /* !CONFIG_SERIAL_SH_SCI_DMA */ static inline void sci_request_dma(struct uart_port *port) { @@ -1740,6 +1758,10 @@ static inline void sci_free_dma(struct uart_port *port) { } +static void sci_dma_check_tx_occurred(struct sci_port *s) +{ +} + #define sci_flush_buffer NULL #endif /* !CONFIG_SERIAL_SH_SCI_DMA */ @@ -2076,6 +2098,12 @@ static unsigned int sci_tx_empty(struct uart_port *port) { unsigned short status = sci_serial_in(port, SCxSR); unsigned short in_tx_fifo = sci_txfill(port); + struct sci_port *s = to_sci_port(port); + + sci_dma_check_tx_occurred(s); + + if (!s->tx_occurred) + return TIOCSER_TEMT; return (status & SCxSR_TEND(port)) && !in_tx_fifo ? TIOCSER_TEMT : 0; } @@ -2247,6 +2275,7 @@ static int sci_startup(struct uart_port *port) dev_dbg(port->dev, "%s(%d)\n", __func__, port->line); + s->tx_occurred = false; sci_request_dma(port); ret = sci_request_irq(s); From f580786ea900c74ed8046e617e5030d4c37a578b Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Sat, 23 Nov 2024 13:03:25 -0700 Subject: [PATCH 028/330] staging: gpib: Make GPIB_NI_PCI_ISA depend on HAS_IOPORT After commit 78ecb0375685 ("staging: gpib: make port I/O code conditional"), building tnt4882.ko on platforms without HAS_IOPORT (such as hexagon and s390) fails with: ERROR: modpost: "inb_wrapper" [drivers/staging/gpib/tnt4882/tnt4882.ko] undefined! ERROR: modpost: "inw_wrapper" [drivers/staging/gpib/tnt4882/tnt4882.ko] undefined! ERROR: modpost: "nec7210_locking_ioport_write_byte" [drivers/staging/gpib/tnt4882/tnt4882.ko] undefined! ERROR: modpost: "nec7210_locking_ioport_read_byte" [drivers/staging/gpib/tnt4882/tnt4882.ko] undefined! ERROR: modpost: "outb_wrapper" [drivers/staging/gpib/tnt4882/tnt4882.ko] undefined! ERROR: modpost: "outw_wrapper" [drivers/staging/gpib/tnt4882/tnt4882.ko] undefined! Only allow tnt4882.ko to be built when CONFIG_HAS_IOPORT is set to avoid this build failure, as this driver unconditionally needs it. Fixes: 78ecb0375685 ("staging: gpib: make port I/O code conditional") Signed-off-by: Nathan Chancellor Tested-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20241123-gpib-tnt4882-depends-on-has_ioport-v1-1-033c58b64751@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/staging/gpib/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/staging/gpib/Kconfig b/drivers/staging/gpib/Kconfig index 95308d15a5551..9ee4323164654 100644 --- a/drivers/staging/gpib/Kconfig +++ b/drivers/staging/gpib/Kconfig @@ -62,6 +62,7 @@ config GPIB_CEC_PCI config GPIB_NI_PCI_ISA tristate "NI PCI/ISA compatible boards" depends on ISA_BUS || PCI || PCMCIA + depends on HAS_IOPORT select GPIB_COMMON select GPIB_NEC7210 help From 80242c4a9d50e71a4df1b7fa19b922ac88aab22b Mon Sep 17 00:00:00 2001 From: Dave Penkler Date: Wed, 4 Dec 2024 14:47:36 +0100 Subject: [PATCH 029/330] staging: gpib: Workaround for ppc build failure Make GPIB_FMH depend on !PPC Reported_by: Stephen Rothwell Link: https://lore.kernel.org/all/20241015165538.634707e5@canb.auug.org.au/ Link: https://lore.kernel.org/r/20241204134736.6660-1-dpenkler@gmail.com Signed-off-by: Dave Penkler Signed-off-by: Greg Kroah-Hartman --- drivers/staging/gpib/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/gpib/Kconfig b/drivers/staging/gpib/Kconfig index 9ee4323164654..fb865d5b721ea 100644 --- a/drivers/staging/gpib/Kconfig +++ b/drivers/staging/gpib/Kconfig @@ -129,7 +129,7 @@ config GPIB_FMH tristate "FMH FPGA based devices" select GPIB_COMMON select GPIB_NEC7210 - depends on BROKEN + depends on !PPC depends on OF && PCI help GPIB driver for fmhess FPGA based devices From 1d8c2d4b89b40f49ef4a70dcb2eaea43695025ce Mon Sep 17 00:00:00 2001 From: Dave Penkler Date: Wed, 4 Dec 2024 15:57:13 +0100 Subject: [PATCH 030/330] staging: gpib: Fix faulty workaround for assignment in if This was detected by Coverity. Add the missing assignment in the else branch of the if Reported-by: Kees Bakker Fixes: fce79512a96a ("staging: gpib: Add LPVO DIY USB GPIB driver") Signed-off-by: Dave Penkler Link: https://lore.kernel.org/r/20241204145713.11889-5-dpenkler@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/staging/gpib/lpvo_usb_gpib/lpvo_usb_gpib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/gpib/lpvo_usb_gpib/lpvo_usb_gpib.c b/drivers/staging/gpib/lpvo_usb_gpib/lpvo_usb_gpib.c index 796c3a5be5451..267651a15fa00 100644 --- a/drivers/staging/gpib/lpvo_usb_gpib/lpvo_usb_gpib.c +++ b/drivers/staging/gpib/lpvo_usb_gpib/lpvo_usb_gpib.c @@ -901,7 +901,7 @@ static int usb_gpib_read(gpib_board_t *board, } else { /* we are in the closing sequence */ - + c = nc; if (c == ETX) { c = one_char(board, &b); if (c == ACK) { From 48e8a8160dba523af7074e668b2a458250838a3d Mon Sep 17 00:00:00 2001 From: Dave Penkler Date: Wed, 4 Dec 2024 17:21:28 +0100 Subject: [PATCH 031/330] staging: gpib: Fix i386 build issue These drivers cast resource_type_t to void * causing the build to fail. With CONFIG_X86_PAE enabled the resource_size_t type is a 64bit unsigned int which cannot be cast to a 32 bit pointer. Disable these drivers if X68_PAE is enabled Reported-by: Guenter Roeck Closes: https://lore.kernel.org/all/f10e976e-7a04-4454-b38d-39cd18f142da@roeck-us.net/ Fixes: e9dc69956d4d ("staging: gpib: Add Computer Boards GPIB driver") Fixes: e1339245eba3 ("staging: gpib: Add Computer Equipment Corporation GPIB driver") Fixes: bb1bd92fa0f2 ("staging: gpib: Add ines GPIB driver") Fixes: 0cd5b05551e0 ("staging: gpib: Add TNT4882 chip based GPIB driver") Signed-off-by: Dave Penkler Link: https://lore.kernel.org/r/20241204162128.25617-1-dpenkler@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/staging/gpib/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/staging/gpib/Kconfig b/drivers/staging/gpib/Kconfig index fb865d5b721ea..259f3ff336467 100644 --- a/drivers/staging/gpib/Kconfig +++ b/drivers/staging/gpib/Kconfig @@ -50,6 +50,7 @@ config GPIB_CEC_PCI tristate "CEC PCI board" depends on PCI depends on HAS_IOPORT + depends on !X86_PAE select GPIB_COMMON select GPIB_NEC7210 help @@ -63,6 +64,7 @@ config GPIB_NI_PCI_ISA tristate "NI PCI/ISA compatible boards" depends on ISA_BUS || PCI || PCMCIA depends on HAS_IOPORT + depends on !X86_PAE select GPIB_COMMON select GPIB_NEC7210 help @@ -86,6 +88,7 @@ config GPIB_CB7210 tristate "Measurement Computing compatible boards" depends on HAS_IOPORT depends on ISA_BUS || PCI || PCMCIA + depends on !X86_PAE select GPIB_COMMON select GPIB_NEC7210 help @@ -175,6 +178,7 @@ config GPIB_INES tristate "INES" depends on PCI || ISA_BUS || PCMCIA depends on HAS_IOPORT + depends on !X86_PAE select GPIB_COMMON select GPIB_NEC7210 help From 265e98f72bac6c41a4492d3e30a8e5fd22fe0779 Mon Sep 17 00:00:00 2001 From: Suraj Sonawane Date: Mon, 18 Nov 2024 21:56:09 +0530 Subject: [PATCH 032/330] acpi: nfit: vmalloc-out-of-bounds Read in acpi_nfit_ctl Fix an issue detected by syzbot with KASAN: BUG: KASAN: vmalloc-out-of-bounds in cmd_to_func drivers/acpi/nfit/ core.c:416 [inline] BUG: KASAN: vmalloc-out-of-bounds in acpi_nfit_ctl+0x20e8/0x24a0 drivers/acpi/nfit/core.c:459 The issue occurs in cmd_to_func when the call_pkg->nd_reserved2 array is accessed without verifying that call_pkg points to a buffer that is appropriately sized as a struct nd_cmd_pkg. This can lead to out-of-bounds access and undefined behavior if the buffer does not have sufficient space. To address this, a check was added in acpi_nfit_ctl() to ensure that buf is not NULL and that buf_len is less than sizeof(*call_pkg) before accessing it. This ensures safe access to the members of call_pkg, including the nd_reserved2 array. Reported-by: syzbot+7534f060ebda6b8b51b3@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=7534f060ebda6b8b51b3 Tested-by: syzbot+7534f060ebda6b8b51b3@syzkaller.appspotmail.com Fixes: ebe9f6f19d80 ("acpi/nfit: Fix bus command validation") Signed-off-by: Suraj Sonawane Reviewed-by: Alison Schofield Reviewed-by: Dave Jiang Link: https://patch.msgid.link/20241118162609.29063-1-surajsonawane0215@gmail.com Signed-off-by: Ira Weiny --- drivers/acpi/nfit/core.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index 5429ec9ef06f0..a5d47819b3a4e 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -454,8 +454,13 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, if (cmd_rc) *cmd_rc = -EINVAL; - if (cmd == ND_CMD_CALL) + if (cmd == ND_CMD_CALL) { + if (!buf || buf_len < sizeof(*call_pkg)) + return -EINVAL; + call_pkg = buf; + } + func = cmd_to_func(nfit_mem, cmd, call_pkg, &family); if (func < 0) return func; From 5fc3a088ee2de55a6b39b7ee18484e01f377ab8a Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 2 Dec 2024 19:53:39 -0800 Subject: [PATCH 033/330] tools headers: Sync uapi/drm/drm.h with the kernel sources To pick up the changes in this cset: 56c594d8df64e726 ("drm: add DRM_SET_CLIENT_NAME ioctl") This addresses these perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/uapi/drm/drm.h include/uapi/drm/drm.h Please see tools/include/uapi/README for further details. Reviewed-by: James Clark Cc: David Airlie Cc: Simona Vetter Cc: Maarten Lankhorst Cc: Maxime Ripard Cc: Thomas Zimmermann Cc: dri-devel@lists.freedesktop.org Link: https://lore.kernel.org/r/20241203035349.1901262-2-namhyung@kernel.org Signed-off-by: Namhyung Kim --- tools/include/uapi/drm/drm.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tools/include/uapi/drm/drm.h b/tools/include/uapi/drm/drm.h index 16122819edfef..7fba37b94401a 100644 --- a/tools/include/uapi/drm/drm.h +++ b/tools/include/uapi/drm/drm.h @@ -1024,6 +1024,13 @@ struct drm_crtc_queue_sequence { __u64 user_data; /* user data passed to event */ }; +#define DRM_CLIENT_NAME_MAX_LEN 64 +struct drm_set_client_name { + __u64 name_len; + __u64 name; +}; + + #if defined(__cplusplus) } #endif @@ -1288,6 +1295,16 @@ extern "C" { */ #define DRM_IOCTL_MODE_CLOSEFB DRM_IOWR(0xD0, struct drm_mode_closefb) +/** + * DRM_IOCTL_SET_CLIENT_NAME - Attach a name to a drm_file + * + * Having a name allows for easier tracking and debugging. + * The length of the name (without null ending char) must be + * <= DRM_CLIENT_NAME_MAX_LEN. + * The call will fail if the name contains whitespaces or non-printable chars. + */ +#define DRM_IOCTL_SET_CLIENT_NAME DRM_IOWR(0xD1, struct drm_set_client_name) + /* * Device specific ioctls should only be in their respective headers * The device specific ioctl range is from 0x40 to 0x9f. From 5229df8fb6796ff4aba39b16cf617028fa5d94b7 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 2 Dec 2024 19:53:40 -0800 Subject: [PATCH 034/330] tools headers: Sync uapi/linux/perf_event.h with the kernel sources To pick up the changes in this cset: 18d92bb57c39504d ("perf/core: Add aux_pause, aux_resume, aux_start_paused") This addresses these perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/uapi/linux/perf_event.h include/uapi/linux/perf_event.h Please see tools/include/uapi/README for further details. Reviewed-by: James Clark Link: https://lore.kernel.org/r/20241203035349.1901262-3-namhyung@kernel.org Signed-off-by: Namhyung Kim --- tools/include/uapi/linux/perf_event.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 4842c36fdf801..0524d541d4e3d 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -511,7 +511,16 @@ struct perf_event_attr { __u16 sample_max_stack; __u16 __reserved_2; __u32 aux_sample_size; - __u32 __reserved_3; + + union { + __u32 aux_action; + struct { + __u32 aux_start_paused : 1, /* start AUX area tracing paused */ + aux_pause : 1, /* on overflow, pause AUX area tracing */ + aux_resume : 1, /* on overflow, resume AUX area tracing */ + __reserved_3 : 29; + }; + }; /* * User provided data if sigtrap=1, passed back to user via From e2064b7c5d82651a418b86078597e51dea3f3123 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 2 Dec 2024 19:53:41 -0800 Subject: [PATCH 035/330] tools headers: Sync uapi/linux/kvm.h with the kernel sources To pick up the changes in this cset: e785dfacf7e7fe94 ("LoongArch: KVM: Add PCHPIC device support") 2e8b9df82631e714 ("LoongArch: KVM: Add EIOINTC device support") c532de5a67a70f85 ("LoongArch: KVM: Add IPI device support") This addresses these perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/uapi/linux/kvm.h include/uapi/linux/kvm.h Please see tools/include/uapi/README for further details. Reviewed-by: James Clark Cc: Sean Christopherson Cc: Paolo Bonzini Cc: Tianrui Zhao Cc: Bibo Mao Cc: Huacai Chen Cc: kvm@vger.kernel.org Cc: loongarch@lists.linux.dev Link: https://lore.kernel.org/r/20241203035349.1901262-4-namhyung@kernel.org Signed-off-by: Namhyung Kim --- tools/include/uapi/linux/kvm.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 637efc0551453..502ea63b5d2e7 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1158,7 +1158,15 @@ enum kvm_device_type { #define KVM_DEV_TYPE_ARM_PV_TIME KVM_DEV_TYPE_ARM_PV_TIME KVM_DEV_TYPE_RISCV_AIA, #define KVM_DEV_TYPE_RISCV_AIA KVM_DEV_TYPE_RISCV_AIA + KVM_DEV_TYPE_LOONGARCH_IPI, +#define KVM_DEV_TYPE_LOONGARCH_IPI KVM_DEV_TYPE_LOONGARCH_IPI + KVM_DEV_TYPE_LOONGARCH_EIOINTC, +#define KVM_DEV_TYPE_LOONGARCH_EIOINTC KVM_DEV_TYPE_LOONGARCH_EIOINTC + KVM_DEV_TYPE_LOONGARCH_PCHPIC, +#define KVM_DEV_TYPE_LOONGARCH_PCHPIC KVM_DEV_TYPE_LOONGARCH_PCHPIC + KVM_DEV_TYPE_MAX, + }; struct kvm_vfio_spapr_tce { From e120829dbf927c8b93cd5e06acfec0332cc82e02 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 2 Dec 2024 19:53:42 -0800 Subject: [PATCH 036/330] tools headers: Sync x86 kvm and cpufeature headers with the kernel To pick up the changes in this cset: a0423af92cb31e6f ("x86: KVM: Advertise CPUIDs for new instructions in Clearwater Forest") 0c487010cb4f79e4 ("x86/cpufeatures: Add X86_FEATURE_AMD_WORKLOAD_CLASS feature bit") 1ad4667066714369 ("x86/cpufeatures: Add X86_FEATURE_AMD_HETEROGENEOUS_CORES") 104edc6efca62838 ("x86/cpufeatures: Rename X86_FEATURE_FAST_CPPC to have AMD prefix") 3ea87dfa31a7b0bb ("x86/cpufeatures: Add a IBPB_NO_RET BUG flag") ff898623af2ed564 ("x86/cpufeatures: Define X86_FEATURE_AMD_IBPB_RET") dcb988cdac85bad1 ("KVM: x86: Quirk initialization of feature MSRs to KVM's max configuration") This addresses these perf build warnings: Warning: Kernel ABI header differences: diff -u tools/arch/x86/include/asm/cpufeatures.h arch/x86/include/asm/cpufeatures.h diff -u tools/arch/x86/include/uapi/asm/kvm.h arch/x86/include/uapi/asm/kvm.h Please see tools/include/uapi/README for further details. Reviewed-by: James Clark Cc: Sean Christopherson Cc: Paolo Bonzini Cc: Thomas Gleixner Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: x86@kernel.org Cc: kvm@vger.kernel.org Link: https://lore.kernel.org/r/20241203035349.1901262-5-namhyung@kernel.org Signed-off-by: Namhyung Kim --- tools/arch/x86/include/asm/cpufeatures.h | 11 +++++++++-- tools/arch/x86/include/uapi/asm/kvm.h | 1 + 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 23698d0f4bb47..17b6590748c00 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -215,7 +215,7 @@ #define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* Disable Speculative Store Bypass. */ #define X86_FEATURE_LS_CFG_SSBD ( 7*32+24) /* AMD SSBD implementation via LS_CFG MSR */ #define X86_FEATURE_IBRS ( 7*32+25) /* "ibrs" Indirect Branch Restricted Speculation */ -#define X86_FEATURE_IBPB ( 7*32+26) /* "ibpb" Indirect Branch Prediction Barrier */ +#define X86_FEATURE_IBPB ( 7*32+26) /* "ibpb" Indirect Branch Prediction Barrier without a guaranteed RSB flush */ #define X86_FEATURE_STIBP ( 7*32+27) /* "stibp" Single Thread Indirect Branch Predictors */ #define X86_FEATURE_ZEN ( 7*32+28) /* Generic flag for all Zen and newer */ #define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* L1TF workaround PTE inversion */ @@ -317,6 +317,9 @@ #define X86_FEATURE_ZEN1 (11*32+31) /* CPU based on Zen1 microarchitecture */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ +#define X86_FEATURE_SHA512 (12*32+ 0) /* SHA512 instructions */ +#define X86_FEATURE_SM3 (12*32+ 1) /* SM3 instructions */ +#define X86_FEATURE_SM4 (12*32+ 2) /* SM4 instructions */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* "avx_vnni" AVX VNNI instructions */ #define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* "avx512_bf16" AVX512 BFLOAT16 instructions */ #define X86_FEATURE_CMPCCXADD (12*32+ 7) /* CMPccXADD instructions */ @@ -348,6 +351,7 @@ #define X86_FEATURE_CPPC (13*32+27) /* "cppc" Collaborative Processor Performance Control */ #define X86_FEATURE_AMD_PSFD (13*32+28) /* Predictive Store Forwarding Disable */ #define X86_FEATURE_BTC_NO (13*32+29) /* Not vulnerable to Branch Type Confusion */ +#define X86_FEATURE_AMD_IBPB_RET (13*32+30) /* IBPB clears return address predictor */ #define X86_FEATURE_BRS (13*32+31) /* "brs" Branch Sampling available */ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ @@ -472,7 +476,9 @@ #define X86_FEATURE_BHI_CTRL (21*32+ 2) /* BHI_DIS_S HW control available */ #define X86_FEATURE_CLEAR_BHB_HW (21*32+ 3) /* BHI_DIS_S HW control enabled */ #define X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT (21*32+ 4) /* Clear branch history at vmexit using SW loop */ -#define X86_FEATURE_AMD_FAST_CPPC (21*32 + 5) /* AMD Fast CPPC */ +#define X86_FEATURE_AMD_FAST_CPPC (21*32 + 5) /* Fast CPPC */ +#define X86_FEATURE_AMD_HETEROGENEOUS_CORES (21*32 + 6) /* Heterogeneous Core Topology */ +#define X86_FEATURE_AMD_WORKLOAD_CLASS (21*32 + 7) /* Workload Classification */ /* * BUG word(s) @@ -523,4 +529,5 @@ #define X86_BUG_DIV0 X86_BUG(1*32 + 1) /* "div0" AMD DIV0 speculation bug */ #define X86_BUG_RFDS X86_BUG(1*32 + 2) /* "rfds" CPU is vulnerable to Register File Data Sampling */ #define X86_BUG_BHI X86_BUG(1*32 + 3) /* "bhi" CPU is affected by Branch History Injection */ +#define X86_BUG_IBPB_NO_RET X86_BUG(1*32 + 4) /* "ibpb_no_ret" IBPB omits return target predictions */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h index a8debbf2f7028..88585c1de416f 100644 --- a/tools/arch/x86/include/uapi/asm/kvm.h +++ b/tools/arch/x86/include/uapi/asm/kvm.h @@ -440,6 +440,7 @@ struct kvm_sync_regs { #define KVM_X86_QUIRK_FIX_HYPERCALL_INSN (1 << 5) #define KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS (1 << 6) #define KVM_X86_QUIRK_SLOT_ZAP_ALL (1 << 7) +#define KVM_X86_QUIRK_STUFF_FEATURE_MSRS (1 << 8) #define KVM_STATE_NESTED_FORMAT_VMX 0 #define KVM_STATE_NESTED_FORMAT_SVM 1 From 76e231997926c8f5707ed5f9c1f91377efe9e484 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 2 Dec 2024 19:53:43 -0800 Subject: [PATCH 037/330] tools headers: Sync arm64 kvm header with the kernel sources To pick up the changes in this cset: 97413cea1c48cc05 ("KVM: arm64: Add PSCI v1.3 SYSTEM_OFF2 function for hibernation") This addresses these perf build warnings: Warning: Kernel ABI header differences: diff -u tools/arch/arm64/include/uapi/asm/kvm.h arch/arm64/include/uapi/asm/kvm.h Please see tools/include/uapi/README for further details. Reviewed-by: James Clark Cc: Marc Zyngier Cc: Oliver Upton Cc: kvmarm@lists.linux.dev Link: https://lore.kernel.org/r/20241203035349.1901262-6-namhyung@kernel.org Signed-off-by: Namhyung Kim --- tools/arch/arm64/include/uapi/asm/kvm.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/arch/arm64/include/uapi/asm/kvm.h b/tools/arch/arm64/include/uapi/asm/kvm.h index 964df31da9751..66736ff04011e 100644 --- a/tools/arch/arm64/include/uapi/asm/kvm.h +++ b/tools/arch/arm64/include/uapi/asm/kvm.h @@ -484,6 +484,12 @@ enum { */ #define KVM_SYSTEM_EVENT_RESET_FLAG_PSCI_RESET2 (1ULL << 0) +/* + * Shutdown caused by a PSCI v1.3 SYSTEM_OFF2 call. + * Valid only when the system event has a type of KVM_SYSTEM_EVENT_SHUTDOWN. + */ +#define KVM_SYSTEM_EVENT_SHUTDOWN_FLAG_PSCI_OFF2 (1ULL << 0) + /* run->fail_entry.hardware_entry_failure_reason codes. */ #define KVM_EXIT_FAIL_ENTRY_CPU_UNSUPPORTED (1ULL << 0) From 81b483f72289257f5c6f4974150d41e8b499e1e9 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 2 Dec 2024 19:53:44 -0800 Subject: [PATCH 038/330] tools headers: Sync *xattrat syscall changes with the kernel sources To pick up the changes in this cset: 6140be90ec70c39f ("fs/xattr: add *at family syscalls") This addresses these perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/uapi/asm-generic/unistd.h include/uapi/asm-generic/unistd.h diff -u tools/perf/arch/x86/entry/syscalls/syscall_32.tbl arch/x86/entry/syscalls/syscall_32.tbl diff -u tools/perf/arch/x86/entry/syscalls/syscall_64.tbl arch/x86/entry/syscalls/syscall_64.tbl diff -u tools/perf/arch/powerpc/entry/syscalls/syscall.tbl arch/powerpc/kernel/syscalls/syscall.tbl diff -u tools/perf/arch/s390/entry/syscalls/syscall.tbl arch/s390/kernel/syscalls/syscall.tbl diff -u tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl arch/mips/kernel/syscalls/syscall_n64.tbl The arm64 changes are not included as it requires more changes in the tools. It'll be worked for the later cycle. Please see tools/include/uapi/README for further details. Reviewed-by: James Clark Cc: Arnd Bergmann Cc: Christian Brauner CC: x86@kernel.org CC: linux-mips@vger.kernel.org CC: linuxppc-dev@lists.ozlabs.org CC: linux-s390@vger.kernel.org Link: https://lore.kernel.org/r/20241203035349.1901262-7-namhyung@kernel.org Signed-off-by: Namhyung Kim --- tools/include/uapi/asm-generic/unistd.h | 11 ++++++++++- tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl | 4 ++++ tools/perf/arch/powerpc/entry/syscalls/syscall.tbl | 4 ++++ tools/perf/arch/s390/entry/syscalls/syscall.tbl | 4 ++++ tools/perf/arch/x86/entry/syscalls/syscall_32.tbl | 4 ++++ tools/perf/arch/x86/entry/syscalls/syscall_64.tbl | 4 ++++ 6 files changed, 30 insertions(+), 1 deletion(-) diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h index 5bf6148cac2b9..88dc393c2bca3 100644 --- a/tools/include/uapi/asm-generic/unistd.h +++ b/tools/include/uapi/asm-generic/unistd.h @@ -841,8 +841,17 @@ __SYSCALL(__NR_lsm_list_modules, sys_lsm_list_modules) #define __NR_mseal 462 __SYSCALL(__NR_mseal, sys_mseal) +#define __NR_setxattrat 463 +__SYSCALL(__NR_setxattrat, sys_setxattrat) +#define __NR_getxattrat 464 +__SYSCALL(__NR_getxattrat, sys_getxattrat) +#define __NR_listxattrat 465 +__SYSCALL(__NR_listxattrat, sys_listxattrat) +#define __NR_removexattrat 466 +__SYSCALL(__NR_removexattrat, sys_removexattrat) + #undef __NR_syscalls -#define __NR_syscalls 463 +#define __NR_syscalls 467 /* * 32 bit systems traditionally used different diff --git a/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl b/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl index 1464c6be6eb3c..c844cd5cda620 100644 --- a/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl +++ b/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl @@ -377,3 +377,7 @@ 460 n64 lsm_set_self_attr sys_lsm_set_self_attr 461 n64 lsm_list_modules sys_lsm_list_modules 462 n64 mseal sys_mseal +463 n64 setxattrat sys_setxattrat +464 n64 getxattrat sys_getxattrat +465 n64 listxattrat sys_listxattrat +466 n64 removexattrat sys_removexattrat diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl index ebae8415dfbba..d8b4ab78bef07 100644 --- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl @@ -553,3 +553,7 @@ 460 common lsm_set_self_attr sys_lsm_set_self_attr 461 common lsm_list_modules sys_lsm_list_modules 462 common mseal sys_mseal +463 common setxattrat sys_setxattrat +464 common getxattrat sys_getxattrat +465 common listxattrat sys_listxattrat +466 common removexattrat sys_removexattrat diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl index 01071182763e9..e9115b4d8b635 100644 --- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl @@ -465,3 +465,7 @@ 460 common lsm_set_self_attr sys_lsm_set_self_attr sys_lsm_set_self_attr 461 common lsm_list_modules sys_lsm_list_modules sys_lsm_list_modules 462 common mseal sys_mseal sys_mseal +463 common setxattrat sys_setxattrat sys_setxattrat +464 common getxattrat sys_getxattrat sys_getxattrat +465 common listxattrat sys_listxattrat sys_listxattrat +466 common removexattrat sys_removexattrat sys_removexattrat diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_32.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_32.tbl index 534c74b14fab5..4d0fb2fba7e20 100644 --- a/tools/perf/arch/x86/entry/syscalls/syscall_32.tbl +++ b/tools/perf/arch/x86/entry/syscalls/syscall_32.tbl @@ -468,3 +468,7 @@ 460 i386 lsm_set_self_attr sys_lsm_set_self_attr 461 i386 lsm_list_modules sys_lsm_list_modules 462 i386 mseal sys_mseal +463 i386 setxattrat sys_setxattrat +464 i386 getxattrat sys_getxattrat +465 i386 listxattrat sys_listxattrat +466 i386 removexattrat sys_removexattrat diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl index 7093ee21c0d1c..5eb708bff1c79 100644 --- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl +++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl @@ -386,6 +386,10 @@ 460 common lsm_set_self_attr sys_lsm_set_self_attr 461 common lsm_list_modules sys_lsm_list_modules 462 common mseal sys_mseal +463 common setxattrat sys_setxattrat +464 common getxattrat sys_getxattrat +465 common listxattrat sys_listxattrat +466 common removexattrat sys_removexattrat # # Due to a historical design error, certain syscalls are numbered differently From 3cef7d8b12811dfd737e77e69d9ef72bc55365cf Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 2 Dec 2024 19:53:45 -0800 Subject: [PATCH 039/330] tools headers: Sync uapi/asm-generic/mman.h with the kernel sources To pick up the changes in this cset: 3630e82ab6bd2642 ("mman: Add map_shadow_stack() flags") This addresses these perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/uapi/asm-generic/mman.h include/uapi/asm-generic/mman.h Please see tools/include/uapi/README for further details. Reviewed-by: James Clark Cc: Arnd Bergmann Cc: Mark Brown Cc: Catalin Marinas Cc: linux-arch@vger.kernel.org Link: https://lore.kernel.org/r/20241203035349.1901262-8-namhyung@kernel.org Signed-off-by: Namhyung Kim --- tools/include/uapi/asm-generic/mman.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/include/uapi/asm-generic/mman.h b/tools/include/uapi/asm-generic/mman.h index 406f7718f9ad0..51d2556af54ac 100644 --- a/tools/include/uapi/asm-generic/mman.h +++ b/tools/include/uapi/asm-generic/mman.h @@ -19,4 +19,8 @@ #define MCL_FUTURE 2 /* lock all future mappings */ #define MCL_ONFAULT 4 /* lock all pages that are faulted in */ +#define SHADOW_STACK_SET_TOKEN (1ULL << 0) /* Set up a restore token in the shadow stack */ +#define SHADOW_STACK_SET_MARKER (1ULL << 1) /* Set up a top of stack marker in the shadow stack */ + + #endif /* __ASM_GENERIC_MMAN_H */ From 6d442c69cbe0f8ab47177b8af125e6c88b9e8cd0 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 2 Dec 2024 19:53:46 -0800 Subject: [PATCH 040/330] tools headers: Sync uapi/linux/fcntl.h with the kernel sources To pick up the changes in this cset: c374196b2b9f4b80 ("fs: name_to_handle_at() support for "explicit connectable" file handles") 95f567f81e43a1bc ("fs: Simplify getattr interface function checking AT_GETATTR_NOSEC flag") This addresses these perf build warnings: Warning: Kernel ABI header differences: diff -u tools/perf/trace/beauty/include/uapi/linux/fcntl.h include/uapi/linux/fcntl.h Please see tools/include/uapi/README for further details. Reviewed-by: James Clark Cc: Jeff Layton Cc: Chuck Lever Cc: Alexander Aring Cc: Christian Brauner Cc: Al Viro Cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/20241203035349.1901262-9-namhyung@kernel.org Signed-off-by: Namhyung Kim --- tools/perf/trace/beauty/include/uapi/linux/fcntl.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tools/perf/trace/beauty/include/uapi/linux/fcntl.h b/tools/perf/trace/beauty/include/uapi/linux/fcntl.h index 87e2dec79fea4..6e6907e63bfc2 100644 --- a/tools/perf/trace/beauty/include/uapi/linux/fcntl.h +++ b/tools/perf/trace/beauty/include/uapi/linux/fcntl.h @@ -153,9 +153,6 @@ object identity and may not be usable with open_by_handle_at(2). */ #define AT_HANDLE_MNT_ID_UNIQUE 0x001 /* Return the u64 unique mount ID. */ - -#if defined(__KERNEL__) -#define AT_GETATTR_NOSEC 0x80000000 -#endif +#define AT_HANDLE_CONNECTABLE 0x002 /* Request a connectable file handle */ #endif /* _UAPI_LINUX_FCNTL_H */ From 02116fcfd827fe17dbdb543f111cdd9b8b32f18e Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 2 Dec 2024 19:53:47 -0800 Subject: [PATCH 041/330] tools headers: Sync uapi/linux/mount.h with the kernel sources To pick up the changes in this cset: aefff51e1c2986e1 statmount: retrieve security mount options 2f4d4503e9e5ab76 statmount: add flag to retrieve unescaped options 44010543fc8bedad fs: add the ability for statmount() to report the sb_source ed9d95f691c29748 fs: add the ability for statmount() to report the fs_subtype This addresses these perf build warnings: Warning: Kernel ABI header differences: diff -u tools/perf/trace/beauty/include/uapi/linux/mount.h include/uapi/linux/mount.h Please see tools/include/uapi/README for further details. Reviewed-by: James Clark Cc: Christian Brauner Cc: Al Viro Cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/20241203035349.1901262-10-namhyung@kernel.org Signed-off-by: Namhyung Kim --- tools/perf/trace/beauty/include/uapi/linux/mount.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tools/perf/trace/beauty/include/uapi/linux/mount.h b/tools/perf/trace/beauty/include/uapi/linux/mount.h index 225bc366ffcbf..c07008816acae 100644 --- a/tools/perf/trace/beauty/include/uapi/linux/mount.h +++ b/tools/perf/trace/beauty/include/uapi/linux/mount.h @@ -154,7 +154,7 @@ struct mount_attr { */ struct statmount { __u32 size; /* Total size, including strings */ - __u32 mnt_opts; /* [str] Mount options of the mount */ + __u32 mnt_opts; /* [str] Options (comma separated, escaped) */ __u64 mask; /* What results were written */ __u32 sb_dev_major; /* Device ID */ __u32 sb_dev_minor; @@ -173,7 +173,13 @@ struct statmount { __u32 mnt_root; /* [str] Root of mount relative to root of fs */ __u32 mnt_point; /* [str] Mountpoint relative to current root */ __u64 mnt_ns_id; /* ID of the mount namespace */ - __u64 __spare2[49]; + __u32 fs_subtype; /* [str] Subtype of fs_type (if any) */ + __u32 sb_source; /* [str] Source string of the mount */ + __u32 opt_num; /* Number of fs options */ + __u32 opt_array; /* [str] Array of nul terminated fs options */ + __u32 opt_sec_num; /* Number of security options */ + __u32 opt_sec_array; /* [str] Array of nul terminated security options */ + __u64 __spare2[46]; char str[]; /* Variable size part containing strings */ }; @@ -207,6 +213,10 @@ struct mnt_id_req { #define STATMOUNT_FS_TYPE 0x00000020U /* Want/got fs_type */ #define STATMOUNT_MNT_NS_ID 0x00000040U /* Want/got mnt_ns_id */ #define STATMOUNT_MNT_OPTS 0x00000080U /* Want/got mnt_opts */ +#define STATMOUNT_FS_SUBTYPE 0x00000100U /* Want/got fs_subtype */ +#define STATMOUNT_SB_SOURCE 0x00000200U /* Want/got sb_source */ +#define STATMOUNT_OPT_ARRAY 0x00000400U /* Want/got opt_... */ +#define STATMOUNT_OPT_SEC_ARRAY 0x00000800U /* Want/got opt_sec... */ /* * Special @mnt_id values that can be passed to listmount From c994ac74cc3683b1cca45bc425ea1c625da109f3 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 2 Dec 2024 19:53:48 -0800 Subject: [PATCH 042/330] tools headers: Sync uapi/linux/prctl.h with the kernel sources To pick up the changes in this cset: 09d6775f503b393d riscv: Add support for userspace pointer masking 91e102e79740ae43 prctl: arch-agnostic prctl for shadow stack This addresses these perf build warnings: Warning: Kernel ABI header differences: diff -u tools/perf/trace/beauty/include/uapi/linux/prctl.h include/uapi/linux/prctl.h Please see tools/include/uapi/README for further details. Reviewed-by: James Clark Cc: Mark Brown Cc: Palmer Dabbelt Link: https://lore.kernel.org/r/20241203035349.1901262-11-namhyung@kernel.org Signed-off-by: Namhyung Kim --- .../trace/beauty/include/uapi/linux/prctl.h | 27 ++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/tools/perf/trace/beauty/include/uapi/linux/prctl.h b/tools/perf/trace/beauty/include/uapi/linux/prctl.h index 35791791a879b..5c6080680cb27 100644 --- a/tools/perf/trace/beauty/include/uapi/linux/prctl.h +++ b/tools/perf/trace/beauty/include/uapi/linux/prctl.h @@ -230,7 +230,7 @@ struct prctl_mm_map { # define PR_PAC_APDBKEY (1UL << 3) # define PR_PAC_APGAKEY (1UL << 4) -/* Tagged user address controls for arm64 */ +/* Tagged user address controls for arm64 and RISC-V */ #define PR_SET_TAGGED_ADDR_CTRL 55 #define PR_GET_TAGGED_ADDR_CTRL 56 # define PR_TAGGED_ADDR_ENABLE (1UL << 0) @@ -244,6 +244,9 @@ struct prctl_mm_map { # define PR_MTE_TAG_MASK (0xffffUL << PR_MTE_TAG_SHIFT) /* Unused; kept only for source compatibility */ # define PR_MTE_TCF_SHIFT 1 +/* RISC-V pointer masking tag length */ +# define PR_PMLEN_SHIFT 24 +# define PR_PMLEN_MASK (0x7fUL << PR_PMLEN_SHIFT) /* Control reclaim behavior when allocating memory */ #define PR_SET_IO_FLUSHER 57 @@ -328,4 +331,26 @@ struct prctl_mm_map { # define PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC 0x10 /* Clear the aspect on exec */ # define PR_PPC_DEXCR_CTRL_MASK 0x1f +/* + * Get the current shadow stack configuration for the current thread, + * this will be the value configured via PR_SET_SHADOW_STACK_STATUS. + */ +#define PR_GET_SHADOW_STACK_STATUS 74 + +/* + * Set the current shadow stack configuration. Enabling the shadow + * stack will cause a shadow stack to be allocated for the thread. + */ +#define PR_SET_SHADOW_STACK_STATUS 75 +# define PR_SHADOW_STACK_ENABLE (1UL << 0) +# define PR_SHADOW_STACK_WRITE (1UL << 1) +# define PR_SHADOW_STACK_PUSH (1UL << 2) + +/* + * Prevent further changes to the specified shadow stack + * configuration. All bits may be locked via this call, including + * undefined bits. + */ +#define PR_LOCK_SHADOW_STACK_STATUS 76 + #endif /* _LINUX_PRCTL_H */ From 968121f0a649ee2ff5ac7b29aa4d4f6b0798ea9e Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 2 Dec 2024 19:53:49 -0800 Subject: [PATCH 043/330] perf tools: Fix build error on generated/fs_at_flags_array.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It should only have generic flags in the array but the recent header sync brought a new flags to fcntl.h and caused a build error. Let's update the shell script to exclude flags specific to name_to_handle_at(). CC trace/beauty/fs_at_flags.o In file included from trace/beauty/fs_at_flags.c:21: tools/perf/trace/beauty/generated/fs_at_flags_array.c:13:30: error: initialized field overwritten [-Werror=override-init] 13 | [ilog2(0x002) + 1] = "HANDLE_CONNECTABLE", | ^~~~~~~~~~~~~~~~~~~~ tools/perf/trace/beauty/generated/fs_at_flags_array.c:13:30: note: (near initialization for ‘fs_at_flags[2]’) Reviewed-by: James Clark Link: https://lore.kernel.org/r/20241203035349.1901262-12-namhyung@kernel.org Signed-off-by: Namhyung Kim --- tools/perf/trace/beauty/fs_at_flags.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/perf/trace/beauty/fs_at_flags.sh b/tools/perf/trace/beauty/fs_at_flags.sh index e3f13f96a27c2..fac4d0c049fcc 100755 --- a/tools/perf/trace/beauty/fs_at_flags.sh +++ b/tools/perf/trace/beauty/fs_at_flags.sh @@ -13,13 +13,14 @@ printf "static const char *fs_at_flags[] = {\n" regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+AT_([^_]+[[:alnum:]_]+)[[:space:]]+(0x[[:xdigit:]]+)[[:space:]]*.*' # AT_EACCESS is only meaningful to faccessat, so we will special case it there... # AT_STATX_SYNC_TYPE is not a bit, its a mask of AT_STATX_SYNC_AS_STAT, AT_STATX_FORCE_SYNC and AT_STATX_DONT_SYNC -# AT_HANDLE_FID and AT_HANDLE_MNT_ID_UNIQUE are reusing values and are valid only for name_to_handle_at() +# AT_HANDLE_FID, AT_HANDLE_MNT_ID_UNIQUE and AT_HANDLE_CONNECTABLE are reusing values and are valid only for name_to_handle_at() # AT_RENAME_NOREPLACE reuses 0x1 and is valid only for renameat2() grep -E $regex ${linux_fcntl} | \ grep -v AT_EACCESS | \ grep -v AT_STATX_SYNC_TYPE | \ grep -v AT_HANDLE_FID | \ grep -v AT_HANDLE_MNT_ID_UNIQUE | \ + grep -v AT_HANDLE_CONNECTABLE | \ grep -v AT_RENAME_NOREPLACE | \ sed -r "s/$regex/\2 \1/g" | \ xargs printf "\t[ilog2(%s) + 1] = \"%s\",\n" From 4f776d81bf927a4f25d5e32a4d0df08ee509dd6c Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V (Arm)" Date: Thu, 28 Nov 2024 20:55:43 +0530 Subject: [PATCH 044/330] arm64: dts: fvp: Update PCIe bus-range property These days, the Fixed Virtual Platforms(FVP) Base RevC model supports more PCI devices. Update the max bus number so that Linux can enumerate them correctly. Without this, the kernel throws the below error while booting with the default hierarchy | pci_bus 0000:01: busn_res: [bus 01] end is updated to 01 | pci_bus 0000:02: busn_res: can not insert [bus 02-01] under | [bus 00-01] (conflicts with (null) [bus 00-01]) | pci_bus 0000:02: busn_res: [bus 02-01] end is updated to 02 | pci_bus 0000:02: busn_res: can not insert [bus 02] under | [bus 00-01] (conflicts with (null) [bus 00-01]) | pci_bus 0000:03: busn_res: can not insert [bus 03-01] under | [bus 00-01] (conflicts with (null) [bus 00-01]) | pci_bus 0000:03: busn_res: [bus 03-01] end is updated to 03 | pci_bus 0000:03: busn_res: can not insert [bus 03] under | [bus 00-01] (conflicts with (null) [bus 00-01]) | pci_bus 0000:04: busn_res: can not insert [bus 04-01] under | [bus 00-01] (conflicts with (null) [bus 00-01]) | pci_bus 0000:04: busn_res: [bus 04-01] end is updated to 04 | pci_bus 0000:04: busn_res: can not insert [bus 04] under | [bus 00-01] (conflicts with (null) [bus 00-01]) | pci 0000:00:01.0: BAR 14: assigned [mem 0x50000000-0x500fffff] | pci-host-generic 40000000.pci: ECAM at [mem 0x40000000-0x4fffffff] | for [bus 00-01] The change is using 0xff as max bus number because the ECAM window is 256MB in size. Below is the lspci output with and without the change: without fix =========== | 00:00.0 Host bridge: ARM Device 00ba (rev 01) | 00:01.0 PCI bridge: ARM Device 0def | 00:02.0 PCI bridge: ARM Device 0def | 00:03.0 PCI bridge: ARM Device 0def | 00:04.0 PCI bridge: ARM Device 0def | 00:1e.0 Unassigned class [ff00]: ARM Device ff80 | 00:1e.1 Unassigned class [ff00]: ARM Device ff80 | 00:1f.0 SATA controller: Device 0abc:aced (rev 01) | 01:00.0 SATA controller: Device 0abc:aced (rev 01) with fix ======== | 00:00.0 Host bridge: ARM Device 00ba (rev 01) | 00:01.0 PCI bridge: ARM Device 0def | 00:02.0 PCI bridge: ARM Device 0def | 00:03.0 PCI bridge: ARM Device 0def | 00:04.0 PCI bridge: ARM Device 0def | 00:1e.0 Unassigned class [ff00]: ARM Device ff80 | 00:1e.1 Unassigned class [ff00]: ARM Device ff80 | 00:1f.0 SATA controller: Device 0abc:aced (rev 01) | 01:00.0 SATA controller: Device 0abc:aced (rev 01) | 02:00.0 Unassigned class [ff00]: ARM Device ff80 | 02:00.4 Unassigned class [ff00]: ARM Device ff80 | 03:00.0 PCI bridge: ARM Device 0def | 04:00.0 PCI bridge: ARM Device 0def | 04:01.0 PCI bridge: ARM Device 0def | 04:02.0 PCI bridge: ARM Device 0def | 05:00.0 SATA controller: Device 0abc:aced (rev 01) | 06:00.0 Unassigned class [ff00]: ARM Device ff80 | 06:00.7 Unassigned class [ff00]: ARM Device ff80 | 07:00.0 Unassigned class [ff00]: ARM Device ff80 | 07:00.3 Unassigned class [ff00]: ARM Device ff80 | 08:00.0 Unassigned class [ff00]: ARM Device ff80 | 08:00.1 Unassigned class [ff00]: ARM Device ff80 Cc: Sudeep Holla Cc: Lorenzo Pieralisi Cc: Rob Herring Cc: Krzysztof Kozlowski Cc: Conor Dooley Reviewed-by: Liviu Dudau Signed-off-by: Aneesh Kumar K.V (Arm) Message-Id: <20241128152543.1821878-1-aneesh.kumar@kernel.org> Signed-off-by: Sudeep Holla --- arch/arm64/boot/dts/arm/fvp-base-revc.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/arm/fvp-base-revc.dts b/arch/arm64/boot/dts/arm/fvp-base-revc.dts index 19973ab4ea6b5..9e10d7a6b5a2c 100644 --- a/arch/arm64/boot/dts/arm/fvp-base-revc.dts +++ b/arch/arm64/boot/dts/arm/fvp-base-revc.dts @@ -233,7 +233,7 @@ #interrupt-cells = <0x1>; compatible = "pci-host-ecam-generic"; device_type = "pci"; - bus-range = <0x0 0x1>; + bus-range = <0x0 0xff>; reg = <0x0 0x40000000 0x0 0x10000000>; ranges = <0x2000000 0x0 0x50000000 0x0 0x50000000 0x0 0x10000000>; interrupt-map = <0 0 0 1 &gic 0 0 GIC_SPI 168 IRQ_TYPE_LEVEL_HIGH>, From 9f4ddfdc2c03956d278bdafca6adc21cf90cc834 Mon Sep 17 00:00:00 2001 From: Pratap Nirujogi Date: Mon, 2 Dec 2024 23:29:31 -0500 Subject: [PATCH 045/330] Revert "drm/amdgpu: Fix ISP hw init issue" This reverts commit 274e3f4596446955bf17680fd4eb5489f5ecac00. Additional review comments to address. Will resubmit. Reviewed-by: Mario Limonciello Signed-off-by: Pratap Nirujogi Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 96316111300a7..23ad65e336d51 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -145,7 +145,7 @@ const char *amdgpu_asic_name[] = { "LAST", }; -#define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMDGPU_MAX_IP_NUM, 0) +#define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMDGPU_MAX_IP_NUM - 1, 0) /* * Default init level where all blocks are expected to be initialized. This is * the level of initialization expected by default and also after a full reset From 9a4ab400f1fad0e6e8686b8f5fc5376383860ce8 Mon Sep 17 00:00:00 2001 From: Victor Zhao Date: Sat, 23 Nov 2024 18:37:43 +0800 Subject: [PATCH 046/330] drm/amdgpu: use sjt mec fw on gfx943 for sriov Use second jump table in sriov for live migration or mulitple VF support so different VF can load different version of MEC as long as they support sjt Signed-off-by: Victor Zhao Reviewed-by: Yang Wang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c index e2b3dda57030c..54459254bd37b 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c @@ -45,6 +45,8 @@ MODULE_FIRMWARE("amdgpu/gc_9_4_3_mec.bin"); MODULE_FIRMWARE("amdgpu/gc_9_4_4_mec.bin"); MODULE_FIRMWARE("amdgpu/gc_9_4_3_rlc.bin"); MODULE_FIRMWARE("amdgpu/gc_9_4_4_rlc.bin"); +MODULE_FIRMWARE("amdgpu/gc_9_4_3_sjt_mec.bin"); +MODULE_FIRMWARE("amdgpu/gc_9_4_4_sjt_mec.bin"); #define GFX9_MEC_HPD_SIZE 4096 #define RLCG_UCODE_LOADING_START_ADDRESS 0x00002000L @@ -574,8 +576,12 @@ static int gfx_v9_4_3_init_cp_compute_microcode(struct amdgpu_device *adev, { int err; - err = amdgpu_ucode_request(adev, &adev->gfx.mec_fw, - "amdgpu/%s_mec.bin", chip_name); + if (amdgpu_sriov_vf(adev)) + err = amdgpu_ucode_request(adev, &adev->gfx.mec_fw, + "amdgpu/%s_sjt_mec.bin", chip_name); + else + err = amdgpu_ucode_request(adev, &adev->gfx.mec_fw, + "amdgpu/%s_mec.bin", chip_name); if (err) goto out; amdgpu_gfx_cp_init_microcode(adev, AMDGPU_UCODE_ID_CP_MEC1); From 12f325bcd2411e571dbb500bf6862c812c479735 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Fri, 29 Nov 2024 14:19:21 +0100 Subject: [PATCH 047/330] drm/amdgpu: fix UVD contiguous CS mapping problem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When starting the mpv player, Radeon R9 users are observing the below error in dmesg. [drm:amdgpu_uvd_cs_pass2 [amdgpu]] *ERROR* msg/fb buffer ff00f7c000-ff00f7e000 out of 256MB segment! The patch tries to set the TTM_PL_FLAG_CONTIGUOUS for both user flag(AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) set and not set cases. v2: Make the TTM_PL_FLAG_CONTIGUOUS mandatory for user BO's. v3: revert back to v1, but fix the check instead (chk). Closes:https://gitlab.freedesktop.org/drm/amd/-/issues/3599 Closes:https://gitlab.freedesktop.org/drm/amd/-/issues/3501 Signed-off-by: Arunpravin Paneer Selvam Signed-off-by: Christian König Reviewed-by: Arunpravin Paneer Selvam Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 6.10+ --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 17 +++++++++++------ drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c | 2 ++ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index d891ab779ca7f..5df21529b3b13 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -1801,13 +1801,18 @@ int amdgpu_cs_find_mapping(struct amdgpu_cs_parser *parser, if (dma_resv_locking_ctx((*bo)->tbo.base.resv) != &parser->exec.ticket) return -EINVAL; + /* Make sure VRAM is allocated contigiously */ (*bo)->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; - amdgpu_bo_placement_from_domain(*bo, (*bo)->allowed_domains); - for (i = 0; i < (*bo)->placement.num_placement; i++) - (*bo)->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS; - r = ttm_bo_validate(&(*bo)->tbo, &(*bo)->placement, &ctx); - if (r) - return r; + if ((*bo)->tbo.resource->mem_type == TTM_PL_VRAM && + !((*bo)->tbo.resource->placement & TTM_PL_FLAG_CONTIGUOUS)) { + + amdgpu_bo_placement_from_domain(*bo, (*bo)->allowed_domains); + for (i = 0; i < (*bo)->placement.num_placement; i++) + (*bo)->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS; + r = ttm_bo_validate(&(*bo)->tbo, &(*bo)->placement, &ctx); + if (r) + return r; + } return amdgpu_ttm_alloc_gart(&(*bo)->tbo); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c index 31fd30dcd593b..65bb26215e867 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c @@ -551,6 +551,8 @@ static void amdgpu_uvd_force_into_uvd_segment(struct amdgpu_bo *abo) for (i = 0; i < abo->placement.num_placement; ++i) { abo->placements[i].fpfn = 0 >> PAGE_SHIFT; abo->placements[i].lpfn = (256 * 1024 * 1024) >> PAGE_SHIFT; + if (abo->placements[i].mem_type == TTM_PL_VRAM) + abo->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS; } } From 47f402a3e08113e0f5d8e1e6fcc197667a16022f Mon Sep 17 00:00:00 2001 From: "David (Ming Qiang) Wu" Date: Wed, 4 Dec 2024 11:30:01 -0500 Subject: [PATCH 048/330] amdgpu/uvd: get ring reference from rq scheduler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit base.sched may not be set for each instance and should not be used for cases such as non-IB tests. Fixes: 2320c9e6a768 ("drm/sched: memset() 'job' in drm_sched_job_init()") Signed-off-by: David (Ming Qiang) Wu Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c b/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c index 079131aeb2f78..3c8ab8698af83 100644 --- a/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c @@ -1288,7 +1288,7 @@ static int uvd_v7_0_ring_patch_cs_in_place(struct amdgpu_cs_parser *p, struct amdgpu_job *job, struct amdgpu_ib *ib) { - struct amdgpu_ring *ring = to_amdgpu_ring(job->base.sched); + struct amdgpu_ring *ring = amdgpu_job_ring(job); unsigned i; /* No patching necessary for the first instance */ From 8eb966f2403abb844e972fb4eb1348640111f121 Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Wed, 4 Dec 2024 13:11:28 +0530 Subject: [PATCH 049/330] drm/amd/pm: Initialize power profile mode Refactor such that individual SMU IP versions can choose the startup power profile mode. If no preference, then use the generic default power profile selection logic. Signed-off-by: Lijo Lazar Reviewed-by: Yang Wang Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 6.11.x --- .../gpu/drm/amd/include/kgd_pp_interface.h | 1 + drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 24 +++++++++++++------ 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/include/kgd_pp_interface.h b/drivers/gpu/drm/amd/include/kgd_pp_interface.h index 67a5de5739436..d7acdd42d80f6 100644 --- a/drivers/gpu/drm/amd/include/kgd_pp_interface.h +++ b/drivers/gpu/drm/amd/include/kgd_pp_interface.h @@ -164,6 +164,7 @@ enum amd_pp_task { }; enum PP_SMC_POWER_PROFILE { + PP_SMC_POWER_PROFILE_UNKNOWN = -1, PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT = 0x0, PP_SMC_POWER_PROFILE_FULLSCREEN3D = 0x1, PP_SMC_POWER_PROFILE_POWERSAVING = 0x2, diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c index 5eae14fe79f17..21bd635bcdfc1 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c @@ -764,6 +764,7 @@ static int smu_early_init(struct amdgpu_ip_block *ip_block) smu->smu_baco.platform_support = false; smu->smu_baco.maco_support = false; smu->user_dpm_profile.fan_mode = -1; + smu->power_profile_mode = PP_SMC_POWER_PROFILE_UNKNOWN; mutex_init(&smu->message_lock); @@ -1248,6 +1249,21 @@ static bool smu_is_workload_profile_available(struct smu_context *smu, return smu->workload_map && smu->workload_map[profile].valid_mapping; } +static void smu_init_power_profile(struct smu_context *smu) +{ + if (smu->power_profile_mode == PP_SMC_POWER_PROFILE_UNKNOWN) { + if (smu->is_apu || + !smu_is_workload_profile_available( + smu, PP_SMC_POWER_PROFILE_FULLSCREEN3D)) + smu->power_profile_mode = + PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT; + else + smu->power_profile_mode = + PP_SMC_POWER_PROFILE_FULLSCREEN3D; + } + smu_power_profile_mode_get(smu, smu->power_profile_mode); +} + static int smu_sw_init(struct amdgpu_ip_block *ip_block) { struct amdgpu_device *adev = ip_block->adev; @@ -1269,13 +1285,7 @@ static int smu_sw_init(struct amdgpu_ip_block *ip_block) atomic_set(&smu->smu_power.power_gate.vpe_gated, 1); atomic_set(&smu->smu_power.power_gate.umsch_mm_gated, 1); - if (smu->is_apu || - !smu_is_workload_profile_available(smu, PP_SMC_POWER_PROFILE_FULLSCREEN3D)) - smu->power_profile_mode = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT; - else - smu->power_profile_mode = PP_SMC_POWER_PROFILE_FULLSCREEN3D; - smu_power_profile_mode_get(smu, smu->power_profile_mode); - + smu_init_power_profile(smu); smu->display_config = &adev->pm.pm_display_cfg; smu->smu_dpm.dpm_level = AMD_DPM_FORCED_LEVEL_AUTO; From 3912a78cf72eb45f8153a395162b08fef9c5ec3d Mon Sep 17 00:00:00 2001 From: Kenneth Feng Date: Wed, 4 Dec 2024 13:22:10 +0530 Subject: [PATCH 050/330] drm/amd/pm: Set SMU v13.0.7 default workload type Set the default workload type to bootup type on smu v13.0.7. This is because of the constraint on smu v13.0.7. Gfx activity has an even higher set point on 3D fullscreen mode than the one on bootup mode. This causes the 3D fullscreen mode's performance is worse than the bootup mode's performance for the lightweighted/medium workload. For the high workload, the performance is the same between 3D fullscreen mode and bootup mode. v2: set the default workload in ASIC specific file Signed-off-by: Kenneth Feng Reviewed-by: Yang Wang Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 6.11.x --- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c index f4ac403b8b360..aabb947960056 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c @@ -2810,4 +2810,5 @@ void smu_v13_0_7_set_ppt_funcs(struct smu_context *smu) smu->workload_map = smu_v13_0_7_workload_map; smu->smc_driver_if_version = SMU13_0_7_DRIVER_IF_VERSION; smu_v13_0_set_smu_mailbox_registers(smu); + smu->power_profile_mode = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT; } From 5c3de6b02d38eb9386edf50490e050bb44398e40 Mon Sep 17 00:00:00 2001 From: Prike Liang Date: Tue, 5 Nov 2024 09:57:42 +0800 Subject: [PATCH 051/330] drm/amdkfd: Correct the migration DMA map direction The SVM DMA device map direction should be set the same as the DMA unmap setting, otherwise the DMA core will report the following warning. Before finialize this solution, there're some discussion on the DMA mapping type(stream-based or coherent) in this KFD migration case, followed by https://lore.kernel.org/all/04d4ab32 -45a1-4b88-86ee-fb0f35a0ca40@amd.com/T/. As there's no dma_sync_single_for_*() in the DMA buffer accessed that because this migration operation should be sync properly and automatically. Give that there's might not be a performance problem in various cache sync policy of DMA sync. Therefore, in order to simplify the DMA direction setting alignment, let's set the DMA map direction as BIDIRECTIONAL. [ 150.834218] WARNING: CPU: 8 PID: 1812 at kernel/dma/debug.c:1028 check_unmap+0x1cc/0x930 [ 150.834225] Modules linked in: amdgpu(OE) amdxcp drm_exec(OE) gpu_sched drm_buddy(OE) drm_ttm_helper(OE) ttm(OE) drm_suballoc_helper(OE) drm_display_helper(OE) drm_kms_helper(OE) i2c_algo_bit rpcsec_gss_krb5 auth_rpcgss nfsv4 nfs lockd grace netfs xt_conntrack xt_MASQUERADE nf_conntrack_netlink xfrm_user xfrm_algo iptable_nat xt_addrtype iptable_filter br_netfilter nvme_fabrics overlay nfnetlink_cttimeout nfnetlink openvswitch nsh nf_conncount nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 libcrc32c bridge stp llc sch_fq_codel intel_rapl_msr amd_atl intel_rapl_common snd_hda_codec_realtek snd_hda_codec_generic snd_hda_scodec_component snd_hda_codec_hdmi snd_hda_intel snd_intel_dspcfg edac_mce_amd snd_pci_acp6x snd_hda_codec snd_acp_config snd_hda_core snd_hwdep snd_soc_acpi kvm_amd sunrpc snd_pcm kvm binfmt_misc snd_seq_midi crct10dif_pclmul snd_seq_midi_event ghash_clmulni_intel sha512_ssse3 snd_rawmidi nls_iso8859_1 sha256_ssse3 sha1_ssse3 snd_seq aesni_intel snd_seq_device crypto_simd snd_timer cryptd input_leds [ 150.834310] wmi_bmof serio_raw k10temp rapl snd sp5100_tco ipmi_devintf soundcore ccp ipmi_msghandler cm32181 industrialio mac_hid msr parport_pc ppdev lp parport efi_pstore drm(OE) ip_tables x_tables pci_stub crc32_pclmul nvme ahci libahci i2c_piix4 r8169 nvme_core i2c_designware_pci realtek i2c_ccgx_ucsi video wmi hid_generic cdc_ether usbnet usbhid hid r8152 mii [ 150.834354] CPU: 8 PID: 1812 Comm: rocrtst64 Tainted: G OE 6.10.0-custom #492 [ 150.834358] Hardware name: AMD Majolica-RN/Majolica-RN, BIOS RMJ1009A 06/13/2021 [ 150.834360] RIP: 0010:check_unmap+0x1cc/0x930 [ 150.834363] Code: c0 4c 89 4d c8 e8 34 bf 86 00 4c 8b 4d c8 4c 8b 45 c0 48 8b 4d b8 48 89 c6 41 57 4c 89 ea 48 c7 c7 80 49 b4 84 e8 b4 81 f3 ff <0f> 0b 48 c7 c7 04 83 ac 84 e8 76 ba fc ff 41 8b 76 4c 49 8d 7e 50 [ 150.834365] RSP: 0018:ffffaac5023739e0 EFLAGS: 00010086 [ 150.834368] RAX: 0000000000000000 RBX: ffffffff8566a2e0 RCX: 0000000000000027 [ 150.834370] RDX: ffff8f6a8f621688 RSI: 0000000000000001 RDI: ffff8f6a8f621680 [ 150.834372] RBP: ffffaac502373a30 R08: 00000000000000c9 R09: ffffaac502373850 [ 150.834373] R10: ffffaac502373848 R11: ffffffff84f46328 R12: ffffaac502373a40 [ 150.834375] R13: ffff8f6741045330 R14: ffff8f6741a77700 R15: ffffffff84ac831b [ 150.834377] FS: 00007faf0fc94c00(0000) GS:ffff8f6a8f600000(0000) knlGS:0000000000000000 [ 150.834379] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 150.834381] CR2: 00007faf0b600020 CR3: 000000010a52e000 CR4: 0000000000350ef0 [ 150.834383] Call Trace: [ 150.834385] [ 150.834387] ? show_regs+0x6d/0x80 [ 150.834393] ? __warn+0x8c/0x140 [ 150.834397] ? check_unmap+0x1cc/0x930 [ 150.834400] ? report_bug+0x193/0x1a0 [ 150.834406] ? handle_bug+0x46/0x80 [ 150.834410] ? exc_invalid_op+0x1d/0x80 [ 150.834413] ? asm_exc_invalid_op+0x1f/0x30 [ 150.834420] ? check_unmap+0x1cc/0x930 [ 150.834425] debug_dma_unmap_page+0x86/0x90 [ 150.834431] ? srso_return_thunk+0x5/0x5f [ 150.834435] ? rmap_walk+0x28/0x50 [ 150.834438] ? srso_return_thunk+0x5/0x5f [ 150.834441] ? remove_migration_ptes+0x79/0x80 [ 150.834445] ? srso_return_thunk+0x5/0x5f [ 150.834448] dma_unmap_page_attrs+0xfa/0x1d0 [ 150.834453] svm_range_dma_unmap_dev+0x8a/0xf0 [amdgpu] [ 150.834710] svm_migrate_ram_to_vram+0x361/0x740 [amdgpu] [ 150.834914] svm_migrate_to_vram+0xa8/0xe0 [amdgpu] [ 150.835111] svm_range_set_attr+0xff2/0x1450 [amdgpu] [ 150.835311] svm_ioctl+0x4a/0x50 [amdgpu] [ 150.835510] kfd_ioctl_svm+0x54/0x90 [amdgpu] [ 150.835701] kfd_ioctl+0x3c2/0x530 [amdgpu] [ 150.835888] ? __pfx_kfd_ioctl_svm+0x10/0x10 [amdgpu] [ 150.836075] ? srso_return_thunk+0x5/0x5f [ 150.836080] ? tomoyo_file_ioctl+0x20/0x30 [ 150.836086] __x64_sys_ioctl+0x9c/0xd0 [ 150.836091] x64_sys_call+0x1219/0x20d0 [ 150.836095] do_syscall_64+0x51/0x120 [ 150.836098] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 150.836102] RIP: 0033:0x7faf0f11a94f [ 150.836105] Code: 00 48 89 44 24 18 31 c0 48 8d 44 24 60 c7 04 24 10 00 00 00 48 89 44 24 08 48 8d 44 24 20 48 89 44 24 10 b8 10 00 00 00 0f 05 <41> 89 c0 3d 00 f0 ff ff 77 1f 48 8b 44 24 18 64 48 2b 04 25 28 00 [ 150.836107] RSP: 002b:00007ffeced26bc0 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 [ 150.836110] RAX: ffffffffffffffda RBX: 000055c683528fb0 RCX: 00007faf0f11a94f [ 150.836112] RDX: 00007ffeced26c60 RSI: 00000000c0484b20 RDI: 0000000000000003 [ 150.836114] RBP: 00007ffeced26c50 R08: 0000000000000000 R09: 0000000000000001 [ 150.836115] R10: 0000000000000032 R11: 0000000000000246 R12: 000055c683528bd0 [ 150.836117] R13: 0000000000000000 R14: 0000000000000021 R15: 0000000000000000 [ 150.836122] [ 150.836124] ---[ end trace 0000000000000000 ]--- Signed-off-by: Prike Liang Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index eacfeb32f35d6..4b275937d05e9 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -306,7 +306,7 @@ svm_migrate_copy_to_vram(struct kfd_node *node, struct svm_range *prange, spage = migrate_pfn_to_page(migrate->src[i]); if (spage && !is_zone_device_page(spage)) { src[i] = dma_map_page(dev, spage, 0, PAGE_SIZE, - DMA_TO_DEVICE); + DMA_BIDIRECTIONAL); r = dma_mapping_error(dev, src[i]); if (r) { dev_err(dev, "%s: fail %d dma_map_page\n", @@ -629,7 +629,7 @@ svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct svm_range *prange, goto out_oom; } - dst[i] = dma_map_page(dev, dpage, 0, PAGE_SIZE, DMA_FROM_DEVICE); + dst[i] = dma_map_page(dev, dpage, 0, PAGE_SIZE, DMA_BIDIRECTIONAL); r = dma_mapping_error(dev, dst[i]); if (r) { dev_err(adev->dev, "%s: fail %d dma_map_page\n", __func__, r); From c33aea446bf555ab2b4e06deb914ba8f87cdb068 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 5 Dec 2024 15:15:29 -0800 Subject: [PATCH 052/330] perf tools: Fix precise_ip fallback logic Sometimes it returns other than EOPNOTSUPP for invalid precise_ip so it cannot check the error code. Let's move the fallback after the missing feature checks so that it can handle EINVAL as well. This also aligns well with the existing behavior which blindly turns off the precise_ip but we check the missing features correctly now. Fixes: af954f76eea56453 ("perf tools: Check fallback error and order") Reported-by: kernel test robot Reported-by: Arnaldo Carvalho de Melo Closes: https://lore.kernel.org/oe-lkp/202411301431.799e5531-lkp@intel.com Tested-by: Arnaldo Carvalho de Melo Link: https://lore.kernel.org/r/Z1DV0lN8qHSysX7f@google.com Signed-off-by: Namhyung Kim --- tools/perf/util/evsel.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index f745723d486ba..d22c5df1701ec 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -2571,12 +2571,12 @@ static int evsel__open_cpu(struct evsel *evsel, struct perf_cpu_map *cpus, if (err == -EMFILE && rlimit__increase_nofile(&set_rlimit)) goto retry_open; - if (err == -EOPNOTSUPP && evsel__precise_ip_fallback(evsel)) - goto retry_open; - if (err == -EINVAL && evsel__detect_missing_features(evsel)) goto fallback_missing_features; + if (evsel__precise_ip_fallback(evsel)) + goto retry_open; + if (evsel__handle_error_quirks(evsel, err)) goto retry_open; From 4e450dfd0f968b79204637bf13280892dff287b1 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Thu, 5 Dec 2024 06:30:33 -0800 Subject: [PATCH 053/330] tty: serial: Work around warning backtrace in serial8250_set_defaults Commit 7c7e6c8924e7 ("tty: serial: handle HAS_IOPORT dependencies") triggers warning backtraces on a number of platforms which don't support IO ports. WARNING: CPU: 0 PID: 0 at drivers/tty/serial/8250/8250_port.c:470 serial8250_set_defaults+0x148/0x1d8 Unsupported UART type 0 The problem is seen because serial8250_set_defaults() is called for all members of the serial8250_ports[] array even if that array is not initialized. Work around the problem by only displaying the warning if the port type is not 0 (UPIO_PORT) or if iobase is set for the port. Fixes: 7c7e6c8924e7 ("tty: serial: handle HAS_IOPORT dependencies") Acked-by: Arnd Bergmann Cc: Niklas Schnelle Signed-off-by: Guenter Roeck Tested-by: Stafford Horne Link: https://lore.kernel.org/r/20241205143033.2695333-1-linux@roeck-us.net Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_port.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c index 4d63d80e78a92..649e74e9b52f6 100644 --- a/drivers/tty/serial/8250/8250_port.c +++ b/drivers/tty/serial/8250/8250_port.c @@ -467,7 +467,8 @@ static void set_io_from_upio(struct uart_port *p) break; #endif default: - WARN(1, "Unsupported UART type %x\n", p->iotype); + WARN(p->iotype != UPIO_PORT || p->iobase, + "Unsupported UART type %x\n", p->iotype); p->serial_in = no_serial_in; p->serial_out = no_serial_out; } From 47b17ba05a463b22fa79f132e6f6899d53538802 Mon Sep 17 00:00:00 2001 From: Simon Trimmer Date: Fri, 6 Dec 2024 10:57:57 +0000 Subject: [PATCH 054/330] ALSA: hda: cs35l56: Remove calls to cs35l56_force_sync_asp1_registers_from_cache() Commit 5d7e328e20b3 ("ASoC: cs35l56: Revert support for dual-ownership of ASP registers") replaced cs35l56_force_sync_asp1_registers_from_cache() with a dummy implementation so that the HDA driver would continue to build. Remove the calls from HDA and remove the stub function. Signed-off-by: Simon Trimmer Signed-off-by: Richard Fitzgerald Link: https://patch.msgid.link/20241206105757.718750-1-rf@opensource.cirrus.com Signed-off-by: Takashi Iwai --- include/sound/cs35l56.h | 6 ------ sound/pci/hda/cs35l56_hda.c | 8 -------- 2 files changed, 14 deletions(-) diff --git a/include/sound/cs35l56.h b/include/sound/cs35l56.h index 94e8185c4795f..3dc7a1551ac35 100644 --- a/include/sound/cs35l56.h +++ b/include/sound/cs35l56.h @@ -271,12 +271,6 @@ struct cs35l56_base { struct gpio_desc *reset_gpio; }; -/* Temporary to avoid a build break with the HDA driver */ -static inline int cs35l56_force_sync_asp1_registers_from_cache(struct cs35l56_base *cs35l56_base) -{ - return 0; -} - static inline bool cs35l56_is_otp_register(unsigned int reg) { return (reg >> 16) == 3; diff --git a/sound/pci/hda/cs35l56_hda.c b/sound/pci/hda/cs35l56_hda.c index e3ac0e23ae321..7baf3b506eefe 100644 --- a/sound/pci/hda/cs35l56_hda.c +++ b/sound/pci/hda/cs35l56_hda.c @@ -151,10 +151,6 @@ static int cs35l56_hda_runtime_resume(struct device *dev) } } - ret = cs35l56_force_sync_asp1_registers_from_cache(&cs35l56->base); - if (ret) - goto err; - return 0; err: @@ -1059,9 +1055,6 @@ int cs35l56_hda_common_probe(struct cs35l56_hda *cs35l56, int hid, int id) regmap_multi_reg_write(cs35l56->base.regmap, cs35l56_hda_dai_config, ARRAY_SIZE(cs35l56_hda_dai_config)); - ret = cs35l56_force_sync_asp1_registers_from_cache(&cs35l56->base); - if (ret) - goto dsp_err; /* * By default only enable one ASP1TXn, where n=amplifier index, @@ -1087,7 +1080,6 @@ int cs35l56_hda_common_probe(struct cs35l56_hda *cs35l56, int hid, int id) pm_err: pm_runtime_disable(cs35l56->base.dev); -dsp_err: cs_dsp_remove(&cs35l56->cs_dsp); err: gpiod_set_value_cansleep(cs35l56->base.reset_gpio, 0); From 5a69e3d0a1b0f07e58c353560cfcb1ea20a6f040 Mon Sep 17 00:00:00 2001 From: Hridesh MG Date: Thu, 5 Dec 2024 22:48:42 +0530 Subject: [PATCH 055/330] ALSA: hda/realtek: Fix headset mic on Acer Nitro 5 Add a PCI quirk to enable microphone input on the headphone jack on the Acer Nitro 5 AN515-58 laptop. Signed-off-by: Hridesh MG Cc: Link: https://patch.msgid.link/20241205171843.7787-1-hridesh699@gmail.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 4e58c6810f221..767c7176ead65 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10142,6 +10142,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1025, 0x1430, "Acer TravelMate B311R-31", ALC256_FIXUP_ACER_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1025, 0x1466, "Acer Aspire A515-56", ALC255_FIXUP_ACER_HEADPHONE_AND_MIC), SND_PCI_QUIRK(0x1025, 0x1534, "Acer Predator PH315-54", ALC255_FIXUP_ACER_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x1025, 0x159c, "Acer Nitro 5 AN515-58", ALC2XX_FIXUP_HEADSET_MIC), SND_PCI_QUIRK(0x1025, 0x169a, "Acer Swift SFG16", ALC256_FIXUP_ACER_SFG16_MICMUTE_LED), SND_PCI_QUIRK(0x1028, 0x0470, "Dell M101z", ALC269_FIXUP_DELL_M101Z), SND_PCI_QUIRK(0x1028, 0x053c, "Dell Latitude E5430", ALC292_FIXUP_DELL_E7X), From 1b452c2de5555d832cd51c46824272a44ad7acac Mon Sep 17 00:00:00 2001 From: Vasiliy Kovalev Date: Fri, 6 Dec 2024 00:03:06 +0300 Subject: [PATCH 056/330] ALSA: hda/realtek - Add support for ASUS Zen AIO 27 Z272SD_A272SD audio Introduces necessary quirks to enable audio functionality on the ASUS Zen AIO 27 Z272SD_A272SD: - configures verbs to activate internal speakers and headphone jack. - implements adjustments for headset microphone functionality. The speaker and jack configurations were derived from a dump of the working Windows driver, while the headset microphone functionality was fine-tuned through experimental testing. Link: https://lore.kernel.org/all/CAGGMHBOGDUnMewBTrZgoBKFk_A4sNF4fEXwfH9Ay8SNTzjy0-g@mail.gmail.com/T/ Signed-off-by: Vasiliy Kovalev Link: https://patch.msgid.link/20241205210306.977634-1-kovalev@altlinux.org Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 767c7176ead65..f2686a4577d7c 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -7714,6 +7714,7 @@ enum { ALC274_FIXUP_HP_MIC, ALC274_FIXUP_HP_HEADSET_MIC, ALC274_FIXUP_HP_ENVY_GPIO, + ALC274_FIXUP_ASUS_ZEN_AIO_27, ALC256_FIXUP_ASUS_HPE, ALC285_FIXUP_THINKPAD_NO_BASS_SPK_HEADSET_JACK, ALC287_FIXUP_HP_GPIO_LED, @@ -9516,6 +9517,26 @@ static const struct hda_fixup alc269_fixups[] = { .type = HDA_FIXUP_FUNC, .v.func = alc274_fixup_hp_envy_gpio, }, + [ALC274_FIXUP_ASUS_ZEN_AIO_27] = { + .type = HDA_FIXUP_VERBS, + .v.verbs = (const struct hda_verb[]) { + { 0x20, AC_VERB_SET_COEF_INDEX, 0x10 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0xc420 }, + { 0x20, AC_VERB_SET_COEF_INDEX, 0x40 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x8800 }, + { 0x20, AC_VERB_SET_COEF_INDEX, 0x49 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x0249 }, + { 0x20, AC_VERB_SET_COEF_INDEX, 0x4a }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x202b }, + { 0x20, AC_VERB_SET_COEF_INDEX, 0x62 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0xa007 }, + { 0x20, AC_VERB_SET_COEF_INDEX, 0x6b }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x5060 }, + {} + }, + .chained = true, + .chain_id = ALC2XX_FIXUP_HEADSET_MIC, + }, [ALC256_FIXUP_ASUS_HPE] = { .type = HDA_FIXUP_VERBS, .v.verbs = (const struct hda_verb[]) { @@ -10631,6 +10652,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x1f62, "ASUS UX7602ZM", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x1f92, "ASUS ROG Flow X16", ALC289_FIXUP_ASUS_GA401), SND_PCI_QUIRK(0x1043, 0x3030, "ASUS ZN270IE", ALC256_FIXUP_ASUS_AIO_GPIO2), + SND_PCI_QUIRK(0x1043, 0x31d0, "ASUS Zen AIO 27 Z272SD_A272SD", ALC274_FIXUP_ASUS_ZEN_AIO_27), SND_PCI_QUIRK(0x1043, 0x3a20, "ASUS G614JZR", ALC285_FIXUP_ASUS_SPI_REAR_SPEAKERS), SND_PCI_QUIRK(0x1043, 0x3a30, "ASUS G814JVR/JIR", ALC285_FIXUP_ASUS_SPI_REAR_SPEAKERS), SND_PCI_QUIRK(0x1043, 0x3a40, "ASUS G814JZR", ALC285_FIXUP_ASUS_SPI_REAR_SPEAKERS), From ea6398a5af81e3e7fb3da5d261694d479a321fd9 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Wed, 27 Nov 2024 04:18:40 +0000 Subject: [PATCH 057/330] RISC-V: KVM: Fix csr_write -> csr_set for HVIEN PMU overflow bit This doesn't cause a problem currently as HVIEN isn't used elsewhere yet. Found by inspection. Signed-off-by: Michael Neuling Fixes: 16b0bde9a37c ("RISC-V: KVM: Add perf sampling support for guests") Reviewed-by: Atish Patra Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/20241127041840.419940-1-michaelneuling@tenstorrent.com Signed-off-by: Anup Patel --- arch/riscv/kvm/aia.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kvm/aia.c b/arch/riscv/kvm/aia.c index dcced4db7fe8c..19afd1f235377 100644 --- a/arch/riscv/kvm/aia.c +++ b/arch/riscv/kvm/aia.c @@ -590,7 +590,7 @@ void kvm_riscv_aia_enable(void) csr_set(CSR_HIE, BIT(IRQ_S_GEXT)); /* Enable IRQ filtering for overflow interrupt only if sscofpmf is present */ if (__riscv_isa_extension_available(NULL, RISCV_ISA_EXT_SSCOFPMF)) - csr_write(CSR_HVIEN, BIT(IRQ_PMU_OVF)); + csr_set(CSR_HVIEN, BIT(IRQ_PMU_OVF)); } void kvm_riscv_aia_disable(void) From 0b2c29fb68f8bf3e87a9d88404aa6fdd486223e5 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 6 Dec 2024 11:41:40 +0100 Subject: [PATCH 058/330] efi/zboot: Limit compression options to GZIP and ZSTD For historical reasons, the legacy decompressor code on various architectures supports 7 different compression types for the compressed kernel image. EFI zboot is not a compression library museum, and so the options can be limited to what is likely to be useful in practice: - GZIP is tried and tested, and is still one of the fastest at decompression time, although the compression ratio is not very high; moreover, Fedora is already shipping EFI zboot kernels for arm64 that use GZIP, and QEMU implements direct support for it when booting a kernel without firmware loaded; - ZSTD has a very high compression ratio (although not the highest), and is almost as fast as GZIP at decompression time. Reducing the number of options makes it less of a hassle for other consumers of the EFI zboot format (such as QEMU today, and kexec in the future) to support it transparently without having to carry 7 different decompression libraries. Acked-by: Gerd Hoffmann Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/Kconfig | 4 ---- drivers/firmware/efi/libstub/Makefile.zboot | 18 ++++++------------ 2 files changed, 6 insertions(+), 16 deletions(-) diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig index e312d731f4a36..5fe61b9ab5f96 100644 --- a/drivers/firmware/efi/Kconfig +++ b/drivers/firmware/efi/Kconfig @@ -76,10 +76,6 @@ config EFI_ZBOOT bool "Enable the generic EFI decompressor" depends on EFI_GENERIC_STUB && !ARM select HAVE_KERNEL_GZIP - select HAVE_KERNEL_LZ4 - select HAVE_KERNEL_LZMA - select HAVE_KERNEL_LZO - select HAVE_KERNEL_XZ select HAVE_KERNEL_ZSTD help Create the bootable image as an EFI application that carries the diff --git a/drivers/firmware/efi/libstub/Makefile.zboot b/drivers/firmware/efi/libstub/Makefile.zboot index 65ffd0b760b2f..48842b5c106b8 100644 --- a/drivers/firmware/efi/libstub/Makefile.zboot +++ b/drivers/firmware/efi/libstub/Makefile.zboot @@ -12,22 +12,16 @@ quiet_cmd_copy_and_pad = PAD $@ $(obj)/vmlinux.bin: $(obj)/$(EFI_ZBOOT_PAYLOAD) FORCE $(call if_changed,copy_and_pad) -comp-type-$(CONFIG_KERNEL_GZIP) := gzip -comp-type-$(CONFIG_KERNEL_LZ4) := lz4 -comp-type-$(CONFIG_KERNEL_LZMA) := lzma -comp-type-$(CONFIG_KERNEL_LZO) := lzo -comp-type-$(CONFIG_KERNEL_XZ) := xzkern -comp-type-$(CONFIG_KERNEL_ZSTD) := zstd22 - # in GZIP, the appended le32 carrying the uncompressed size is part of the # format, but in other cases, we just append it at the end for convenience, # causing the original tools to complain when checking image integrity. -# So disregard it when calculating the payload size in the zimage header. -zboot-method-y := $(comp-type-y)_with_size -zboot-size-len-y := 4 +comp-type-y := gzip +zboot-method-y := gzip +zboot-size-len-y := 0 -zboot-method-$(CONFIG_KERNEL_GZIP) := gzip -zboot-size-len-$(CONFIG_KERNEL_GZIP) := 0 +comp-type-$(CONFIG_KERNEL_ZSTD) := zstd +zboot-method-$(CONFIG_KERNEL_ZSTD) := zstd22_with_size +zboot-size-len-$(CONFIG_KERNEL_ZSTD) := 4 $(obj)/vmlinuz: $(obj)/vmlinux.bin FORCE $(call if_changed,$(zboot-method-y)) From 3396995f9fb6bcbe0004a68118a22f98bab6e2b9 Mon Sep 17 00:00:00 2001 From: Haoyu Li Date: Tue, 3 Dec 2024 22:14:51 +0800 Subject: [PATCH 059/330] gpio: ljca: Initialize num before accessing item in ljca_gpio_config With the new __counted_by annocation in ljca_gpio_packet, the "num" struct member must be set before accessing the "item" array. Failing to do so will trigger a runtime warning when enabling CONFIG_UBSAN_BOUNDS and CONFIG_FORTIFY_SOURCE. Fixes: 1034cc423f1b ("gpio: update Intel LJCA USB GPIO driver") Cc: stable@vger.kernel.org Signed-off-by: Haoyu Li Link: https://lore.kernel.org/stable/20241203141451.342316-1-lihaoyu499%40gmail.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-ljca.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-ljca.c b/drivers/gpio/gpio-ljca.c index d67b912d884dd..c6c31e6146c76 100644 --- a/drivers/gpio/gpio-ljca.c +++ b/drivers/gpio/gpio-ljca.c @@ -82,9 +82,9 @@ static int ljca_gpio_config(struct ljca_gpio_dev *ljca_gpio, u8 gpio_id, int ret; mutex_lock(&ljca_gpio->trans_lock); + packet->num = 1; packet->item[0].index = gpio_id; packet->item[0].value = config | ljca_gpio->connect_mode[gpio_id]; - packet->num = 1; ret = ljca_transfer(ljca_gpio->ljca, LJCA_GPIO_CONFIG, (u8 *)packet, struct_size(packet, item, packet->num), NULL, 0); From 82fdcf9b518b205da040046fbe7747fb3fd18657 Mon Sep 17 00:00:00 2001 From: Jaakko Salo Date: Fri, 6 Dec 2024 18:44:48 +0200 Subject: [PATCH 060/330] ALSA: usb-audio: Add implicit feedback quirk for Yamaha THR5 Use implicit feedback from the capture endpoint to fix popping sounds during playback. Link: https://bugzilla.kernel.org/show_bug.cgi?id=219567 Signed-off-by: Jaakko Salo Cc: Link: https://patch.msgid.link/20241206164448.8136-1-jaakkos@gmail.com Signed-off-by: Takashi Iwai --- sound/usb/quirks.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 00101875d9a8d..a0767de7f1b7e 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -2179,6 +2179,8 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = { QUIRK_FLAG_CTL_MSG_DELAY_1M | QUIRK_FLAG_MIC_RES_384), DEVICE_FLG(0x046d, 0x09a4, /* Logitech QuickCam E 3500 */ QUIRK_FLAG_CTL_MSG_DELAY_1M | QUIRK_FLAG_IGNORE_CTL_ERROR), + DEVICE_FLG(0x0499, 0x1506, /* Yamaha THR5 */ + QUIRK_FLAG_GENERIC_IMPLICIT_FB), DEVICE_FLG(0x0499, 0x1509, /* Steinberg UR22 */ QUIRK_FLAG_GENERIC_IMPLICIT_FB), DEVICE_FLG(0x0499, 0x3108, /* Yamaha YIT-W12TX */ From 676fe1f6f74db988191dab5df3bf256908177072 Mon Sep 17 00:00:00 2001 From: Joe Hattori Date: Thu, 5 Dec 2024 19:30:14 +0900 Subject: [PATCH 061/330] ata: sata_highbank: fix OF node reference leak in highbank_initialize_phys() The OF node reference obtained by of_parse_phandle_with_args() is not released on early return. Add a of_node_put() call before returning. Fixes: 8996b89d6bc9 ("ata: add platform driver for Calxeda AHCI controller") Signed-off-by: Joe Hattori Signed-off-by: Damien Le Moal --- drivers/ata/sata_highbank.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/ata/sata_highbank.c b/drivers/ata/sata_highbank.c index b1b40e9551ded..c8c817c51230a 100644 --- a/drivers/ata/sata_highbank.c +++ b/drivers/ata/sata_highbank.c @@ -348,6 +348,7 @@ static int highbank_initialize_phys(struct device *dev, void __iomem *addr) phy_nodes[phy] = phy_data.np; cphy_base[phy] = of_iomap(phy_nodes[phy], 0); if (cphy_base[phy] == NULL) { + of_node_put(phy_data.np); return 0; } phy_count += 1; From 7c005292e20ac53dfa601bf2a7375fd4815511ad Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Sat, 7 Dec 2024 14:37:53 +0100 Subject: [PATCH 062/330] ALSA: hda/ca0132: Use standard HD-audio quirk matching helpers CA0132 used the PCI SSID lookup helper that doesn't support the model string matching or quirk aliasing. Replace it with the standard HD-audio quirk helpers for supporting those, and add the definition of the model strings for supported quirks, too. There should be no visible change to the outside for the working system, but the driver will parse the model option and apply the quirk based on it from now on. Link: https://patch.msgid.link/20241207133754.3658-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_ca0132.c | 37 ++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/sound/pci/hda/patch_ca0132.c b/sound/pci/hda/patch_ca0132.c index e4673a71551a3..d40197fb5fbd5 100644 --- a/sound/pci/hda/patch_ca0132.c +++ b/sound/pci/hda/patch_ca0132.c @@ -1134,7 +1134,6 @@ struct ca0132_spec { struct hda_codec *codec; struct delayed_work unsol_hp_work; - int quirk; #ifdef ENABLE_TUNING_CONTROLS long cur_ctl_vals[TUNING_CTLS_COUNT]; @@ -1166,7 +1165,6 @@ struct ca0132_spec { * CA0132 quirks table */ enum { - QUIRK_NONE, QUIRK_ALIENWARE, QUIRK_ALIENWARE_M17XR4, QUIRK_SBZ, @@ -1176,10 +1174,11 @@ enum { QUIRK_R3D, QUIRK_AE5, QUIRK_AE7, + QUIRK_NONE = HDA_FIXUP_ID_NOT_SET, }; #ifdef CONFIG_PCI -#define ca0132_quirk(spec) ((spec)->quirk) +#define ca0132_quirk(spec) ((spec)->codec->fixup_id) #define ca0132_use_pci_mmio(spec) ((spec)->use_pci_mmio) #define ca0132_use_alt_functions(spec) ((spec)->use_alt_functions) #define ca0132_use_alt_controls(spec) ((spec)->use_alt_controls) @@ -1293,7 +1292,7 @@ static const struct hda_pintbl ae7_pincfgs[] = { {} }; -static const struct snd_pci_quirk ca0132_quirks[] = { +static const struct hda_quirk ca0132_quirks[] = { SND_PCI_QUIRK(0x1028, 0x057b, "Alienware M17x R4", QUIRK_ALIENWARE_M17XR4), SND_PCI_QUIRK(0x1028, 0x0685, "Alienware 15 2015", QUIRK_ALIENWARE), SND_PCI_QUIRK(0x1028, 0x0688, "Alienware 17 2015", QUIRK_ALIENWARE), @@ -1316,6 +1315,19 @@ static const struct snd_pci_quirk ca0132_quirks[] = { {} }; +static const struct hda_model_fixup ca0132_quirk_models[] = { + { .id = QUIRK_ALIENWARE, .name = "alienware" }, + { .id = QUIRK_ALIENWARE_M17XR4, .name = "alienware-m17xr4" }, + { .id = QUIRK_SBZ, .name = "sbz" }, + { .id = QUIRK_ZXR, .name = "zxr" }, + { .id = QUIRK_ZXR_DBPRO, .name = "zxr-dbpro" }, + { .id = QUIRK_R3DI, .name = "r3di" }, + { .id = QUIRK_R3D, .name = "r3d" }, + { .id = QUIRK_AE5, .name = "ae5" }, + { .id = QUIRK_AE7, .name = "ae7" }, + {} +}; + /* Output selection quirk info structures. */ #define MAX_QUIRK_MMIO_GPIO_SET_VALS 3 #define MAX_QUIRK_SCP_SET_VALS 2 @@ -9957,17 +9969,15 @@ static int ca0132_prepare_verbs(struct hda_codec *codec) */ static void sbz_detect_quirk(struct hda_codec *codec) { - struct ca0132_spec *spec = codec->spec; - switch (codec->core.subsystem_id) { case 0x11020033: - spec->quirk = QUIRK_ZXR; + codec->fixup_id = QUIRK_ZXR; break; case 0x1102003f: - spec->quirk = QUIRK_ZXR_DBPRO; + codec->fixup_id = QUIRK_ZXR_DBPRO; break; default: - spec->quirk = QUIRK_SBZ; + codec->fixup_id = QUIRK_SBZ; break; } } @@ -9976,7 +9986,6 @@ static int patch_ca0132(struct hda_codec *codec) { struct ca0132_spec *spec; int err; - const struct snd_pci_quirk *quirk; codec_dbg(codec, "patch_ca0132\n"); @@ -9987,11 +9996,7 @@ static int patch_ca0132(struct hda_codec *codec) spec->codec = codec; /* Detect codec quirk */ - quirk = snd_pci_quirk_lookup(codec->bus->pci, ca0132_quirks); - if (quirk) - spec->quirk = quirk->value; - else - spec->quirk = QUIRK_NONE; + snd_hda_pick_fixup(codec, ca0132_quirk_models, ca0132_quirks, NULL); if (ca0132_quirk(spec) == QUIRK_SBZ) sbz_detect_quirk(codec); @@ -10068,7 +10073,7 @@ static int patch_ca0132(struct hda_codec *codec) spec->mem_base = pci_iomap(codec->bus->pci, 2, 0xC20); if (spec->mem_base == NULL) { codec_warn(codec, "pci_iomap failed! Setting quirk to QUIRK_NONE."); - spec->quirk = QUIRK_NONE; + codec->fixup_id = QUIRK_NONE; } } #endif From 50db91fccea0da5c669bc68e2429e8de303758d3 Mon Sep 17 00:00:00 2001 From: Vasiliy Kovalev Date: Sat, 7 Dec 2024 23:18:36 +0300 Subject: [PATCH 063/330] ALSA: hda/realtek: Add new alc2xx-fixup-headset-mic model Introduces the alc2xx-fixup-headset-mic model to simplify enabling headset microphones on ALC2XX codecs. Many recent configurations, as well as older systems that lacked this fix for a long time, leave headset microphones inactive by default. This addition provides a flexible workaround using the existing ALC2XX_FIXUP_HEADSET_MIC quirk. Signed-off-by: Vasiliy Kovalev Link: https://patch.msgid.link/20241207201836.6879-1-kovalev@altlinux.org Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index f2686a4577d7c..ed0146e256369 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -11200,6 +11200,7 @@ static const struct hda_model_fixup alc269_fixup_models[] = { {.id = ALC255_FIXUP_ACER_HEADPHONE_AND_MIC, .name = "alc255-acer-headphone-and-mic"}, {.id = ALC285_FIXUP_HP_GPIO_AMP_INIT, .name = "alc285-hp-amp-init"}, {.id = ALC236_FIXUP_LENOVO_INV_DMIC, .name = "alc236-fixup-lenovo-inv-mic"}, + {.id = ALC2XX_FIXUP_HEADSET_MIC, .name = "alc2xx-fixup-headset-mic"}, {} }; #define ALC225_STANDARD_PINS \ From 2ab0837cb91b7de507daa145d17b3b6b2efb3abf Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Sun, 8 Dec 2024 13:34:13 -0500 Subject: [PATCH 064/330] efivarfs: Fix error on non-existent file When looking up a non-existent file, efivarfs returns -EINVAL if the file does not conform to the NAME-GUID format and -ENOENT if it does. This is caused by efivars_d_hash() returning -EINVAL if the name is not formatted correctly. This error is returned before simple_lookup() returns a negative dentry, and is the error value that the user sees. Fix by removing this check. If the file does not exist, simple_lookup() will return a negative dentry leading to -ENOENT and efivarfs_create() already has a validity check before it creates an entry (and will correctly return -EINVAL) Signed-off-by: James Bottomley Cc: [ardb: make efivarfs_valid_name() static] Signed-off-by: Ard Biesheuvel --- fs/efivarfs/inode.c | 2 +- fs/efivarfs/internal.h | 1 - fs/efivarfs/super.c | 3 --- 3 files changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c index 586446e02ef72..ec23da8405ff8 100644 --- a/fs/efivarfs/inode.c +++ b/fs/efivarfs/inode.c @@ -51,7 +51,7 @@ struct inode *efivarfs_get_inode(struct super_block *sb, * * VariableName-12345678-1234-1234-1234-1234567891bc */ -bool efivarfs_valid_name(const char *str, int len) +static bool efivarfs_valid_name(const char *str, int len) { const char *s = str + len - EFI_VARIABLE_GUID_LEN; diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h index d71d2e08422f0..74f0602a9e016 100644 --- a/fs/efivarfs/internal.h +++ b/fs/efivarfs/internal.h @@ -60,7 +60,6 @@ bool efivar_variable_is_removable(efi_guid_t vendor, const char *name, extern const struct file_operations efivarfs_file_operations; extern const struct inode_operations efivarfs_dir_inode_operations; -extern bool efivarfs_valid_name(const char *str, int len); extern struct inode *efivarfs_get_inode(struct super_block *sb, const struct inode *dir, int mode, dev_t dev, bool is_removable); diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index a929f1b613be8..beba15673be8d 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -144,9 +144,6 @@ static int efivarfs_d_hash(const struct dentry *dentry, struct qstr *qstr) const unsigned char *s = qstr->name; unsigned int len = qstr->len; - if (!efivarfs_valid_name(s, len)) - return -EINVAL; - while (len-- > EFI_VARIABLE_GUID_LEN) hash = partial_name_hash(*s++, hash); From c84bd6c810d1880194fea2229c7086e4b73fddc1 Mon Sep 17 00:00:00 2001 From: Adrian Ratiu Date: Mon, 9 Dec 2024 11:05:28 +0200 Subject: [PATCH 065/330] sound: usb: enable DSD output for ddHiFi TC44C This is a UAC 2 DAC capable of raw DSD on intf 2 alt 4: Bus 007 Device 004: ID 262a:9302 SAVITECH Corp. TC44C Device Descriptor: bLength 18 bDescriptorType 1 bcdUSB 2.00 bDeviceClass 239 Miscellaneous Device bDeviceSubClass 2 [unknown] bDeviceProtocol 1 Interface Association bMaxPacketSize0 64 idVendor 0x262a SAVITECH Corp. idProduct 0x9302 TC44C bcdDevice 0.01 iManufacturer 1 DDHIFI iProduct 2 TC44C iSerial 6 5000000001 ....... Interface Descriptor: bLength 9 bDescriptorType 4 bInterfaceNumber 2 bAlternateSetting 4 bNumEndpoints 2 bInterfaceClass 1 Audio bInterfaceSubClass 2 Streaming bInterfaceProtocol 32 iInterface 0 AudioStreaming Interface Descriptor: bLength 16 bDescriptorType 36 bDescriptorSubtype 1 (AS_GENERAL) bTerminalLink 3 bmControls 0x00 bFormatType 1 bmFormats 0x80000000 bNrChannels 2 bmChannelConfig 0x00000000 iChannelNames 0 ....... Signed-off-by: Adrian Ratiu Link: https://patch.msgid.link/20241209090529.16134-1-adrian.ratiu@collabora.com Signed-off-by: Takashi Iwai --- sound/usb/quirks.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index a0767de7f1b7e..8ba0aff8be2ec 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -2325,6 +2325,8 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = { QUIRK_FLAG_DSD_RAW), DEVICE_FLG(0x2522, 0x0007, /* LH Labs Geek Out HD Audio 1V5 */ QUIRK_FLAG_SET_IFACE_FIRST), + DEVICE_FLG(0x262a, 0x9302, /* ddHiFi TC44C */ + QUIRK_FLAG_DSD_RAW), DEVICE_FLG(0x2708, 0x0002, /* Audient iD14 */ QUIRK_FLAG_IGNORE_CTL_ERROR), DEVICE_FLG(0x2912, 0x30c8, /* Audioengine D1 */ From b50a3e98442b8d72f061617c7f7a71f7dba19484 Mon Sep 17 00:00:00 2001 From: Adrian Ratiu Date: Mon, 9 Dec 2024 11:05:29 +0200 Subject: [PATCH 066/330] sound: usb: format: don't warn that raw DSD is unsupported UAC 2 & 3 DAC's set bit 31 of the format to signal support for a RAW_DATA type, typically used for DSD playback. This is correctly tested by (format & UAC*_FORMAT_TYPE_I_RAW_DATA), fp->dsd_raw = true; and call snd_usb_interface_dsd_format_quirks(), however a confusing and unnecessary message gets printed because the bit is not properly tested in the last "unsupported" if test: if (format & ~0x3F) { ... } For example the output: usb 7-1: new high-speed USB device number 5 using xhci_hcd usb 7-1: New USB device found, idVendor=262a, idProduct=9302, bcdDevice=0.01 usb 7-1: New USB device strings: Mfr=1, Product=2, SerialNumber=6 usb 7-1: Product: TC44C usb 7-1: Manufacturer: TC44C usb 7-1: SerialNumber: 5000000001 hid-generic 0003:262A:9302.001E: No inputs registered, leaving hid-generic 0003:262A:9302.001E: hidraw6: USB HID v1.00 Device [DDHIFI TC44C] on usb-0000:08:00.3-1/input0 usb 7-1: 2:4 : unsupported format bits 0x100000000 This last "unsupported format" is actually wrong: we know the format is a RAW_DATA which we assume is DSD, so there is no need to print the confusing message. This we unset bit 31 of the format after recognizing it, to avoid the message. Suggested-by: Takashi Iwai Signed-off-by: Adrian Ratiu Link: https://patch.msgid.link/20241209090529.16134-2-adrian.ratiu@collabora.com Signed-off-by: Takashi Iwai --- sound/usb/format.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sound/usb/format.c b/sound/usb/format.c index 0cbf1d4fbe6ed..6049d957694ca 100644 --- a/sound/usb/format.c +++ b/sound/usb/format.c @@ -60,6 +60,8 @@ static u64 parse_audio_format_i_type(struct snd_usb_audio *chip, pcm_formats |= SNDRV_PCM_FMTBIT_SPECIAL; /* flag potentially raw DSD capable altsettings */ fp->dsd_raw = true; + /* clear special format bit to avoid "unsupported format" msg below */ + format &= ~UAC2_FORMAT_TYPE_I_RAW_DATA; } format <<= 1; @@ -71,8 +73,11 @@ static u64 parse_audio_format_i_type(struct snd_usb_audio *chip, sample_width = as->bBitResolution; sample_bytes = as->bSubslotSize; - if (format & UAC3_FORMAT_TYPE_I_RAW_DATA) + if (format & UAC3_FORMAT_TYPE_I_RAW_DATA) { pcm_formats |= SNDRV_PCM_FMTBIT_SPECIAL; + /* clear special format bit to avoid "unsupported format" msg below */ + format &= ~UAC3_FORMAT_TYPE_I_RAW_DATA; + } format <<= 1; break; From 70ec2e8be72c8cb71eb6a18f223484d2a39b708f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Wed, 20 Nov 2024 18:41:20 +0200 Subject: [PATCH 067/330] drm/i915/dsb: Don't use indexed register writes needlessly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Turns out the DSB indexed register write command has rather significant initial overhead compared to the normal MMIO write command. Based on some quick experiments on TGL you have to write the register at least ~5 times for the indexed write command to come out ahead. If you write the register less times than that the MMIO write is faster. So it seems my automagic indexed write logic was a bit misguided. Go back to the original approach only use indexed writes for the cases we know will benefit from it (indexed LUT register updates). Currently we shouldn't have any cases where this truly matters (just some rare double writes to the precision LUT index registers), but we will need to switch the legacy LUT updates to write each LUT register twice (to avoid some palette anti-collision logic troubles). This would be close to the worst case for using indexed writes (two writes per register, and 256 separate registers). Using the MMIO write command should shave off around 30% of the execution time compared to using the indexed write command. Cc: stable@vger.kernel.org Fixes: 34d8311f4a1c ("drm/i915/dsb: Re-instate DSB for LUT updates") Fixes: 25ea3411bd23 ("drm/i915/dsb: Use non-posted register writes for legacy LUT") Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20241120164123.12706-2-ville.syrjala@linux.intel.com Reviewed-by: Uma Shankar (cherry picked from commit ecba559a88ab8399a41893d7828caf4dccbeab6c) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/display/intel_color.c | 51 +++++++++++++--------- drivers/gpu/drm/i915/display/intel_dsb.c | 19 ++++++-- drivers/gpu/drm/i915/display/intel_dsb.h | 2 + 3 files changed, 49 insertions(+), 23 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_color.c b/drivers/gpu/drm/i915/display/intel_color.c index 174753625bcad..6ea3d5c58cb1b 100644 --- a/drivers/gpu/drm/i915/display/intel_color.c +++ b/drivers/gpu/drm/i915/display/intel_color.c @@ -1343,6 +1343,17 @@ static void ilk_lut_write(const struct intel_crtc_state *crtc_state, intel_de_write_fw(display, reg, val); } +static void ilk_lut_write_indexed(const struct intel_crtc_state *crtc_state, + i915_reg_t reg, u32 val) +{ + struct intel_display *display = to_intel_display(crtc_state); + + if (crtc_state->dsb_color_vblank) + intel_dsb_reg_write_indexed(crtc_state->dsb_color_vblank, reg, val); + else + intel_de_write_fw(display, reg, val); +} + static void ilk_load_lut_8(const struct intel_crtc_state *crtc_state, const struct drm_property_blob *blob) { @@ -1458,8 +1469,8 @@ static void bdw_load_lut_10(const struct intel_crtc_state *crtc_state, prec_index); for (i = 0; i < lut_size; i++) - ilk_lut_write(crtc_state, PREC_PAL_DATA(pipe), - ilk_lut_10(&lut[i])); + ilk_lut_write_indexed(crtc_state, PREC_PAL_DATA(pipe), + ilk_lut_10(&lut[i])); /* * Reset the index, otherwise it prevents the legacy palette to be @@ -1612,16 +1623,16 @@ static void glk_load_degamma_lut(const struct intel_crtc_state *crtc_state, * ToDo: Extend to max 7.0. Enable 32 bit input value * as compared to just 16 to achieve this. */ - ilk_lut_write(crtc_state, PRE_CSC_GAMC_DATA(pipe), - DISPLAY_VER(display) >= 14 ? - mtl_degamma_lut(&lut[i]) : glk_degamma_lut(&lut[i])); + ilk_lut_write_indexed(crtc_state, PRE_CSC_GAMC_DATA(pipe), + DISPLAY_VER(display) >= 14 ? + mtl_degamma_lut(&lut[i]) : glk_degamma_lut(&lut[i])); } /* Clamp values > 1.0. */ while (i++ < glk_degamma_lut_size(display)) - ilk_lut_write(crtc_state, PRE_CSC_GAMC_DATA(pipe), - DISPLAY_VER(display) >= 14 ? - 1 << 24 : 1 << 16); + ilk_lut_write_indexed(crtc_state, PRE_CSC_GAMC_DATA(pipe), + DISPLAY_VER(display) >= 14 ? + 1 << 24 : 1 << 16); ilk_lut_write(crtc_state, PRE_CSC_GAMC_INDEX(pipe), 0); } @@ -1687,10 +1698,10 @@ icl_program_gamma_superfine_segment(const struct intel_crtc_state *crtc_state) for (i = 0; i < 9; i++) { const struct drm_color_lut *entry = &lut[i]; - ilk_lut_write(crtc_state, PREC_PAL_MULTI_SEG_DATA(pipe), - ilk_lut_12p4_ldw(entry)); - ilk_lut_write(crtc_state, PREC_PAL_MULTI_SEG_DATA(pipe), - ilk_lut_12p4_udw(entry)); + ilk_lut_write_indexed(crtc_state, PREC_PAL_MULTI_SEG_DATA(pipe), + ilk_lut_12p4_ldw(entry)); + ilk_lut_write_indexed(crtc_state, PREC_PAL_MULTI_SEG_DATA(pipe), + ilk_lut_12p4_udw(entry)); } ilk_lut_write(crtc_state, PREC_PAL_MULTI_SEG_INDEX(pipe), @@ -1726,10 +1737,10 @@ icl_program_gamma_multi_segment(const struct intel_crtc_state *crtc_state) for (i = 1; i < 257; i++) { entry = &lut[i * 8]; - ilk_lut_write(crtc_state, PREC_PAL_DATA(pipe), - ilk_lut_12p4_ldw(entry)); - ilk_lut_write(crtc_state, PREC_PAL_DATA(pipe), - ilk_lut_12p4_udw(entry)); + ilk_lut_write_indexed(crtc_state, PREC_PAL_DATA(pipe), + ilk_lut_12p4_ldw(entry)); + ilk_lut_write_indexed(crtc_state, PREC_PAL_DATA(pipe), + ilk_lut_12p4_udw(entry)); } /* @@ -1747,10 +1758,10 @@ icl_program_gamma_multi_segment(const struct intel_crtc_state *crtc_state) for (i = 0; i < 256; i++) { entry = &lut[i * 8 * 128]; - ilk_lut_write(crtc_state, PREC_PAL_DATA(pipe), - ilk_lut_12p4_ldw(entry)); - ilk_lut_write(crtc_state, PREC_PAL_DATA(pipe), - ilk_lut_12p4_udw(entry)); + ilk_lut_write_indexed(crtc_state, PREC_PAL_DATA(pipe), + ilk_lut_12p4_ldw(entry)); + ilk_lut_write_indexed(crtc_state, PREC_PAL_DATA(pipe), + ilk_lut_12p4_udw(entry)); } ilk_lut_write(crtc_state, PREC_PAL_INDEX(pipe), diff --git a/drivers/gpu/drm/i915/display/intel_dsb.c b/drivers/gpu/drm/i915/display/intel_dsb.c index b7b44399adaae..4d3785f5cb525 100644 --- a/drivers/gpu/drm/i915/display/intel_dsb.c +++ b/drivers/gpu/drm/i915/display/intel_dsb.c @@ -273,16 +273,20 @@ static bool intel_dsb_prev_ins_is_indexed_write(struct intel_dsb *dsb, i915_reg_ } /** - * intel_dsb_reg_write() - Emit register wriite to the DSB context + * intel_dsb_reg_write_indexed() - Emit register wriite to the DSB context * @dsb: DSB context * @reg: register address. * @val: value. * * This function is used for writing register-value pair in command * buffer of DSB. + * + * Note that indexed writes are slower than normal MMIO writes + * for a small number (less than 5 or so) of writes to the same + * register. */ -void intel_dsb_reg_write(struct intel_dsb *dsb, - i915_reg_t reg, u32 val) +void intel_dsb_reg_write_indexed(struct intel_dsb *dsb, + i915_reg_t reg, u32 val) { /* * For example the buffer will look like below for 3 dwords for auto @@ -340,6 +344,15 @@ void intel_dsb_reg_write(struct intel_dsb *dsb, } } +void intel_dsb_reg_write(struct intel_dsb *dsb, + i915_reg_t reg, u32 val) +{ + intel_dsb_emit(dsb, val, + (DSB_OPCODE_MMIO_WRITE << DSB_OPCODE_SHIFT) | + (DSB_BYTE_EN << DSB_BYTE_EN_SHIFT) | + i915_mmio_reg_offset(reg)); +} + static u32 intel_dsb_mask_to_byte_en(u32 mask) { return (!!(mask & 0xff000000) << 3 | diff --git a/drivers/gpu/drm/i915/display/intel_dsb.h b/drivers/gpu/drm/i915/display/intel_dsb.h index 33e0fc2ab3809..da6df07a3c839 100644 --- a/drivers/gpu/drm/i915/display/intel_dsb.h +++ b/drivers/gpu/drm/i915/display/intel_dsb.h @@ -34,6 +34,8 @@ void intel_dsb_finish(struct intel_dsb *dsb); void intel_dsb_cleanup(struct intel_dsb *dsb); void intel_dsb_reg_write(struct intel_dsb *dsb, i915_reg_t reg, u32 val); +void intel_dsb_reg_write_indexed(struct intel_dsb *dsb, + i915_reg_t reg, u32 val); void intel_dsb_reg_write_masked(struct intel_dsb *dsb, i915_reg_t reg, u32 mask, u32 val); void intel_dsb_noop(struct intel_dsb *dsb, int count); From cd3da567e2e46b8f75549637b960a83b024d6b6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Wed, 20 Nov 2024 18:41:21 +0200 Subject: [PATCH 068/330] drm/i915/color: Stop using non-posted DSB writes for legacy LUT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DSB LUT register writes vs. palette anti-collision logic appear to interact in interesting ways: - posted DSB writes simply vanish into thin air while anti-collision is active - non-posted DSB writes actually get blocked by the anti-collision logic, but unfortunately this ends up hogging the bus for long enough that unrelated parallel CPU MMIO accesses start to disappear instead Even though we are updating the LUT during vblank we aren't immune to the anti-collision logic because it kicks in briefly for pipe prefill (initiated at frame start). The safe time window for performing the LUT update is thus between the undelayed vblank and frame start. Turns out that with low enough CDCLK frequency (DSB execution speed depends on CDCLK) we can exceed that. As we are currently using non-posted writes for the legacy LUT updates, in which case we can hit the far more severe failure mode. The problem is exacerbated by the fact that non-posted writes are much slower than posted writes (~4x it seems). To mititage the problem let's switch to using posted DSB writes for legacy LUT updates (which will involve using the double write approach to avoid other problems with DSB vs. legacy LUT writes). Despite writing each register twice this will in fact make the legacy LUT update faster when compared to the non-posted write approach, making the problem less likely to appear. The failure mode is also less severe. This isn't the 100% solution we need though. That will involve estimating how long the LUT update will take, and pushing frame start and/or delayed vblank forward to guarantee that the update will have finished by the time the pipe prefill starts... Cc: stable@vger.kernel.org Fixes: 34d8311f4a1c ("drm/i915/dsb: Re-instate DSB for LUT updates") Fixes: 25ea3411bd23 ("drm/i915/dsb: Use non-posted register writes for legacy LUT") Closes: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/12494 Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20241120164123.12706-3-ville.syrjala@linux.intel.com Reviewed-by: Uma Shankar (cherry picked from commit 2504a316b35d49522f39cf0dc01830d7c36a9be4) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/display/intel_color.c | 30 ++++++++++++++-------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_color.c b/drivers/gpu/drm/i915/display/intel_color.c index 6ea3d5c58cb1b..7cd902bbd244e 100644 --- a/drivers/gpu/drm/i915/display/intel_color.c +++ b/drivers/gpu/drm/i915/display/intel_color.c @@ -1368,19 +1368,29 @@ static void ilk_load_lut_8(const struct intel_crtc_state *crtc_state, lut = blob->data; /* - * DSB fails to correctly load the legacy LUT - * unless we either write each entry twice, - * or use non-posted writes + * DSB fails to correctly load the legacy LUT unless + * we either write each entry twice when using posted + * writes, or we use non-posted writes. + * + * If palette anti-collision is active during LUT + * register writes: + * - posted writes simply get dropped and thus the LUT + * contents may not be correctly updated + * - non-posted writes are blocked and thus the LUT + * contents are always correct, but simultaneous CPU + * MMIO access will start to fail + * + * Choose the lesser of two evils and use posted writes. + * Using posted writes is also faster, even when having + * to write each register twice. */ - if (crtc_state->dsb_color_vblank) - intel_dsb_nonpost_start(crtc_state->dsb_color_vblank); - - for (i = 0; i < 256; i++) + for (i = 0; i < 256; i++) { ilk_lut_write(crtc_state, LGC_PALETTE(pipe, i), i9xx_lut_8(&lut[i])); - - if (crtc_state->dsb_color_vblank) - intel_dsb_nonpost_end(crtc_state->dsb_color_vblank); + if (crtc_state->dsb_color_vblank) + ilk_lut_write(crtc_state, LGC_PALETTE(pipe, i), + i9xx_lut_8(&lut[i])); + } } static void ilk_load_lut_10(const struct intel_crtc_state *crtc_state, From da0b986256ae9a78b0215214ff44f271bfe237c1 Mon Sep 17 00:00:00 2001 From: Eugene Kobyak Date: Tue, 3 Dec 2024 14:54:06 +0000 Subject: [PATCH 069/330] drm/i915: Fix NULL pointer dereference in capture_engine When the intel_context structure contains NULL, it raises a NULL pointer dereference error in drm_info(). Fixes: e8a3319c31a1 ("drm/i915: Allow error capture without a request") Closes: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/12309 Reviewed-by: Andi Shyti Cc: John Harrison Cc: # v6.3+ Signed-off-by: Eugene Kobyak Signed-off-by: Andi Shyti Link: https://patchwork.freedesktop.org/patch/msgid/xmsgfynkhycw3cf56akp4he2ffg44vuratocsysaowbsnhutzi@augnqbm777at (cherry picked from commit 754302a5bc1bd8fd3b7d85c168b0a1af6d4bba4d) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/i915_gpu_error.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 4eb58887819a1..71c0daef19962 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -1643,9 +1643,21 @@ capture_engine(struct intel_engine_cs *engine, return NULL; intel_engine_get_hung_entity(engine, &ce, &rq); - if (rq && !i915_request_started(rq)) - drm_info(&engine->gt->i915->drm, "Got hung context on %s with active request %lld:%lld [0x%04X] not yet started\n", - engine->name, rq->fence.context, rq->fence.seqno, ce->guc_id.id); + if (rq && !i915_request_started(rq)) { + /* + * We want to know also what is the guc_id of the context, + * but if we don't have the context reference, then skip + * printing it. + */ + if (ce) + drm_info(&engine->gt->i915->drm, + "Got hung context on %s with active request %lld:%lld [0x%04X] not yet started\n", + engine->name, rq->fence.context, rq->fence.seqno, ce->guc_id.id); + else + drm_info(&engine->gt->i915->drm, + "Got hung context on %s with active request %lld:%lld not yet started\n", + engine->name, rq->fence.context, rq->fence.seqno); + } if (rq) { capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL); From 2828e5808bcd5aae7fdcd169cac1efa2701fa2dd Mon Sep 17 00:00:00 2001 From: Jiasheng Jiang Date: Wed, 27 Nov 2024 20:10:42 +0000 Subject: [PATCH 070/330] drm/i915: Fix memory leak by correcting cache object name in error handler Replace "slab_priorities" with "slab_dependencies" in the error handler to avoid memory leak. Fixes: 32eb6bcfdda9 ("drm/i915: Make request allocation caches global") Cc: # v5.2+ Signed-off-by: Jiasheng Jiang Reviewed-by: Nirmoy Das Reviewed-by: Andi Shyti Signed-off-by: Andi Shyti Link: https://patchwork.freedesktop.org/patch/msgid/20241127201042.29620-1-jiashengjiangcool@gmail.com (cherry picked from commit 9bc5e7dc694d3112bbf0fa4c46ef0fa0f114937a) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/i915_scheduler.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c index 762127dd56c53..70a854557e6ec 100644 --- a/drivers/gpu/drm/i915/i915_scheduler.c +++ b/drivers/gpu/drm/i915/i915_scheduler.c @@ -506,6 +506,6 @@ int __init i915_scheduler_module_init(void) return 0; err_priorities: - kmem_cache_destroy(slab_priorities); + kmem_cache_destroy(slab_dependencies); return -ENOMEM; } From 493afbd187c4c9cc1642792c0d9ba400c3d6d90d Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Thu, 28 Nov 2024 12:59:54 +0530 Subject: [PATCH 071/330] sched/fair: Fix NEXT_BUDDY Adam reports that enabling NEXT_BUDDY insta triggers a WARN in pick_next_entity(). Moving clear_buddies() up before the delayed dequeue bits ensures no ->next buddy becomes delayed. Further ensure no new ->next buddy ever starts as delayed. Fixes: 152e11f6df29 ("sched/fair: Implement delayed dequeue") Reported-by: Adam Li Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Tested-by: Adam Li Link: https://lkml.kernel.org/r/670a0d54-e398-4b1f-8a6e-90784e2fdf89@amd.com --- kernel/sched/fair.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 05b8f1eb2c149..9d7a2dd2c2606 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5478,6 +5478,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) bool sleep = flags & DEQUEUE_SLEEP; update_curr(cfs_rq); + clear_buddies(cfs_rq, se); if (flags & DEQUEUE_DELAYED) { SCHED_WARN_ON(!se->sched_delayed); @@ -5494,8 +5495,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (sched_feat(DELAY_DEQUEUE) && delay && !entity_eligible(cfs_rq, se)) { - if (cfs_rq->next == se) - cfs_rq->next = NULL; update_load_avg(cfs_rq, se, 0); se->sched_delayed = 1; return false; @@ -5520,8 +5519,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_stats_dequeue_fair(cfs_rq, se, flags); - clear_buddies(cfs_rq, se); - update_entity_lag(cfs_rq, se); if (sched_feat(PLACE_REL_DEADLINE) && !sleep) { se->deadline -= se->vruntime; @@ -8774,7 +8771,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int if (unlikely(throttled_hierarchy(cfs_rq_of(pse)))) return; - if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK)) { + if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK) && !pse->sched_delayed) { set_next_buddy(pse); } From c1f43c342e1f2e32f0620bf2e972e2a9ea0a1e60 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 2 Dec 2024 18:45:56 +0100 Subject: [PATCH 072/330] sched/fair: Fix sched_can_stop_tick() for fair tasks We can't stop the tick of a rq if there are at least 2 tasks enqueued in the whole hierarchy and not only at the root cfs rq. rq->cfs.nr_running tracks the number of sched_entity at one level whereas rq->cfs.h_nr_running tracks all queued tasks in the hierarchy. Fixes: 11cc374f4643b ("sched_ext: Simplify scx_can_stop_tick() invocation in sched_can_stop_tick()") Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Link: https://lore.kernel.org/r/20241202174606.4074512-2-vincent.guittot@linaro.org --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c6d8232ad9eea..3e5a6bf587f91 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1341,7 +1341,7 @@ bool sched_can_stop_tick(struct rq *rq) if (scx_enabled() && !scx_can_stop_tick(rq)) return false; - if (rq->cfs.nr_running > 1) + if (rq->cfs.h_nr_running > 1) return false; /* From 76f2f783294d7d55c2564e2dfb0a7279ba0bc264 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 2 Dec 2024 18:45:57 +0100 Subject: [PATCH 073/330] sched/eevdf: More PELT vs DELAYED_DEQUEUE Vincent and Dietmar noted that while commit fc1892becd56 ("sched/eevdf: Fixup PELT vs DELAYED_DEQUEUE") fixes the entity runnable stats, it does not adjust the cfs_rq runnable stats, which are based off of h_nr_running. Track h_nr_delayed such that we can discount those and adjust the signal. Fixes: fc1892becd56 ("sched/eevdf: Fixup PELT vs DELAYED_DEQUEUE") Closes: https://lore.kernel.org/lkml/a9a45193-d0c6-4ba2-a822-464ad30b550e@arm.com/ Closes: https://lore.kernel.org/lkml/CAKfTPtCNUvWE_GX5LyvTF-WdxUT=ZgvZZv-4t=eWntg5uOFqiQ@mail.gmail.com/ [ Fixes checkpatch warnings and rebased ] Signed-off-by: Peter Zijlstra (Intel) Reported-by: Dietmar Eggemann Reported-by: Vincent Guittot Signed-off-by: "Peter Zijlstra (Intel)" Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Tested-by: K Prateek Nayak Link: https://lore.kernel.org/r/20241202174606.4074512-3-vincent.guittot@linaro.org --- kernel/sched/debug.c | 1 + kernel/sched/fair.c | 51 +++++++++++++++++++++++++++++++++++++++----- kernel/sched/pelt.c | 2 +- kernel/sched/sched.h | 8 +++++-- 4 files changed, 54 insertions(+), 8 deletions(-) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index a48b2a701ec27..a1be00a988bf6 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -845,6 +845,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread)); SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running); + SEQ_printf(m, " .%-30s: %d\n", "h_nr_delayed", cfs_rq->h_nr_delayed); SEQ_printf(m, " .%-30s: %d\n", "idle_nr_running", cfs_rq->idle_nr_running); SEQ_printf(m, " .%-30s: %d\n", "idle_h_nr_running", diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9d7a2dd2c2606..97ee48c8bf5ed 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5465,9 +5465,33 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); -static inline void finish_delayed_dequeue_entity(struct sched_entity *se) +static void set_delayed(struct sched_entity *se) +{ + se->sched_delayed = 1; + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + cfs_rq->h_nr_delayed++; + if (cfs_rq_throttled(cfs_rq)) + break; + } +} + +static void clear_delayed(struct sched_entity *se) { se->sched_delayed = 0; + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + cfs_rq->h_nr_delayed--; + if (cfs_rq_throttled(cfs_rq)) + break; + } +} + +static inline void finish_delayed_dequeue_entity(struct sched_entity *se) +{ + clear_delayed(se); if (sched_feat(DELAY_ZERO) && se->vlag > 0) se->vlag = 0; } @@ -5496,7 +5520,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (sched_feat(DELAY_DEQUEUE) && delay && !entity_eligible(cfs_rq, se)) { update_load_avg(cfs_rq, se, 0); - se->sched_delayed = 1; + set_delayed(se); return false; } } @@ -5908,7 +5932,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - long task_delta, idle_task_delta, dequeue = 1; + long task_delta, idle_task_delta, delayed_delta, dequeue = 1; long rq_h_nr_running = rq->cfs.h_nr_running; raw_spin_lock(&cfs_b->lock); @@ -5941,6 +5965,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) task_delta = cfs_rq->h_nr_running; idle_task_delta = cfs_rq->idle_h_nr_running; + delayed_delta = cfs_rq->h_nr_delayed; for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); int flags; @@ -5964,6 +5989,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) qcfs_rq->h_nr_running -= task_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta; + qcfs_rq->h_nr_delayed -= delayed_delta; if (qcfs_rq->load.weight) { /* Avoid re-evaluating load for this entity: */ @@ -5986,6 +6012,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) qcfs_rq->h_nr_running -= task_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta; + qcfs_rq->h_nr_delayed -= delayed_delta; } /* At this point se is NULL and we are at root level*/ @@ -6011,7 +6038,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - long task_delta, idle_task_delta; + long task_delta, idle_task_delta, delayed_delta; long rq_h_nr_running = rq->cfs.h_nr_running; se = cfs_rq->tg->se[cpu_of(rq)]; @@ -6047,6 +6074,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) task_delta = cfs_rq->h_nr_running; idle_task_delta = cfs_rq->idle_h_nr_running; + delayed_delta = cfs_rq->h_nr_delayed; for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); @@ -6064,6 +6092,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) qcfs_rq->h_nr_running += task_delta; qcfs_rq->idle_h_nr_running += idle_task_delta; + qcfs_rq->h_nr_delayed += delayed_delta; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(qcfs_rq)) @@ -6081,6 +6110,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) qcfs_rq->h_nr_running += task_delta; qcfs_rq->idle_h_nr_running += idle_task_delta; + qcfs_rq->h_nr_delayed += delayed_delta; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(qcfs_rq)) @@ -6934,7 +6964,7 @@ requeue_delayed_entity(struct sched_entity *se) } update_load_avg(cfs_rq, se, 0); - se->sched_delayed = 0; + clear_delayed(se); } /* @@ -6948,6 +6978,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; int idle_h_nr_running = task_has_idle_policy(p); + int h_nr_delayed = 0; int task_new = !(flags & ENQUEUE_WAKEUP); int rq_h_nr_running = rq->cfs.h_nr_running; u64 slice = 0; @@ -6974,6 +7005,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (p->in_iowait) cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); + if (task_new) + h_nr_delayed = !!se->sched_delayed; + for_each_sched_entity(se) { if (se->on_rq) { if (se->sched_delayed) @@ -6996,6 +7030,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running++; cfs_rq->idle_h_nr_running += idle_h_nr_running; + cfs_rq->h_nr_delayed += h_nr_delayed; if (cfs_rq_is_idle(cfs_rq)) idle_h_nr_running = 1; @@ -7019,6 +7054,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running++; cfs_rq->idle_h_nr_running += idle_h_nr_running; + cfs_rq->h_nr_delayed += h_nr_delayed; if (cfs_rq_is_idle(cfs_rq)) idle_h_nr_running = 1; @@ -7081,6 +7117,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) struct task_struct *p = NULL; int idle_h_nr_running = 0; int h_nr_running = 0; + int h_nr_delayed = 0; struct cfs_rq *cfs_rq; u64 slice = 0; @@ -7088,6 +7125,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) p = task_of(se); h_nr_running = 1; idle_h_nr_running = task_has_idle_policy(p); + if (!task_sleep && !task_delayed) + h_nr_delayed = !!se->sched_delayed; } else { cfs_rq = group_cfs_rq(se); slice = cfs_rq_min_slice(cfs_rq); @@ -7105,6 +7144,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) cfs_rq->h_nr_running -= h_nr_running; cfs_rq->idle_h_nr_running -= idle_h_nr_running; + cfs_rq->h_nr_delayed -= h_nr_delayed; if (cfs_rq_is_idle(cfs_rq)) idle_h_nr_running = h_nr_running; @@ -7143,6 +7183,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) cfs_rq->h_nr_running -= h_nr_running; cfs_rq->idle_h_nr_running -= idle_h_nr_running; + cfs_rq->h_nr_delayed -= h_nr_delayed; if (cfs_rq_is_idle(cfs_rq)) idle_h_nr_running = h_nr_running; diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index fc07382361a88..fee75cc2c47b6 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -321,7 +321,7 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq) { if (___update_load_sum(now, &cfs_rq->avg, scale_load_down(cfs_rq->load.weight), - cfs_rq->h_nr_running, + cfs_rq->h_nr_running - cfs_rq->h_nr_delayed, cfs_rq->curr != NULL)) { ___update_load_avg(&cfs_rq->avg, 1); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 76f5f53a645fc..1e494af2cd23d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -649,6 +649,7 @@ struct cfs_rq { unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */ unsigned int idle_nr_running; /* SCHED_IDLE */ unsigned int idle_h_nr_running; /* SCHED_IDLE */ + unsigned int h_nr_delayed; s64 avg_vruntime; u64 avg_load; @@ -898,8 +899,11 @@ struct dl_rq { static inline void se_update_runnable(struct sched_entity *se) { - if (!entity_is_task(se)) - se->runnable_weight = se->my_q->h_nr_running; + if (!entity_is_task(se)) { + struct cfs_rq *cfs_rq = se->my_q; + + se->runnable_weight = cfs_rq->h_nr_running - cfs_rq->h_nr_delayed; + } } static inline long se_runnable(struct sched_entity *se) From 8099b1f7e37e98f73664b883464d54e2e2d9522f Mon Sep 17 00:00:00 2001 From: Janaki Ramaiah Thota Date: Mon, 9 Dec 2024 16:04:52 +0530 Subject: [PATCH 074/330] regulator: dt-bindings: qcom,qca6390-pmu: document wcn6750-pmu Add description of the PMU node for the WCN6750B module. Signed-off-by: Janaki Ramaiah Thota Reviewed-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20241209103455.9675-2-quic_janathot@quicinc.com Signed-off-by: Mark Brown --- .../bindings/regulator/qcom,qca6390-pmu.yaml | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/Documentation/devicetree/bindings/regulator/qcom,qca6390-pmu.yaml b/Documentation/devicetree/bindings/regulator/qcom,qca6390-pmu.yaml index ca401a209cca7..47c425c9fff16 100644 --- a/Documentation/devicetree/bindings/regulator/qcom,qca6390-pmu.yaml +++ b/Documentation/devicetree/bindings/regulator/qcom,qca6390-pmu.yaml @@ -18,6 +18,7 @@ properties: compatible: enum: - qcom,qca6390-pmu + - qcom,wcn6750-pmu - qcom,wcn6855-pmu - qcom,wcn7850-pmu @@ -27,6 +28,9 @@ properties: vddaon-supply: description: VDD_AON supply regulator handle + vddasd-supply: + description: VDD_ASD supply regulator handle + vdddig-supply: description: VDD_DIG supply regulator handle @@ -42,6 +46,9 @@ properties: vddio1p2-supply: description: VDD_IO_1P2 supply regulator handle + vddrfa0p8-supply: + description: VDD_RFA_0P8 supply regulator handle + vddrfa0p95-supply: description: VDD_RFA_0P95 supply regulator handle @@ -51,12 +58,18 @@ properties: vddrfa1p3-supply: description: VDD_RFA_1P3 supply regulator handle + vddrfa1p7-supply: + description: VDD_RFA_1P7 supply regulator handle + vddrfa1p8-supply: description: VDD_RFA_1P8 supply regulator handle vddrfa1p9-supply: description: VDD_RFA_1P9 supply regulator handle + vddrfa2p2-supply: + description: VDD_RFA_2P2 supply regulator handle + vddpcie1p3-supply: description: VDD_PCIE_1P3 supply regulator handle @@ -119,6 +132,20 @@ allOf: - vddpcie1p3-supply - vddpcie1p9-supply - vddio-supply + - if: + properties: + compatible: + contains: + const: qcom,wcn6750-pmu + then: + required: + - vddaon-supply + - vddasd-supply + - vddpmu-supply + - vddrfa0p8-supply + - vddrfa1p2-supply + - vddrfa1p7-supply + - vddrfa2p2-supply - if: properties: compatible: From f07ae52f5cf6a5584fdf7c8c652f027d90bc8b74 Mon Sep 17 00:00:00 2001 From: Philippe Simons Date: Sun, 8 Dec 2024 13:43:08 +0100 Subject: [PATCH 075/330] regulator: axp20x: AXP717: set ramp_delay AXP717 datasheet says that regulator ramp delay is 15.625 us/step, which is 10mV in our case. Add a AXP_DESC_RANGES_DELAY macro and update AXP_DESC_RANGES macro to expand to AXP_DESC_RANGES_DELAY with ramp_delay = 0 For DCDC4, steps is 100mv Add a AXP_DESC_DELAY macro and update AXP_DESC macro to expand to AXP_DESC_DELAY with ramp_delay = 0 This patch fix crashes when using CPU DVFS. Signed-off-by: Philippe Simons Tested-by: Hironori KIKUCHI Tested-by: Chris Morgan Reviewed-by: Chen-Yu Tsai Fixes: d2ac3df75c3a ("regulator: axp20x: add support for the AXP717") Link: https://patch.msgid.link/20241208124308.5630-1-simons.philippe@gmail.com Signed-off-by: Mark Brown --- drivers/regulator/axp20x-regulator.c | 36 ++++++++++++++++++---------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/drivers/regulator/axp20x-regulator.c b/drivers/regulator/axp20x-regulator.c index e3cc59b82ea61..dca99cfb7cbb8 100644 --- a/drivers/regulator/axp20x-regulator.c +++ b/drivers/regulator/axp20x-regulator.c @@ -371,8 +371,8 @@ .ops = &axp20x_ops, \ } -#define AXP_DESC(_family, _id, _match, _supply, _min, _max, _step, _vreg, \ - _vmask, _ereg, _emask) \ +#define AXP_DESC_DELAY(_family, _id, _match, _supply, _min, _max, _step, _vreg, \ + _vmask, _ereg, _emask, _ramp_delay) \ [_family##_##_id] = { \ .name = (_match), \ .supply_name = (_supply), \ @@ -388,9 +388,15 @@ .vsel_mask = (_vmask), \ .enable_reg = (_ereg), \ .enable_mask = (_emask), \ + .ramp_delay = (_ramp_delay), \ .ops = &axp20x_ops, \ } +#define AXP_DESC(_family, _id, _match, _supply, _min, _max, _step, _vreg, \ + _vmask, _ereg, _emask) \ + AXP_DESC_DELAY(_family, _id, _match, _supply, _min, _max, _step, _vreg, \ + _vmask, _ereg, _emask, 0) + #define AXP_DESC_SW(_family, _id, _match, _supply, _ereg, _emask) \ [_family##_##_id] = { \ .name = (_match), \ @@ -419,8 +425,8 @@ .ops = &axp20x_ops_fixed \ } -#define AXP_DESC_RANGES(_family, _id, _match, _supply, _ranges, _n_voltages, \ - _vreg, _vmask, _ereg, _emask) \ +#define AXP_DESC_RANGES_DELAY(_family, _id, _match, _supply, _ranges, _n_voltages, \ + _vreg, _vmask, _ereg, _emask, _ramp_delay) \ [_family##_##_id] = { \ .name = (_match), \ .supply_name = (_supply), \ @@ -436,9 +442,15 @@ .enable_mask = (_emask), \ .linear_ranges = (_ranges), \ .n_linear_ranges = ARRAY_SIZE(_ranges), \ + .ramp_delay = (_ramp_delay), \ .ops = &axp20x_ops_range, \ } +#define AXP_DESC_RANGES(_family, _id, _match, _supply, _ranges, _n_voltages, \ + _vreg, _vmask, _ereg, _emask) \ + AXP_DESC_RANGES_DELAY(_family, _id, _match, _supply, _ranges, \ + _n_voltages, _vreg, _vmask, _ereg, _emask, 0) + static const int axp209_dcdc2_ldo3_slew_rates[] = { 1600, 800, @@ -781,21 +793,21 @@ static const struct linear_range axp717_dcdc3_ranges[] = { }; static const struct regulator_desc axp717_regulators[] = { - AXP_DESC_RANGES(AXP717, DCDC1, "dcdc1", "vin1", + AXP_DESC_RANGES_DELAY(AXP717, DCDC1, "dcdc1", "vin1", axp717_dcdc1_ranges, AXP717_DCDC1_NUM_VOLTAGES, AXP717_DCDC1_CONTROL, AXP717_DCDC_V_OUT_MASK, - AXP717_DCDC_OUTPUT_CONTROL, BIT(0)), - AXP_DESC_RANGES(AXP717, DCDC2, "dcdc2", "vin2", + AXP717_DCDC_OUTPUT_CONTROL, BIT(0), 640), + AXP_DESC_RANGES_DELAY(AXP717, DCDC2, "dcdc2", "vin2", axp717_dcdc2_ranges, AXP717_DCDC2_NUM_VOLTAGES, AXP717_DCDC2_CONTROL, AXP717_DCDC_V_OUT_MASK, - AXP717_DCDC_OUTPUT_CONTROL, BIT(1)), - AXP_DESC_RANGES(AXP717, DCDC3, "dcdc3", "vin3", + AXP717_DCDC_OUTPUT_CONTROL, BIT(1), 640), + AXP_DESC_RANGES_DELAY(AXP717, DCDC3, "dcdc3", "vin3", axp717_dcdc3_ranges, AXP717_DCDC3_NUM_VOLTAGES, AXP717_DCDC3_CONTROL, AXP717_DCDC_V_OUT_MASK, - AXP717_DCDC_OUTPUT_CONTROL, BIT(2)), - AXP_DESC(AXP717, DCDC4, "dcdc4", "vin4", 1000, 3700, 100, + AXP717_DCDC_OUTPUT_CONTROL, BIT(2), 640), + AXP_DESC_DELAY(AXP717, DCDC4, "dcdc4", "vin4", 1000, 3700, 100, AXP717_DCDC4_CONTROL, AXP717_DCDC_V_OUT_MASK, - AXP717_DCDC_OUTPUT_CONTROL, BIT(3)), + AXP717_DCDC_OUTPUT_CONTROL, BIT(3), 6400), AXP_DESC(AXP717, ALDO1, "aldo1", "aldoin", 500, 3500, 100, AXP717_ALDO1_CONTROL, AXP717_LDO_V_OUT_MASK, AXP717_LDO0_OUTPUT_CONTROL, BIT(0)), From c84dda3751e945a67d71cbe3af4474aad24a5794 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Tue, 19 Nov 2024 22:30:29 +0100 Subject: [PATCH 076/330] spi: aspeed: Fix an error handling path in aspeed_spi_[read|write]_user() A aspeed_spi_start_user() is not balanced by a corresponding aspeed_spi_stop_user(). Add the missing call. Fixes: e3228ed92893 ("spi: spi-mem: Convert Aspeed SMC driver to spi-mem") Signed-off-by: Christophe JAILLET Link: https://patch.msgid.link/4052aa2f9a9ea342fa6af83fa991b55ce5d5819e.1732051814.git.christophe.jaillet@wanadoo.fr Signed-off-by: Mark Brown --- drivers/spi/spi-aspeed-smc.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/spi/spi-aspeed-smc.c b/drivers/spi/spi-aspeed-smc.c index 8eb843ddb25f2..e9beae95ddeda 100644 --- a/drivers/spi/spi-aspeed-smc.c +++ b/drivers/spi/spi-aspeed-smc.c @@ -239,7 +239,7 @@ static ssize_t aspeed_spi_read_user(struct aspeed_spi_chip *chip, ret = aspeed_spi_send_cmd_addr(chip, op->addr.nbytes, offset, op->cmd.opcode); if (ret < 0) - return ret; + goto stop_user; if (op->dummy.buswidth && op->dummy.nbytes) { for (i = 0; i < op->dummy.nbytes / op->dummy.buswidth; i++) @@ -249,8 +249,9 @@ static ssize_t aspeed_spi_read_user(struct aspeed_spi_chip *chip, aspeed_spi_set_io_mode(chip, io_mode); aspeed_spi_read_from_ahb(buf, chip->ahb_base, len); +stop_user: aspeed_spi_stop_user(chip); - return 0; + return ret; } static ssize_t aspeed_spi_write_user(struct aspeed_spi_chip *chip, @@ -261,10 +262,11 @@ static ssize_t aspeed_spi_write_user(struct aspeed_spi_chip *chip, aspeed_spi_start_user(chip); ret = aspeed_spi_send_cmd_addr(chip, op->addr.nbytes, op->addr.val, op->cmd.opcode); if (ret < 0) - return ret; + goto stop_user; aspeed_spi_write_to_ahb(chip->ahb_base, op->data.buf.out, op->data.nbytes); +stop_user: aspeed_spi_stop_user(chip); - return 0; + return ret; } /* support for 1-1-1, 1-1-2 or 1-1-4 */ From 0bb394067a792e7119abc9e0b7158ef19381f456 Mon Sep 17 00:00:00 2001 From: Christian Loehle Date: Fri, 6 Dec 2024 19:50:55 +0000 Subject: [PATCH 077/330] spi: rockchip: Fix PM runtime count on no-op cs The early bail out that caused an out-of-bounds write was removed with commit 5c018e378f91 ("spi: spi-rockchip: Fix out of bounds array access") Unfortunately that caused the PM runtime count to be unbalanced and underflowed on the first call. To fix that reintroduce a no-op check by reading the register directly. Cc: stable@vger.kernel.org Fixes: 5c018e378f91 ("spi: spi-rockchip: Fix out of bounds array access") Signed-off-by: Christian Loehle Link: https://patch.msgid.link/1f2b3af4-2b7a-4ac8-ab95-c80120ebf44c@arm.com Signed-off-by: Mark Brown --- drivers/spi/spi-rockchip.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/spi/spi-rockchip.c b/drivers/spi/spi-rockchip.c index 864e581674173..1bc012fce7cb8 100644 --- a/drivers/spi/spi-rockchip.c +++ b/drivers/spi/spi-rockchip.c @@ -241,6 +241,20 @@ static void rockchip_spi_set_cs(struct spi_device *spi, bool enable) struct spi_controller *ctlr = spi->controller; struct rockchip_spi *rs = spi_controller_get_devdata(ctlr); bool cs_asserted = spi->mode & SPI_CS_HIGH ? enable : !enable; + bool cs_actual; + + /* + * SPI subsystem tries to avoid no-op calls that would break the PM + * refcount below. It can't however for the first time it is used. + * To detect this case we read it here and bail out early for no-ops. + */ + if (spi_get_csgpiod(spi, 0)) + cs_actual = !!(readl_relaxed(rs->regs + ROCKCHIP_SPI_SER) & 1); + else + cs_actual = !!(readl_relaxed(rs->regs + ROCKCHIP_SPI_SER) & + BIT(spi_get_chipselect(spi, 0))); + if (unlikely(cs_actual == cs_asserted)) + return; if (cs_asserted) { /* Keep things powered as long as CS is asserted */ From 25fb0e77b90e290a1ca30900d54c6a495eea65e2 Mon Sep 17 00:00:00 2001 From: Niravkumar L Rabara Date: Wed, 4 Dec 2024 14:33:38 +0800 Subject: [PATCH 078/330] spi: spi-cadence-qspi: Disable STIG mode for Altera SoCFPGA. STIG mode is enabled by default for less than 8 bytes data read/write. STIG mode doesn't work with Altera SocFPGA platform due hardware limitation. Add a quirks to disable STIG mode for Altera SoCFPGA platform. Signed-off-by: Niravkumar L Rabara Link: https://patch.msgid.link/20241204063338.296959-1-niravkumar.l.rabara@intel.com Signed-off-by: Mark Brown --- drivers/spi/spi-cadence-quadspi.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/spi/spi-cadence-quadspi.c b/drivers/spi/spi-cadence-quadspi.c index 0b45b7b2b3ab3..a031ecb358e08 100644 --- a/drivers/spi/spi-cadence-quadspi.c +++ b/drivers/spi/spi-cadence-quadspi.c @@ -43,6 +43,7 @@ static_assert(CQSPI_MAX_CHIPSELECT <= SPI_CS_CNT_MAX); #define CQSPI_SLOW_SRAM BIT(4) #define CQSPI_NEEDS_APB_AHB_HAZARD_WAR BIT(5) #define CQSPI_RD_NO_IRQ BIT(6) +#define CQSPI_DISABLE_STIG_MODE BIT(7) /* Capabilities */ #define CQSPI_SUPPORTS_OCTAL BIT(0) @@ -103,6 +104,7 @@ struct cqspi_st { bool apb_ahb_hazard; bool is_jh7110; /* Flag for StarFive JH7110 SoC */ + bool disable_stig_mode; const struct cqspi_driver_platdata *ddata; }; @@ -1416,7 +1418,8 @@ static int cqspi_mem_process(struct spi_mem *mem, const struct spi_mem_op *op) * reads, prefer STIG mode for such small reads. */ if (!op->addr.nbytes || - op->data.nbytes <= CQSPI_STIG_DATA_LEN_MAX) + (op->data.nbytes <= CQSPI_STIG_DATA_LEN_MAX && + !cqspi->disable_stig_mode)) return cqspi_command_read(f_pdata, op); return cqspi_read(f_pdata, op); @@ -1880,6 +1883,8 @@ static int cqspi_probe(struct platform_device *pdev) if (ret) goto probe_reset_failed; } + if (ddata->quirks & CQSPI_DISABLE_STIG_MODE) + cqspi->disable_stig_mode = true; if (of_device_is_compatible(pdev->dev.of_node, "xlnx,versal-ospi-1.0")) { @@ -2043,7 +2048,8 @@ static const struct cqspi_driver_platdata intel_lgm_qspi = { static const struct cqspi_driver_platdata socfpga_qspi = { .quirks = CQSPI_DISABLE_DAC_MODE | CQSPI_NO_SUPPORT_WR_COMPLETION - | CQSPI_SLOW_SRAM, + | CQSPI_SLOW_SRAM + | CQSPI_DISABLE_STIG_MODE, }; static const struct cqspi_driver_platdata versal_ospi = { From cb1b78f1c726c938bd47497c1ab16b01ce967f37 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Tue, 10 Sep 2024 00:44:32 +0000 Subject: [PATCH 079/330] tools: hv: Fix a complier warning in the fcopy uio daemon hv_fcopy_uio_daemon.c:436:53: warning: '%s' directive output may be truncated writing up to 14 bytes into a region of size 10 [-Wformat-truncation=] 436 | snprintf(uio_dev_path, sizeof(uio_dev_path), "/dev/%s", uio_name); Also added 'static' for the array 'desc[]'. Fixes: 82b0945ce2c2 ("tools: hv: Add new fcopy application based on uio driver") Cc: stable@vger.kernel.org # 6.10+ Signed-off-by: Dexuan Cui Reviewed-by: Saurabh Sengar Link: https://lore.kernel.org/r/20240910004433.50254-1-decui@microsoft.com Signed-off-by: Wei Liu Message-ID: <20240910004433.50254-1-decui@microsoft.com> --- tools/hv/hv_fcopy_uio_daemon.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tools/hv/hv_fcopy_uio_daemon.c b/tools/hv/hv_fcopy_uio_daemon.c index 7a00f3066a980..12743d7f164f0 100644 --- a/tools/hv/hv_fcopy_uio_daemon.c +++ b/tools/hv/hv_fcopy_uio_daemon.c @@ -35,8 +35,6 @@ #define WIN8_SRV_MINOR 1 #define WIN8_SRV_VERSION (WIN8_SRV_MAJOR << 16 | WIN8_SRV_MINOR) -#define MAX_FOLDER_NAME 15 -#define MAX_PATH_LEN 15 #define FCOPY_UIO "/sys/bus/vmbus/devices/eb765408-105f-49b6-b4aa-c123b64d17d4/uio" #define FCOPY_VER_COUNT 1 @@ -51,7 +49,7 @@ static const int fw_versions[] = { #define HV_RING_SIZE 0x4000 /* 16KB ring buffer size */ -unsigned char desc[HV_RING_SIZE]; +static unsigned char desc[HV_RING_SIZE]; static int target_fd; static char target_fname[PATH_MAX]; @@ -409,8 +407,8 @@ int main(int argc, char *argv[]) struct vmbus_br txbr, rxbr; void *ring; uint32_t len = HV_RING_SIZE; - char uio_name[MAX_FOLDER_NAME] = {0}; - char uio_dev_path[MAX_PATH_LEN] = {0}; + char uio_name[NAME_MAX] = {0}; + char uio_dev_path[PATH_MAX] = {0}; static struct option long_options[] = { {"help", no_argument, 0, 'h' }, From bcc80dec91ee745b3d66f3e48f0ec2efdea97149 Mon Sep 17 00:00:00 2001 From: Naman Jain Date: Tue, 17 Sep 2024 11:09:17 +0530 Subject: [PATCH 080/330] x86/hyperv: Fix hv tsc page based sched_clock for hibernation read_hv_sched_clock_tsc() assumes that the Hyper-V clock counter is bigger than the variable hv_sched_clock_offset, which is cached during early boot, but depending on the timing this assumption may be false when a hibernated VM starts again (the clock counter starts from 0 again) and is resuming back (Note: hv_init_tsc_clocksource() is not called during hibernation/resume); consequently, read_hv_sched_clock_tsc() may return a negative integer (which is interpreted as a huge positive integer since the return type is u64) and new kernel messages are prefixed with huge timestamps before read_hv_sched_clock_tsc() grows big enough (which typically takes several seconds). Fix the issue by saving the Hyper-V clock counter just before the suspend, and using it to correct the hv_sched_clock_offset in resume. This makes hv tsc page based sched_clock continuous and ensures that post resume, it starts from where it left off during suspend. Override x86_platform.save_sched_clock_state and x86_platform.restore_sched_clock_state routines to correct this as soon as possible. Note: if Invariant TSC is available, the issue doesn't happen because 1) we don't register read_hv_sched_clock_tsc() for sched clock: See commit e5313f1c5404 ("clocksource/drivers/hyper-v: Rework clocksource and sched clock setup"); 2) the common x86 code adjusts TSC similarly: see __restore_processor_state() -> tsc_verify_tsc_adjust(true) and x86_platform.restore_sched_clock_state(). Cc: stable@vger.kernel.org Fixes: 1349401ff1aa ("clocksource/drivers/hyper-v: Suspend/resume Hyper-V clocksource for hibernation") Co-developed-by: Dexuan Cui Signed-off-by: Dexuan Cui Signed-off-by: Naman Jain Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20240917053917.76787-1-namjain@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <20240917053917.76787-1-namjain@linux.microsoft.com> --- arch/x86/kernel/cpu/mshyperv.c | 58 ++++++++++++++++++++++++++++++ drivers/clocksource/hyperv_timer.c | 14 +++++++- include/clocksource/hyperv_timer.h | 2 ++ 3 files changed, 73 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index d18078834deda..dc12fe5ef3caa 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -223,6 +223,63 @@ static void hv_machine_crash_shutdown(struct pt_regs *regs) hyperv_cleanup(); } #endif /* CONFIG_CRASH_DUMP */ + +static u64 hv_ref_counter_at_suspend; +static void (*old_save_sched_clock_state)(void); +static void (*old_restore_sched_clock_state)(void); + +/* + * Hyper-V clock counter resets during hibernation. Save and restore clock + * offset during suspend/resume, while also considering the time passed + * before suspend. This is to make sure that sched_clock using hv tsc page + * based clocksource, proceeds from where it left off during suspend and + * it shows correct time for the timestamps of kernel messages after resume. + */ +static void save_hv_clock_tsc_state(void) +{ + hv_ref_counter_at_suspend = hv_read_reference_counter(); +} + +static void restore_hv_clock_tsc_state(void) +{ + /* + * Adjust the offsets used by hv tsc clocksource to + * account for the time spent before hibernation. + * adjusted value = reference counter (time) at suspend + * - reference counter (time) now. + */ + hv_adj_sched_clock_offset(hv_ref_counter_at_suspend - hv_read_reference_counter()); +} + +/* + * Functions to override save_sched_clock_state and restore_sched_clock_state + * functions of x86_platform. The Hyper-V clock counter is reset during + * suspend-resume and the offset used to measure time needs to be + * corrected, post resume. + */ +static void hv_save_sched_clock_state(void) +{ + old_save_sched_clock_state(); + save_hv_clock_tsc_state(); +} + +static void hv_restore_sched_clock_state(void) +{ + restore_hv_clock_tsc_state(); + old_restore_sched_clock_state(); +} + +static void __init x86_setup_ops_for_tsc_pg_clock(void) +{ + if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE)) + return; + + old_save_sched_clock_state = x86_platform.save_sched_clock_state; + x86_platform.save_sched_clock_state = hv_save_sched_clock_state; + + old_restore_sched_clock_state = x86_platform.restore_sched_clock_state; + x86_platform.restore_sched_clock_state = hv_restore_sched_clock_state; +} #endif /* CONFIG_HYPERV */ static uint32_t __init ms_hyperv_platform(void) @@ -579,6 +636,7 @@ static void __init ms_hyperv_init_platform(void) /* Register Hyper-V specific clocksource */ hv_init_clocksource(); + x86_setup_ops_for_tsc_pg_clock(); hv_vtl_init_platform(); #endif /* diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c index 99177835cadec..b39dee7b93af0 100644 --- a/drivers/clocksource/hyperv_timer.c +++ b/drivers/clocksource/hyperv_timer.c @@ -27,7 +27,8 @@ #include static struct clock_event_device __percpu *hv_clock_event; -static u64 hv_sched_clock_offset __ro_after_init; +/* Note: offset can hold negative values after hibernation. */ +static u64 hv_sched_clock_offset __read_mostly; /* * If false, we're using the old mechanism for stimer0 interrupts @@ -470,6 +471,17 @@ static void resume_hv_clock_tsc(struct clocksource *arg) hv_set_msr(HV_MSR_REFERENCE_TSC, tsc_msr.as_uint64); } +/* + * Called during resume from hibernation, from overridden + * x86_platform.restore_sched_clock_state routine. This is to adjust offsets + * used to calculate time for hv tsc page based sched_clock, to account for + * time spent before hibernation. + */ +void hv_adj_sched_clock_offset(u64 offset) +{ + hv_sched_clock_offset -= offset; +} + #ifdef HAVE_VDSO_CLOCKMODE_HVCLOCK static int hv_cs_enable(struct clocksource *cs) { diff --git a/include/clocksource/hyperv_timer.h b/include/clocksource/hyperv_timer.h index 6cdc873ac907f..aa5233b1eba97 100644 --- a/include/clocksource/hyperv_timer.h +++ b/include/clocksource/hyperv_timer.h @@ -38,6 +38,8 @@ extern void hv_remap_tsc_clocksource(void); extern unsigned long hv_get_tsc_pfn(void); extern struct ms_hyperv_tsc_page *hv_get_tsc_page(void); +extern void hv_adj_sched_clock_offset(u64 offset); + static __always_inline bool hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, u64 *cur_tsc, u64 *time) From 91ae69c7ed9e262f24240c425ad1eef2cf6639b7 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Wed, 16 Oct 2024 16:35:10 +0200 Subject: [PATCH 081/330] tools: hv: change permissions of NetworkManager configuration file Align permissions of the resulting .nmconnection file, instead of the input file from hv_kvp_daemon. To avoid the tiny time frame where the output file is world-readable, use umask instead of chmod. Fixes: 42999c904612 ("hv/hv_kvp_daemon:Support for keyfile based connection profile") Signed-off-by: Olaf Hering Reviewed-by: Shradha Gupta Link: https://lore.kernel.org/r/20241016143521.3735-1-olaf@aepfle.de Signed-off-by: Wei Liu Message-ID: <20241016143521.3735-1-olaf@aepfle.de> --- tools/hv/hv_set_ifconfig.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/hv/hv_set_ifconfig.sh b/tools/hv/hv_set_ifconfig.sh index 440a91b35823b..2f8baed2b8f79 100755 --- a/tools/hv/hv_set_ifconfig.sh +++ b/tools/hv/hv_set_ifconfig.sh @@ -81,7 +81,7 @@ echo "ONBOOT=yes" >> $1 cp $1 /etc/sysconfig/network-scripts/ -chmod 600 $2 +umask 0177 interface=$(echo $2 | awk -F - '{ print $2 }') filename="${2##*/}" From 67b5e1042d90d8a9814f22312c1147b4c9cd501a Mon Sep 17 00:00:00 2001 From: Easwar Hariharan Date: Wed, 30 Oct 2024 17:47:36 +0000 Subject: [PATCH 082/330] drivers: hv: Convert open-coded timeouts to secs_to_jiffies() We have several places where timeouts are open-coded as N (seconds) * HZ, but best practice is to use the utility functions from jiffies.h. Convert the timeouts to be compliant. This doesn't fix any bugs, it's a simple code improvement. Signed-off-by: Easwar Hariharan Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20241030-open-coded-timeouts-v3-2-9ba123facf88@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <20241030-open-coded-timeouts-v3-2-9ba123facf88@linux.microsoft.com> --- drivers/hv/hv_balloon.c | 9 +++++---- drivers/hv/hv_kvp.c | 4 ++-- drivers/hv/hv_snapshot.c | 3 ++- drivers/hv/vmbus_drv.c | 2 +- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index c38dcdfcb914d..a99112e6f0b85 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -756,7 +756,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size, * adding succeeded, it is ok to proceed even if the memory was * not onlined in time. */ - wait_for_completion_timeout(&dm_device.ol_waitevent, 5 * HZ); + wait_for_completion_timeout(&dm_device.ol_waitevent, secs_to_jiffies(5)); post_status(&dm_device); } } @@ -1373,7 +1373,8 @@ static int dm_thread_func(void *dm_dev) struct hv_dynmem_device *dm = dm_dev; while (!kthread_should_stop()) { - wait_for_completion_interruptible_timeout(&dm_device.config_event, 1 * HZ); + wait_for_completion_interruptible_timeout(&dm_device.config_event, + secs_to_jiffies(1)); /* * The host expects us to post information on the memory * pressure every second. @@ -1748,7 +1749,7 @@ static int balloon_connect_vsp(struct hv_device *dev) if (ret) goto out; - t = wait_for_completion_timeout(&dm_device.host_event, 5 * HZ); + t = wait_for_completion_timeout(&dm_device.host_event, secs_to_jiffies(5)); if (t == 0) { ret = -ETIMEDOUT; goto out; @@ -1806,7 +1807,7 @@ static int balloon_connect_vsp(struct hv_device *dev) if (ret) goto out; - t = wait_for_completion_timeout(&dm_device.host_event, 5 * HZ); + t = wait_for_completion_timeout(&dm_device.host_event, secs_to_jiffies(5)); if (t == 0) { ret = -ETIMEDOUT; goto out; diff --git a/drivers/hv/hv_kvp.c b/drivers/hv/hv_kvp.c index d35b60c061148..29e01247a0870 100644 --- a/drivers/hv/hv_kvp.c +++ b/drivers/hv/hv_kvp.c @@ -655,7 +655,7 @@ void hv_kvp_onchannelcallback(void *context) if (host_negotiatied == NEGO_NOT_STARTED) { host_negotiatied = NEGO_IN_PROGRESS; schedule_delayed_work(&kvp_host_handshake_work, - HV_UTIL_NEGO_TIMEOUT * HZ); + secs_to_jiffies(HV_UTIL_NEGO_TIMEOUT)); } return; } @@ -724,7 +724,7 @@ void hv_kvp_onchannelcallback(void *context) */ schedule_work(&kvp_sendkey_work); schedule_delayed_work(&kvp_timeout_work, - HV_UTIL_TIMEOUT * HZ); + secs_to_jiffies(HV_UTIL_TIMEOUT)); return; diff --git a/drivers/hv/hv_snapshot.c b/drivers/hv/hv_snapshot.c index 0d2184be16912..86d87486ed40b 100644 --- a/drivers/hv/hv_snapshot.c +++ b/drivers/hv/hv_snapshot.c @@ -193,7 +193,8 @@ static void vss_send_op(void) vss_transaction.state = HVUTIL_USERSPACE_REQ; schedule_delayed_work(&vss_timeout_work, op == VSS_OP_FREEZE ? - VSS_FREEZE_TIMEOUT * HZ : HV_UTIL_TIMEOUT * HZ); + secs_to_jiffies(VSS_FREEZE_TIMEOUT) : + secs_to_jiffies(HV_UTIL_TIMEOUT)); rc = hvutil_transport_send(hvt, vss_msg, sizeof(*vss_msg), NULL); if (rc) { diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 6d89d37b069ad..2892b8da20a5e 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -2507,7 +2507,7 @@ static int vmbus_bus_resume(struct device *dev) vmbus_request_offers(); if (wait_for_completion_timeout( - &vmbus_connection.ready_for_resume_event, 10 * HZ) == 0) + &vmbus_connection.ready_for_resume_event, secs_to_jiffies(10)) == 0) pr_err("Some vmbus device is missing after suspending?\n"); /* Reset the event for the next suspend. */ From a9640fcdd400463442846677e62b8208b81cb031 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Tue, 5 Nov 2024 09:14:04 +0100 Subject: [PATCH 083/330] tools/hv: terminate fcopy daemon if read from uio fails Terminate endless loop in reading fails, to avoid flooding syslog. This happens if the state of "Guest services" integration service is changed from "enabled" to "disabled" at runtime in the VM settings. In this case pread returns EIO. Also handle an interrupted system call, and continue in this case. Signed-off-by: Olaf Hering Reviewed-by: Saurabh Sengar Link: https://lore.kernel.org/r/20241105081437.15689-1-olaf@aepfle.de Signed-off-by: Wei Liu Message-ID: <20241105081437.15689-1-olaf@aepfle.de> --- tools/hv/hv_fcopy_uio_daemon.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/hv/hv_fcopy_uio_daemon.c b/tools/hv/hv_fcopy_uio_daemon.c index 12743d7f164f0..0198321d14a29 100644 --- a/tools/hv/hv_fcopy_uio_daemon.c +++ b/tools/hv/hv_fcopy_uio_daemon.c @@ -466,8 +466,10 @@ int main(int argc, char *argv[]) */ ret = pread(fcopy_fd, &tmp, sizeof(int), 0); if (ret < 0) { + if (errno == EINTR || errno == EAGAIN) + continue; syslog(LOG_ERR, "pread failed: %s", strerror(errno)); - continue; + goto close; } len = HV_RING_SIZE; From 96e052d1473843d644ceba2adf46d3d2180b8ca7 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Wed, 6 Nov 2024 07:42:46 -0800 Subject: [PATCH 084/330] Drivers: hv: util: Don't force error code to ENODEV in util_probe() If the util_init function call in util_probe() returns an error code, util_probe() always return ENODEV, and the error code from the util_init function is lost. The error message output in the caller, vmbus_probe(), doesn't show the real error code. Fix this by just returning the error code from the util_init function. There doesn't seem to be a reason to force ENODEV, as other errors such as ENOMEM can already be returned from util_probe(). And the code in call_driver_probe() implies that ENODEV should mean that a matching driver wasn't found, which is not the case here. Suggested-by: Dexuan Cui Signed-off-by: Michael Kelley Acked-by: Wei Liu Link: https://lore.kernel.org/r/20241106154247.2271-2-mhklinux@outlook.com Signed-off-by: Wei Liu Message-ID: <20241106154247.2271-2-mhklinux@outlook.com> --- drivers/hv/hv_util.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/hv/hv_util.c b/drivers/hv/hv_util.c index c4f525325790f..3707222201344 100644 --- a/drivers/hv/hv_util.c +++ b/drivers/hv/hv_util.c @@ -590,10 +590,8 @@ static int util_probe(struct hv_device *dev, srv->channel = dev->channel; if (srv->util_init) { ret = srv->util_init(srv); - if (ret) { - ret = -ENODEV; + if (ret) goto error1; - } } /* From 07a756a49f4b4290b49ea46e089cbe6f79ff8d26 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Wed, 6 Nov 2024 07:42:47 -0800 Subject: [PATCH 085/330] Drivers: hv: util: Avoid accessing a ringbuffer not initialized yet If the KVP (or VSS) daemon starts before the VMBus channel's ringbuffer is fully initialized, we can hit the panic below: hv_utils: Registering HyperV Utility Driver hv_vmbus: registering driver hv_utils ... BUG: kernel NULL pointer dereference, address: 0000000000000000 CPU: 44 UID: 0 PID: 2552 Comm: hv_kvp_daemon Tainted: G E 6.11.0-rc3+ #1 RIP: 0010:hv_pkt_iter_first+0x12/0xd0 Call Trace: ... vmbus_recvpacket hv_kvp_onchannelcallback vmbus_on_event tasklet_action_common tasklet_action handle_softirqs irq_exit_rcu sysvec_hyperv_stimer0 asm_sysvec_hyperv_stimer0 ... kvp_register_done hvt_op_read vfs_read ksys_read __x64_sys_read This can happen because the KVP/VSS channel callback can be invoked even before the channel is fully opened: 1) as soon as hv_kvp_init() -> hvutil_transport_init() creates /dev/vmbus/hv_kvp, the kvp daemon can open the device file immediately and register itself to the driver by writing a message KVP_OP_REGISTER1 to the file (which is handled by kvp_on_msg() ->kvp_handle_handshake()) and reading the file for the driver's response, which is handled by hvt_op_read(), which calls hvt->on_read(), i.e. kvp_register_done(). 2) the problem with kvp_register_done() is that it can cause the channel callback to be called even before the channel is fully opened, and when the channel callback is starting to run, util_probe()-> vmbus_open() may have not initialized the ringbuffer yet, so the callback can hit the panic of NULL pointer dereference. To reproduce the panic consistently, we can add a "ssleep(10)" for KVP in __vmbus_open(), just before the first hv_ringbuffer_init(), and then we unload and reload the driver hv_utils, and run the daemon manually within the 10 seconds. Fix the panic by reordering the steps in util_probe() so the char dev entry used by the KVP or VSS daemon is not created until after vmbus_open() has completed. This reordering prevents the race condition from happening. Reported-by: Dexuan Cui Fixes: e0fa3e5e7df6 ("Drivers: hv: utils: fix a race on userspace daemons registration") Cc: stable@vger.kernel.org Signed-off-by: Michael Kelley Acked-by: Wei Liu Link: https://lore.kernel.org/r/20241106154247.2271-3-mhklinux@outlook.com Signed-off-by: Wei Liu Message-ID: <20241106154247.2271-3-mhklinux@outlook.com> --- drivers/hv/hv_kvp.c | 6 ++++++ drivers/hv/hv_snapshot.c | 6 ++++++ drivers/hv/hv_util.c | 9 +++++++++ drivers/hv/hyperv_vmbus.h | 2 ++ include/linux/hyperv.h | 1 + 5 files changed, 24 insertions(+) diff --git a/drivers/hv/hv_kvp.c b/drivers/hv/hv_kvp.c index 29e01247a0870..7400a5a4d2bd7 100644 --- a/drivers/hv/hv_kvp.c +++ b/drivers/hv/hv_kvp.c @@ -767,6 +767,12 @@ hv_kvp_init(struct hv_util_service *srv) */ kvp_transaction.state = HVUTIL_DEVICE_INIT; + return 0; +} + +int +hv_kvp_init_transport(void) +{ hvt = hvutil_transport_init(kvp_devname, CN_KVP_IDX, CN_KVP_VAL, kvp_on_msg, kvp_on_reset); if (!hvt) diff --git a/drivers/hv/hv_snapshot.c b/drivers/hv/hv_snapshot.c index 86d87486ed40b..bde637a96c379 100644 --- a/drivers/hv/hv_snapshot.c +++ b/drivers/hv/hv_snapshot.c @@ -389,6 +389,12 @@ hv_vss_init(struct hv_util_service *srv) */ vss_transaction.state = HVUTIL_DEVICE_INIT; + return 0; +} + +int +hv_vss_init_transport(void) +{ hvt = hvutil_transport_init(vss_devname, CN_VSS_IDX, CN_VSS_VAL, vss_on_msg, vss_on_reset); if (!hvt) { diff --git a/drivers/hv/hv_util.c b/drivers/hv/hv_util.c index 3707222201344..36ee89c0358bd 100644 --- a/drivers/hv/hv_util.c +++ b/drivers/hv/hv_util.c @@ -141,6 +141,7 @@ static struct hv_util_service util_heartbeat = { static struct hv_util_service util_kvp = { .util_cb = hv_kvp_onchannelcallback, .util_init = hv_kvp_init, + .util_init_transport = hv_kvp_init_transport, .util_pre_suspend = hv_kvp_pre_suspend, .util_pre_resume = hv_kvp_pre_resume, .util_deinit = hv_kvp_deinit, @@ -149,6 +150,7 @@ static struct hv_util_service util_kvp = { static struct hv_util_service util_vss = { .util_cb = hv_vss_onchannelcallback, .util_init = hv_vss_init, + .util_init_transport = hv_vss_init_transport, .util_pre_suspend = hv_vss_pre_suspend, .util_pre_resume = hv_vss_pre_resume, .util_deinit = hv_vss_deinit, @@ -611,6 +613,13 @@ static int util_probe(struct hv_device *dev, if (ret) goto error; + if (srv->util_init_transport) { + ret = srv->util_init_transport(); + if (ret) { + vmbus_close(dev->channel); + goto error; + } + } return 0; error: diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index d2856023d53c9..52cb744b4d7fd 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -370,12 +370,14 @@ void vmbus_on_event(unsigned long data); void vmbus_on_msg_dpc(unsigned long data); int hv_kvp_init(struct hv_util_service *srv); +int hv_kvp_init_transport(void); void hv_kvp_deinit(void); int hv_kvp_pre_suspend(void); int hv_kvp_pre_resume(void); void hv_kvp_onchannelcallback(void *context); int hv_vss_init(struct hv_util_service *srv); +int hv_vss_init_transport(void); void hv_vss_deinit(void); int hv_vss_pre_suspend(void); int hv_vss_pre_resume(void); diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 22c22fb910421..02a226bcf0edc 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1559,6 +1559,7 @@ struct hv_util_service { void *channel; void (*util_cb)(void *); int (*util_init)(struct hv_util_service *); + int (*util_init_transport)(void); void (*util_deinit)(void); int (*util_pre_suspend)(void); int (*util_pre_resume)(void); From 07dfa6e821e1c58cbd0f195173dddbd593721f9b Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 12 Nov 2024 16:04:01 +0100 Subject: [PATCH 086/330] hv/hv_kvp_daemon: Pass NIC name to hv_get_dns_info as well The reference implementation of hv_get_dns_info which is in the tree uses /etc/resolv.conf to get DNS servers and this does not require to know which NIC is queried. Distro specific implementations, however, may want to provide per-NIC, fine grained information. E.g. NetworkManager keeps track of DNS servers per connection. Similar to hv_get_dhcp_info, pass NIC name as a parameter to hv_get_dns_info script. Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20241112150401.217094-1-vkuznets@redhat.com Signed-off-by: Wei Liu Message-ID: <20241112150401.217094-1-vkuznets@redhat.com> --- tools/hv/hv_kvp_daemon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/hv/hv_kvp_daemon.c b/tools/hv/hv_kvp_daemon.c index ae57bf69ad4af..296a7a62c54d3 100644 --- a/tools/hv/hv_kvp_daemon.c +++ b/tools/hv/hv_kvp_daemon.c @@ -725,7 +725,7 @@ static void kvp_get_ipconfig_info(char *if_name, * . */ - sprintf(cmd, KVP_SCRIPTS_PATH "%s", "hv_get_dns_info"); + sprintf(cmd, KVP_SCRIPTS_PATH "%s %s", "hv_get_dns_info", if_name); /* * Execute the command to gather DNS info. From a4d024fe2e77063069c5f423f2f9be766450f0f9 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Mon, 2 Dec 2024 13:04:10 +0100 Subject: [PATCH 087/330] tools/hv: reduce resouce usage in hv_get_dns_info helper Remove the usage of cat. Replace the shell process with awk with 'exec'. Also use a generic shell because no bash specific features will be used. Signed-off-by: Olaf Hering Acked-by: Wei Liu Link: https://lore.kernel.org/r/20241202120432.21115-1-olaf@aepfle.de Signed-off-by: Wei Liu Message-ID: <20241202120432.21115-1-olaf@aepfle.de> --- tools/hv/hv_get_dns_info.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/hv/hv_get_dns_info.sh b/tools/hv/hv_get_dns_info.sh index 058c17b46ffcd..268521234d4b0 100755 --- a/tools/hv/hv_get_dns_info.sh +++ b/tools/hv/hv_get_dns_info.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/sh # This example script parses /etc/resolv.conf to retrive DNS information. # In the interest of keeping the KVP daemon code free of distro specific @@ -10,4 +10,4 @@ # this script can be based on the Network Manager APIs for retrieving DNS # entries. -cat /etc/resolv.conf 2>/dev/null | awk '/^nameserver/ { print $2 }' +exec awk '/^nameserver/ { print $2 }' /etc/resolv.conf 2>/dev/null From becc7fe329c09a7744fa908fca83418fa94a45a0 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Mon, 2 Dec 2024 13:40:52 +0100 Subject: [PATCH 088/330] tools/hv: add a .gitignore file Remove generated files from 'git status' output after 'make -C tools/hv'. Signed-off-by: Olaf Hering Link: https://lore.kernel.org/r/20241202124107.28650-1-olaf@aepfle.de Signed-off-by: Wei Liu Message-ID: <20241202124107.28650-1-olaf@aepfle.de> --- tools/hv/.gitignore | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 tools/hv/.gitignore diff --git a/tools/hv/.gitignore b/tools/hv/.gitignore new file mode 100644 index 0000000000000..0c5bc15d602f0 --- /dev/null +++ b/tools/hv/.gitignore @@ -0,0 +1,3 @@ +hv_fcopy_uio_daemon +hv_kvp_daemon +hv_vss_daemon From 175c71c2aceef173ae6d3dceb41edfc2ac0d5937 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Sun, 8 Dec 2024 23:47:17 +0000 Subject: [PATCH 089/330] tools/hv: reduce resource usage in hv_kvp_daemon hv_kvp_daemon uses popen(3) and system(3) as convinience helper to launch external helpers. These helpers are invoked via a temporary shell process. There is no need to keep this temporary process around while the helper runs. Replace this temporary shell with the actual helper process via 'exec'. Signed-off-by: Olaf Hering Link: https://lore.kernel.org/linux-hyperv/20241202123520.27812-1-olaf@aepfle.de/ Signed-off-by: Wei Liu --- tools/hv/hv_kvp_daemon.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tools/hv/hv_kvp_daemon.c b/tools/hv/hv_kvp_daemon.c index 296a7a62c54d3..04ba035d67e9d 100644 --- a/tools/hv/hv_kvp_daemon.c +++ b/tools/hv/hv_kvp_daemon.c @@ -725,7 +725,7 @@ static void kvp_get_ipconfig_info(char *if_name, * . */ - sprintf(cmd, KVP_SCRIPTS_PATH "%s %s", "hv_get_dns_info", if_name); + sprintf(cmd, "exec %s %s", KVP_SCRIPTS_PATH "hv_get_dns_info", if_name); /* * Execute the command to gather DNS info. @@ -742,7 +742,7 @@ static void kvp_get_ipconfig_info(char *if_name, * Enabled: DHCP enabled. */ - sprintf(cmd, KVP_SCRIPTS_PATH "%s %s", "hv_get_dhcp_info", if_name); + sprintf(cmd, "exec %s %s", KVP_SCRIPTS_PATH "hv_get_dhcp_info", if_name); file = popen(cmd, "r"); if (file == NULL) @@ -1606,8 +1606,9 @@ static int kvp_set_ip_info(char *if_name, struct hv_kvp_ipaddr_value *new_val) * invoke the external script to do its magic. */ - str_len = snprintf(cmd, sizeof(cmd), KVP_SCRIPTS_PATH "%s %s %s", - "hv_set_ifconfig", if_filename, nm_filename); + str_len = snprintf(cmd, sizeof(cmd), "exec %s %s %s", + KVP_SCRIPTS_PATH "hv_set_ifconfig", + if_filename, nm_filename); /* * This is a little overcautious, but it's necessary to suppress some * false warnings from gcc 8.0.1. From 7899ca9f3bd2b008e9a7c41f2a9f1986052d7e96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Mon, 2 Dec 2024 12:06:13 +0200 Subject: [PATCH 090/330] ACPI: resource: Fix memory resource type union access MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In acpi_decode_space() addr->info.mem.caching is checked on main level for any resource type but addr->info.mem is part of union and thus valid only if the resource type is memory range. Move the check inside the preceeding switch/case to only execute it when the union is of correct type. Fixes: fcb29bbcd540 ("ACPI: Add prefetch decoding to the address space parser") Signed-off-by: Ilpo Järvinen Link: https://patch.msgid.link/20241202100614.20731-1-ilpo.jarvinen@linux.intel.com Signed-off-by: Rafael J. Wysocki --- drivers/acpi/resource.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/acpi/resource.c b/drivers/acpi/resource.c index 7fe842dae1ec0..821867de43bea 100644 --- a/drivers/acpi/resource.c +++ b/drivers/acpi/resource.c @@ -250,6 +250,9 @@ static bool acpi_decode_space(struct resource_win *win, switch (addr->resource_type) { case ACPI_MEMORY_RANGE: acpi_dev_memresource_flags(res, len, wp); + + if (addr->info.mem.caching == ACPI_PREFETCHABLE_MEMORY) + res->flags |= IORESOURCE_PREFETCH; break; case ACPI_IO_RANGE: acpi_dev_ioresource_flags(res, len, iodec, @@ -265,9 +268,6 @@ static bool acpi_decode_space(struct resource_win *win, if (addr->producer_consumer == ACPI_PRODUCER) res->flags |= IORESOURCE_WINDOW; - if (addr->info.mem.caching == ACPI_PREFETCHABLE_MEMORY) - res->flags |= IORESOURCE_PREFETCH; - return !(res->flags & IORESOURCE_DISABLED); } From 6d44a780635b9b3fb6e20f366ee7750a9adfa884 Mon Sep 17 00:00:00 2001 From: Steve French Date: Mon, 9 Dec 2024 11:25:04 -0600 Subject: [PATCH 091/330] smb3: fix compiler warning in reparse code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit utf8s_to_utf16s() specifies pwcs as a wchar_t pointer (whether big endian or little endian is passed in as an additional parm), so to remove a distracting compile warning it needs to be cast as (wchar_t *) in parse_reparse_wsl_symlink() as done by other callers. Fixes: 06a7adf318a3 ("cifs: Add support for parsing WSL-style symlinks") Reviewed-by: Pali Rohár Signed-off-by: Steve French --- fs/smb/client/reparse.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c index 50b1aba200089..d88b41133e00c 100644 --- a/fs/smb/client/reparse.c +++ b/fs/smb/client/reparse.c @@ -676,7 +676,7 @@ static int parse_reparse_wsl_symlink(struct reparse_wsl_symlink_data_buffer *buf return -ENOMEM; symname_utf16_len = utf8s_to_utf16s(buf->PathBuffer, symname_utf8_len, UTF16_LITTLE_ENDIAN, - symname_utf16, symname_utf8_len * 2); + (wchar_t *) symname_utf16, symname_utf8_len * 2); if (symname_utf16_len < 0) { kfree(symname_utf16); return symname_utf16_len; From 246dfe3dc199246bd64635163115f2691623fc53 Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Mon, 9 Dec 2024 21:42:26 +0800 Subject: [PATCH 092/330] perf ftrace: Fix undefined behavior in cmp_profile_data() The comparison function cmp_profile_data() violates the C standard's requirements for qsort() comparison functions, which mandate symmetry and transitivity: * Symmetry: If x < y, then y > x. * Transitivity: If x < y and y < z, then x < z. When v1 and v2 are equal, the function incorrectly returns 1, breaking symmetry and transitivity. This causes undefined behavior, which can lead to memory corruption in certain versions of glibc [1]. Fix the issue by returning 0 when v1 and v2 are equal, ensuring compliance with the C standard and preventing undefined behavior. Link: https://www.qualys.com/2024/01/30/qsort.txt [1] Fixes: 0f223813edd0 ("perf ftrace: Add 'profile' command") Fixes: 74ae366c37b7 ("perf ftrace profile: Add -s/--sort option") Cc: stable@vger.kernel.org Signed-off-by: Kuan-Wei Chiu Reviewed-by: Namhyung Kim Reviewed-by: Arnaldo Carvalho de Melo Cc: jserv@ccns.ncku.edu.tw Cc: chuang@cs.nycu.edu.tw Link: https://lore.kernel.org/r/20241209134226.1939163-1-visitorckw@gmail.com Signed-off-by: Namhyung Kim --- tools/perf/builtin-ftrace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c index 272d3c70810e7..a56cf8b0a7d40 100644 --- a/tools/perf/builtin-ftrace.c +++ b/tools/perf/builtin-ftrace.c @@ -1151,8 +1151,9 @@ static int cmp_profile_data(const void *a, const void *b) if (v1 > v2) return -1; - else + if (v1 < v2) return 1; + return 0; } static void print_profile_result(struct perf_ftrace *ftrace) From 3f61a12b08bd84e6cf705484c2cb9d5b891882fa Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 5 Dec 2024 20:23:05 -0800 Subject: [PATCH 093/330] perf hwmon_pmu: Use openat rather than dup to refresh directory The hwmon PMU test will make a temp directory, open the directory with O_DIRECTORY then fill it with contents. As the open is before the filling the contents the later fdopendir may reflect the initial empty state, meaning no events are seen. Change to re-open the directory, rather than dup the fd, so the latest contents are seen. Minor tweaks/additions to debug messages. Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Link: https://lore.kernel.org/r/20241206042306.1055913-1-irogers@google.com Signed-off-by: Namhyung Kim --- tools/perf/util/hwmon_pmu.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/hwmon_pmu.c b/tools/perf/util/hwmon_pmu.c index e61429b38ba7b..4acb9bb19b846 100644 --- a/tools/perf/util/hwmon_pmu.c +++ b/tools/perf/util/hwmon_pmu.c @@ -258,8 +258,12 @@ static int hwmon_pmu__read_events(struct hwmon_pmu *pmu) if (pmu->pmu.sysfs_aliases_loaded) return 0; - /* Use a dup-ed fd as closedir will close it. */ - dup_fd = dup(pmu->hwmon_dir_fd); + /* + * Use a dup-ed fd as closedir will close it. Use openat so that the + * directory contents are refreshed. + */ + dup_fd = openat(pmu->hwmon_dir_fd, ".", O_DIRECTORY); + if (dup_fd == -1) return -ENOMEM; @@ -336,6 +340,9 @@ static int hwmon_pmu__read_events(struct hwmon_pmu *pmu) close(fd); } } + if (hashmap__size(&pmu->events) == 0) + pr_debug2("hwmon_pmu: %s has no events\n", pmu->pmu.name); + hashmap__for_each_entry_safe((&pmu->events), cur, tmp, bkt) { union hwmon_pmu_event_key key = { .type_and_num = cur->key, @@ -343,8 +350,8 @@ static int hwmon_pmu__read_events(struct hwmon_pmu *pmu) struct hwmon_pmu_event_value *value = cur->pvalue; if (!test_bit(HWMON_ITEM_INPUT, value->items)) { - pr_debug("hwmon_pmu: removing event '%s%d' that has no input file\n", - hwmon_type_strs[key.type], key.num); + pr_debug("hwmon_pmu: %s removing event '%s%d' that has no input file\n", + pmu->pmu.name, hwmon_type_strs[key.type], key.num); hashmap__delete(&pmu->events, key.type_and_num, &key, &value); zfree(&value->label); zfree(&value->name); From d4e17a322a8fc3266798ec8dee1fbe679078b2bc Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 5 Dec 2024 20:23:06 -0800 Subject: [PATCH 094/330] perf test hwmon_pmu: Fix event file location The temp directory is made and a known fake hwmon PMU created within it. Prior to this fix the events were being incorrectly written to the temp directory rather than the fake PMU directory. This didn't impact the test as the directory fd matched the wrong location, but it doesn't mirror what a hwmon PMU would actually look like. Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Link: https://lore.kernel.org/r/20241206042306.1055913-2-irogers@google.com Signed-off-by: Namhyung Kim --- tools/perf/tests/hwmon_pmu.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/tools/perf/tests/hwmon_pmu.c b/tools/perf/tests/hwmon_pmu.c index f8bcee9660d53..d2b066a2b557a 100644 --- a/tools/perf/tests/hwmon_pmu.c +++ b/tools/perf/tests/hwmon_pmu.c @@ -65,7 +65,7 @@ static struct perf_pmu *test_pmu_get(char *dir, size_t sz) { "temp2_label", "test hwmon event2\n", }, { "temp2_input", "50000\n", }, }; - int dirfd, file; + int hwmon_dirfd = -1, test_dirfd = -1, file; struct perf_pmu *hwm = NULL; ssize_t len; @@ -76,19 +76,24 @@ static struct perf_pmu *test_pmu_get(char *dir, size_t sz) dir[0] = '\0'; return NULL; } - dirfd = open(dir, O_DIRECTORY); - if (dirfd < 0) { + test_dirfd = open(dir, O_PATH|O_DIRECTORY); + if (test_dirfd < 0) { pr_err("Failed to open test directory \"%s\"\n", dir); goto err_out; } /* Create the test hwmon directory and give it a name. */ - if (mkdirat(dirfd, "hwmon1234", 0755) < 0) { + if (mkdirat(test_dirfd, "hwmon1234", 0755) < 0) { pr_err("Failed to mkdir hwmon directory\n"); goto err_out; } - file = openat(dirfd, "hwmon1234/name", O_WRONLY | O_CREAT, 0600); - if (!file) { + hwmon_dirfd = openat(test_dirfd, "hwmon1234", O_DIRECTORY); + if (hwmon_dirfd < 0) { + pr_err("Failed to open test hwmon directory \"%s/hwmon1234\"\n", dir); + goto err_out; + } + file = openat(hwmon_dirfd, "name", O_WRONLY | O_CREAT, 0600); + if (file < 0) { pr_err("Failed to open for writing file \"name\"\n"); goto err_out; } @@ -104,8 +109,8 @@ static struct perf_pmu *test_pmu_get(char *dir, size_t sz) for (size_t i = 0; i < ARRAY_SIZE(test_items); i++) { const struct test_item *item = &test_items[i]; - file = openat(dirfd, item->name, O_WRONLY | O_CREAT, 0600); - if (!file) { + file = openat(hwmon_dirfd, item->name, O_WRONLY | O_CREAT, 0600); + if (file < 0) { pr_err("Failed to open for writing file \"%s\"\n", item->name); goto err_out; } @@ -119,16 +124,18 @@ static struct perf_pmu *test_pmu_get(char *dir, size_t sz) } /* Make the PMU reading the files created above. */ - hwm = perf_pmus__add_test_hwmon_pmu(dirfd, "hwmon1234", test_hwmon_name); + hwm = perf_pmus__add_test_hwmon_pmu(hwmon_dirfd, "hwmon1234", test_hwmon_name); if (!hwm) pr_err("Test hwmon creation failed\n"); err_out: if (!hwm) { test_pmu_put(dir, hwm); - if (dirfd >= 0) - close(dirfd); + if (hwmon_dirfd >= 0) + close(hwmon_dirfd); } + if (test_dirfd >= 0) + close(test_dirfd); return hwm; } From 4011b351b1b5a953aaa7c6b3915f908b3cc1be96 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Tue, 26 Nov 2024 00:33:32 +0100 Subject: [PATCH 095/330] drm/panic: remove spurious empty line to clean warning Clippy in the upcoming Rust 1.83.0 spots a spurious empty line since the `clippy::empty_line_after_doc_comments` warning is now enabled by default given it is part of the `suspicious` group [1]: error: empty line after doc comment --> drivers/gpu/drm/drm_panic_qr.rs:931:1 | 931 | / /// They must remain valid for the duration of the function call. 932 | | | |_ 933 | #[no_mangle] 934 | / pub unsafe extern "C" fn drm_panic_qr_generate( 935 | | url: *const i8, 936 | | data: *mut u8, 937 | | data_len: usize, ... | 940 | | tmp_size: usize, 941 | | ) -> u8 { | |_______- the comment documents this function | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#empty_line_after_doc_comments = note: `-D clippy::empty-line-after-doc-comments` implied by `-D warnings` = help: to override `-D warnings` add `#[allow(clippy::empty_line_after_doc_comments)]` = help: if the empty line is unintentional remove it Thus remove the empty line. Cc: stable@vger.kernel.org Fixes: cb5164ac43d0 ("drm/panic: Add a QR code panic screen") Link: https://github.com/rust-lang/rust-clippy/pull/13091 [1] Reviewed-by: Jocelyn Falempe Link: https://lore.kernel.org/r/20241125233332.697497-1-ojeda@kernel.org Signed-off-by: Miguel Ojeda --- drivers/gpu/drm/drm_panic_qr.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpu/drm/drm_panic_qr.rs b/drivers/gpu/drm/drm_panic_qr.rs index 09500cddc009b..ef2d490965ba2 100644 --- a/drivers/gpu/drm/drm_panic_qr.rs +++ b/drivers/gpu/drm/drm_panic_qr.rs @@ -929,7 +929,6 @@ impl QrImage<'_> { /// * `tmp` must be valid for reading and writing for `tmp_size` bytes. /// /// They must remain valid for the duration of the function call. - #[no_mangle] pub unsafe extern "C" fn drm_panic_qr_generate( url: *const i8, From 1f806218164d1bb93f3db21eaf61254b08acdf03 Mon Sep 17 00:00:00 2001 From: "Luis Claudio R. Goncalves" Date: Fri, 6 Dec 2024 10:01:14 -0300 Subject: [PATCH 096/330] iommu/tegra241-cmdqv: do not use smp_processor_id in preemptible context During boot some of the calls to tegra241_cmdqv_get_cmdq() will happen in preemptible context. As this function calls smp_processor_id(), if CONFIG_DEBUG_PREEMPT is enabled, these calls will trigger a series of "BUG: using smp_processor_id() in preemptible" backtraces. As tegra241_cmdqv_get_cmdq() only calls smp_processor_id() to use the CPU number as a factor to balance out traffic on cmdq usage, it is safe to use raw_smp_processor_id() here. Cc: Fixes: 918eb5c856f6 ("iommu/arm-smmu-v3: Add in-kernel support for NVIDIA Tegra241 (Grace) CMDQV") Signed-off-by: Luis Claudio R. Goncalves Reviewed-by: Jason Gunthorpe Reviewed-by: Nicolin Chen Tested-by: Nicolin Chen Link: https://lore.kernel.org/r/Z1L1mja3nXzsJ0Pk@uudg.org Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c index c8ec74f089f3d..6e41ddaa24d63 100644 --- a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c +++ b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c @@ -339,7 +339,7 @@ tegra241_cmdqv_get_cmdq(struct arm_smmu_device *smmu, * one CPU at a time can enter the process, while the others * will be spinning at the same lock. */ - lidx = smp_processor_id() % cmdqv->num_lvcmdqs_per_vintf; + lidx = raw_smp_processor_id() % cmdqv->num_lvcmdqs_per_vintf; vcmdq = vintf->lvcmdqs[lidx]; if (!vcmdq || !READ_ONCE(vcmdq->enabled)) return NULL; From 7a5f93ea5862da91488975acaa0c7abd508f192b Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Sat, 23 Nov 2024 19:03:23 +0100 Subject: [PATCH 097/330] rust: kbuild: set `bindgen`'s Rust target version MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Each `bindgen` release may upgrade the list of Rust targets. For instance, currently, in their master branch [1], the latest ones are: Nightly => { vectorcall_abi: #124485, ptr_metadata: #81513, layout_for_ptr: #69835, }, Stable_1_77(77) => { offset_of: #106655 }, Stable_1_73(73) => { thiscall_abi: #42202 }, Stable_1_71(71) => { c_unwind_abi: #106075 }, Stable_1_68(68) => { abi_efiapi: #105795 }, By default, the highest stable release in their list is used, and users are expected to set one if they need to support older Rust versions (e.g. see [2]). Thus, over time, new Rust features are used by default, and at some point, it is likely that `bindgen` will emit Rust code that requires a Rust version higher than our minimum (or perhaps enabling an unstable feature). Currently, there is no problem because the maximum they have, as seen above, is Rust 1.77.0, and our current minimum is Rust 1.78.0. Therefore, set a Rust target explicitly now to prevent going forward in time too much and thus getting potential build failures at some point. Since we also support a minimum `bindgen` version, and since `bindgen` does not support passing unknown Rust target versions, we need to use the list of our minimum `bindgen` version, rather than the latest. So, since `bindgen` 0.65.1 had this list [3], we need to use Rust 1.68.0: /// Rust stable 1.64 /// * `core_ffi_c` ([Tracking issue](https://github.com/rust-lang/rust/issues/94501)) => Stable_1_64 => 1.64; /// Rust stable 1.68 /// * `abi_efiapi` calling convention ([Tracking issue](https://github.com/rust-lang/rust/issues/65815)) => Stable_1_68 => 1.68; /// Nightly rust /// * `thiscall` calling convention ([Tracking issue](https://github.com/rust-lang/rust/issues/42202)) /// * `vectorcall` calling convention (no tracking issue) /// * `c_unwind` calling convention ([Tracking issue](https://github.com/rust-lang/rust/issues/74990)) => Nightly => nightly; ... /// Latest stable release of Rust pub const LATEST_STABLE_RUST: RustTarget = RustTarget::Stable_1_68; Thus add the `--rust-target 1.68` parameter. Add a comment as well explaining this. An alternative would be to use the currently running (i.e. actual) `rustc` and `bindgen` versions to pick a "better" Rust target version. However, that would introduce more moving parts depending on the user setup and is also more complex to implement. Starting with `bindgen` 0.71.0 [4], we will be able to set any future Rust version instead, i.e. we will be able to set here our minimum supported Rust version. Christian implemented it [5] after seeing this patch. Thanks! Cc: Christian Poveda Cc: Emilio Cobos Álvarez Cc: stable@vger.kernel.org # needed for 6.12.y; unneeded for 6.6.y; do not apply to 6.1.y Fixes: c844fa64a2d4 ("rust: start supporting several `bindgen` versions") Link: https://github.com/rust-lang/rust-bindgen/blob/21c60f473f4e824d4aa9b2b508056320d474b110/bindgen/features.rs#L97-L105 [1] Link: https://github.com/rust-lang/rust-bindgen/issues/2960 [2] Link: https://github.com/rust-lang/rust-bindgen/blob/7d243056d335fdc4537f7bca73c06d01aae24ddc/bindgen/features.rs#L131-L150 [3] Link: https://github.com/rust-lang/rust-bindgen/blob/main/CHANGELOG.md#0710-2024-12-06 [4] Link: https://github.com/rust-lang/rust-bindgen/pull/2993 [5] Reviewed-by: Alice Ryhl Link: https://lore.kernel.org/r/20241123180323.255997-1-ojeda@kernel.org Signed-off-by: Miguel Ojeda --- rust/Makefile | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/rust/Makefile b/rust/Makefile index 9da9042fd6279..a40a3936126d6 100644 --- a/rust/Makefile +++ b/rust/Makefile @@ -280,9 +280,22 @@ endif # architecture instead of generating `usize`. bindgen_c_flags_final = $(bindgen_c_flags_lto) -fno-builtin -D__BINDGEN__ +# Each `bindgen` release may upgrade the list of Rust target versions. By +# default, the highest stable release in their list is used. Thus we need to set +# a `--rust-target` to avoid future `bindgen` releases emitting code that +# `rustc` may not understand. On top of that, `bindgen` does not support passing +# an unknown Rust target version. +# +# Therefore, the Rust target for `bindgen` can be only as high as the minimum +# Rust version the kernel supports and only as high as the greatest stable Rust +# target supported by the minimum `bindgen` version the kernel supports (that +# is, if we do not test the actual `rustc`/`bindgen` versions running). +# +# Starting with `bindgen` 0.71.0, we will be able to set any future Rust version +# instead, i.e. we will be able to set here our minimum supported Rust version. quiet_cmd_bindgen = BINDGEN $@ cmd_bindgen = \ - $(BINDGEN) $< $(bindgen_target_flags) \ + $(BINDGEN) $< $(bindgen_target_flags) --rust-target 1.68 \ --use-core --with-derive-default --ctypes-prefix ffi --no-layout-tests \ --no-debug '.*' --enable-function-attribute-detection \ -o $@ -- $(bindgen_c_flags_final) -DMODULE \ From f103396ae31851d00b561ff9f8a32a441953ff8b Mon Sep 17 00:00:00 2001 From: liuderong Date: Fri, 6 Dec 2024 15:29:42 +0800 Subject: [PATCH 098/330] scsi: ufs: core: Update compl_time_stamp_local_clock after completing a cqe lrbp->compl_time_stamp_local_clock is set to zero after sending a sqe but it is not updated after completing a cqe. Thus the printed information in ufshcd_print_tr() will always be zero. Update lrbp->cmpl_time_stamp_local_clock after completing a cqe. Log sample: ufshcd-qcom 1d84000.ufshc: UPIU[8] - issue time 8750227249 us ufshcd-qcom 1d84000.ufshc: UPIU[8] - complete time 0 us Fixes: c30d8d010b5e ("scsi: ufs: core: Prepare for completion in MCQ") Reviewed-by: Bean Huo Reviewed-by: Peter Wang Signed-off-by: liuderong Link: https://lore.kernel.org/r/1733470182-220841-1-git-send-email-liuderong@oppo.com Reviewed-by: Avri Altman Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index b7ec5797d5baa..8a01e4393159d 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -5556,6 +5556,7 @@ void ufshcd_compl_one_cqe(struct ufs_hba *hba, int task_tag, lrbp = &hba->lrb[task_tag]; lrbp->compl_time_stamp = ktime_get(); + lrbp->compl_time_stamp_local_clock = local_clock(); cmd = lrbp->cmd; if (cmd) { if (unlikely(ufshcd_should_inform_monitor(hba, lrbp))) From 8552cb04e0831df3ff265c75ad33f705a45bc731 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 29 Nov 2024 17:53:16 +0800 Subject: [PATCH 099/330] crypto: rsassa-pkcs1 - Copy source data for SG list As virtual addresses in general may not be suitable for DMA, always perform a copy before using them in an SG list. Fixes: 1e562deacecc ("crypto: rsassa-pkcs1 - Migrate to sig_alg backend") Reported-by: Zorro Lang Signed-off-by: Herbert Xu --- crypto/rsassa-pkcs1.c | 45 ++++++++++++++----------------------------- 1 file changed, 14 insertions(+), 31 deletions(-) diff --git a/crypto/rsassa-pkcs1.c b/crypto/rsassa-pkcs1.c index 4d077fc960767..f68ffd338f483 100644 --- a/crypto/rsassa-pkcs1.c +++ b/crypto/rsassa-pkcs1.c @@ -163,10 +163,6 @@ static int rsassa_pkcs1_sign(struct crypto_sig *tfm, struct rsassa_pkcs1_inst_ctx *ictx = sig_instance_ctx(inst); const struct hash_prefix *hash_prefix = ictx->hash_prefix; struct rsassa_pkcs1_ctx *ctx = crypto_sig_ctx(tfm); - unsigned int child_reqsize = crypto_akcipher_reqsize(ctx->child); - struct akcipher_request *child_req __free(kfree_sensitive) = NULL; - struct scatterlist in_sg[3], out_sg; - struct crypto_wait cwait; unsigned int pad_len; unsigned int ps_end; unsigned int len; @@ -187,37 +183,25 @@ static int rsassa_pkcs1_sign(struct crypto_sig *tfm, pad_len = ctx->key_size - slen - hash_prefix->size - 1; - child_req = kmalloc(sizeof(*child_req) + child_reqsize + pad_len, - GFP_KERNEL); - if (!child_req) - return -ENOMEM; - /* RFC 8017 sec 8.2.1 step 1 - EMSA-PKCS1-v1_5 encoding generation */ - in_buf = (u8 *)(child_req + 1) + child_reqsize; + in_buf = dst; + memmove(in_buf + pad_len + hash_prefix->size, src, slen); + memcpy(in_buf + pad_len, hash_prefix->data, hash_prefix->size); + ps_end = pad_len - 1; in_buf[0] = 0x01; memset(in_buf + 1, 0xff, ps_end - 1); in_buf[ps_end] = 0x00; - /* RFC 8017 sec 8.2.1 step 2 - RSA signature */ - crypto_init_wait(&cwait); - sg_init_table(in_sg, 3); - sg_set_buf(&in_sg[0], in_buf, pad_len); - sg_set_buf(&in_sg[1], hash_prefix->data, hash_prefix->size); - sg_set_buf(&in_sg[2], src, slen); - sg_init_one(&out_sg, dst, dlen); - akcipher_request_set_tfm(child_req, ctx->child); - akcipher_request_set_crypt(child_req, in_sg, &out_sg, - ctx->key_size - 1, dlen); - akcipher_request_set_callback(child_req, CRYPTO_TFM_REQ_MAY_SLEEP, - crypto_req_done, &cwait); - err = crypto_akcipher_decrypt(child_req); - err = crypto_wait_req(err, &cwait); - if (err) + /* RFC 8017 sec 8.2.1 step 2 - RSA signature */ + err = crypto_akcipher_sync_decrypt(ctx->child, in_buf, + ctx->key_size - 1, in_buf, + ctx->key_size); + if (err < 0) return err; - len = child_req->dst_len; + len = err; pad_len = ctx->key_size - len; /* Four billion to one */ @@ -239,8 +223,8 @@ static int rsassa_pkcs1_verify(struct crypto_sig *tfm, struct rsassa_pkcs1_ctx *ctx = crypto_sig_ctx(tfm); unsigned int child_reqsize = crypto_akcipher_reqsize(ctx->child); struct akcipher_request *child_req __free(kfree_sensitive) = NULL; - struct scatterlist in_sg, out_sg; struct crypto_wait cwait; + struct scatterlist sg; unsigned int dst_len; unsigned int pos; u8 *out_buf; @@ -259,13 +243,12 @@ static int rsassa_pkcs1_verify(struct crypto_sig *tfm, return -ENOMEM; out_buf = (u8 *)(child_req + 1) + child_reqsize; + memcpy(out_buf, src, slen); crypto_init_wait(&cwait); - sg_init_one(&in_sg, src, slen); - sg_init_one(&out_sg, out_buf, ctx->key_size); + sg_init_one(&sg, out_buf, slen); akcipher_request_set_tfm(child_req, ctx->child); - akcipher_request_set_crypt(child_req, &in_sg, &out_sg, - slen, ctx->key_size); + akcipher_request_set_crypt(child_req, &sg, &sg, slen, slen); akcipher_request_set_callback(child_req, CRYPTO_TFM_REQ_MAY_SLEEP, crypto_req_done, &cwait); From cd26cd65476711e2c69e0a049c0eeef4b743f5ac Mon Sep 17 00:00:00 2001 From: Chenghai Huang Date: Sat, 30 Nov 2024 16:01:31 +0800 Subject: [PATCH 100/330] crypto: hisilicon/debugfs - fix the struct pointer incorrectly offset problem Offset based on (id * size) is wrong for sqc and cqc. (*sqc/*cqc + 1) can already offset sizeof(struct(Xqc)) length. Fixes: 15f112f9cef5 ("crypto: hisilicon/debugfs - mask the unnecessary info from the dump") Cc: Signed-off-by: Chenghai Huang Signed-off-by: Herbert Xu --- drivers/crypto/hisilicon/debugfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/hisilicon/debugfs.c b/drivers/crypto/hisilicon/debugfs.c index 1b9b7bccdeff0..45e130b901eb5 100644 --- a/drivers/crypto/hisilicon/debugfs.c +++ b/drivers/crypto/hisilicon/debugfs.c @@ -192,7 +192,7 @@ static int qm_sqc_dump(struct hisi_qm *qm, char *s, char *name) down_read(&qm->qps_lock); if (qm->sqc) { - memcpy(&sqc, qm->sqc + qp_id * sizeof(struct qm_sqc), sizeof(struct qm_sqc)); + memcpy(&sqc, qm->sqc + qp_id, sizeof(struct qm_sqc)); sqc.base_h = cpu_to_le32(QM_XQC_ADDR_MASK); sqc.base_l = cpu_to_le32(QM_XQC_ADDR_MASK); dump_show(qm, &sqc, sizeof(struct qm_sqc), "SOFT SQC"); @@ -229,7 +229,7 @@ static int qm_cqc_dump(struct hisi_qm *qm, char *s, char *name) down_read(&qm->qps_lock); if (qm->cqc) { - memcpy(&cqc, qm->cqc + qp_id * sizeof(struct qm_cqc), sizeof(struct qm_cqc)); + memcpy(&cqc, qm->cqc + qp_id, sizeof(struct qm_cqc)); cqc.base_h = cpu_to_le32(QM_XQC_ADDR_MASK); cqc.base_l = cpu_to_le32(QM_XQC_ADDR_MASK); dump_show(qm, &cqc, sizeof(struct qm_cqc), "SOFT CQC"); From b7ffecbe198e2dfc44abf92ceb90f46150f7527a Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Mon, 9 Dec 2024 20:06:57 -0800 Subject: [PATCH 101/330] memcg: slub: fix SUnreclaim for post charged objects Large kmalloc directly allocates from the page allocator and then use lruvec_stat_mod_folio() to increment the unreclaimable slab stats for global and memcg. However when post memcg charging of slab objects was added in commit 9028cdeb38e1 ("memcg: add charging of already allocated slab objects"), it missed to correctly handle the unreclaimable slab stats for memcg. One user visisble effect of that bug is that the node level unreclaimable slab stat will work correctly but the memcg level stat can underflow as kernel correctly handles the free path but the charge path missed to increment the memcg level unreclaimable slab stat. Let's fix by correctly handle in the post charge code path. Fixes: 9028cdeb38e1 ("memcg: add charging of already allocated slab objects") Signed-off-by: Shakeel Butt Cc: Signed-off-by: Vlastimil Babka --- mm/slub.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 19980419b176c..c2151c9fee228 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2189,9 +2189,24 @@ bool memcg_slab_post_charge(void *p, gfp_t flags) folio = virt_to_folio(p); if (!folio_test_slab(folio)) { - return folio_memcg_kmem(folio) || - (__memcg_kmem_charge_page(folio_page(folio, 0), flags, - folio_order(folio)) == 0); + int size; + + if (folio_memcg_kmem(folio)) + return true; + + if (__memcg_kmem_charge_page(folio_page(folio, 0), flags, + folio_order(folio))) + return false; + + /* + * This folio has already been accounted in the global stats but + * not in the memcg stats. So, subtract from the global and use + * the interface which adds to both global and memcg stats. + */ + size = folio_size(folio); + node_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, -size); + lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, size); + return true; } slab = folio_slab(folio); From 4a552f7890f0870f6d9fd4fbc6c05cea7bfd4503 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 5 Dec 2024 20:13:41 -0400 Subject: [PATCH 102/330] iommu/amd: Put list_add/del(dev_data) back under the domain->lock The list domain->dev_list is protected by the domain->lock spinlock. Any iteration, addition or removal must be under the lock. Move the list_del() up into the critical section. pdom_is_sva_capable(), and destroy_gcr3_table() do not interact with the list element. Wrap the list_add() in a lock, it would make more sense if this was under the same critical section as adjusting the refcounts earlier, but that requires more complications. Fixes: d6b47dec3684 ("iommu/amd: Reduce domain lock scope in attach device path") Signed-off-by: Jason Gunthorpe Reviewed-by: Vasant Hegde Link: https://lore.kernel.org/r/1-v1-3b9edcf8067d+3975-amd_dev_list_locking_jgg@nvidia.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/iommu.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 3f691e1fd22ce..23a168e229b17 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2073,6 +2073,7 @@ static int attach_device(struct device *dev, struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); struct pci_dev *pdev; + unsigned long flags; int ret = 0; mutex_lock(&dev_data->mutex); @@ -2113,7 +2114,9 @@ static int attach_device(struct device *dev, /* Update data structures */ dev_data->domain = domain; + spin_lock_irqsave(&domain->lock, flags); list_add(&dev_data->list, &domain->dev_list); + spin_unlock_irqrestore(&domain->lock, flags); /* Update device table */ dev_update_dte(dev_data, true); @@ -2160,6 +2163,7 @@ static void detach_device(struct device *dev) /* Flush IOTLB and wait for the flushes to finish */ spin_lock_irqsave(&domain->lock, flags); amd_iommu_domain_flush_all(domain); + list_del(&dev_data->list); spin_unlock_irqrestore(&domain->lock, flags); /* Clear GCR3 table */ @@ -2168,7 +2172,6 @@ static void detach_device(struct device *dev) /* Update data structures */ dev_data->domain = NULL; - list_del(&dev_data->list); /* decrease reference counters - needs to happen after the flushes */ pdom_detach_iommu(iommu, domain); From 91da87d38cca46be6f2f4f3c820f5d817c1866dc Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 5 Dec 2024 20:13:42 -0400 Subject: [PATCH 103/330] iommu/amd: Add lockdep asserts for domain->dev_list Add an assertion to all the iteration points that don't obviously have the lock held already. These all take the locker higher in their call chains. Signed-off-by: Jason Gunthorpe Reviewed-by: Vasant Hegde Link: https://lore.kernel.org/r/2-v1-3b9edcf8067d+3975-amd_dev_list_locking_jgg@nvidia.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/iommu.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 23a168e229b17..16f40b8000d79 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1415,6 +1415,7 @@ static int domain_flush_pages_v2(struct protection_domain *pdom, struct iommu_cmd cmd; int ret = 0; + lockdep_assert_held(&pdom->lock); list_for_each_entry(dev_data, &pdom->dev_list, list) { struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev); u16 domid = dev_data->gcr3_info.domid; @@ -1464,6 +1465,8 @@ static void __domain_flush_pages(struct protection_domain *domain, ioasid_t pasid = IOMMU_NO_PASID; bool gn = false; + lockdep_assert_held(&domain->lock); + if (pdom_is_v2_pgtbl_mode(domain)) { gn = true; ret = domain_flush_pages_v2(domain, address, size); @@ -1585,6 +1588,8 @@ void amd_iommu_update_and_flush_device_table(struct protection_domain *domain) { struct iommu_dev_data *dev_data; + lockdep_assert_held(&domain->lock); + list_for_each_entry(dev_data, &domain->dev_list, list) { struct amd_iommu *iommu = rlookup_amd_iommu(dev_data->dev); From eb9640fd1ce666610b77f5997596e9570a36378f Mon Sep 17 00:00:00 2001 From: Alan Borzeszkowski Date: Wed, 4 Dec 2024 09:04:09 +0200 Subject: [PATCH 104/330] gpio: graniterapids: Fix vGPIO driver crash Move setting irq_chip.name from probe() function to the initialization of "irq_chip" struct in order to fix vGPIO driver crash during bootup. Crash was caused by unauthorized modification of irq_chip.name field where irq_chip struct was initialized as const. This behavior is a consequence of suboptimal implementation of gpio_irq_chip_set_chip(), which should be changed to avoid casting away const qualifier. Crash log: BUG: unable to handle page fault for address: ffffffffc0ba81c0 /#PF: supervisor write access in kernel mode /#PF: error_code(0x0003) - permissions violation CPU: 33 UID: 0 PID: 1075 Comm: systemd-udevd Not tainted 6.12.0-rc6-00077-g2e1b3cc9d7f7 #1 Hardware name: Intel Corporation Kaseyville RP/Kaseyville RP, BIOS KVLDCRB1.PGS.0026.D73.2410081258 10/08/2024 RIP: 0010:gnr_gpio_probe+0x171/0x220 [gpio_graniterapids] Cc: stable@vger.kernel.org Signed-off-by: Alan Borzeszkowski Signed-off-by: Mika Westerberg Acked-by: Andy Shevchenko Link: https://lore.kernel.org/r/20241204070415.1034449-2-mika.westerberg@linux.intel.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-graniterapids.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-graniterapids.c b/drivers/gpio/gpio-graniterapids.c index f2e911a3d2ca0..9da2999dc30fb 100644 --- a/drivers/gpio/gpio-graniterapids.c +++ b/drivers/gpio/gpio-graniterapids.c @@ -234,6 +234,7 @@ static int gnr_gpio_irq_set_type(struct irq_data *d, unsigned int type) } static const struct irq_chip gnr_gpio_irq_chip = { + .name = "gpio-graniterapids", .irq_ack = gnr_gpio_irq_ack, .irq_mask = gnr_gpio_irq_mask, .irq_unmask = gnr_gpio_irq_unmask, @@ -324,7 +325,6 @@ static int gnr_gpio_probe(struct platform_device *pdev) girq = &priv->gc.irq; gpio_irq_chip_set_chip(girq, &gnr_gpio_irq_chip); - girq->chip->name = dev_name(dev); girq->parent_handler = NULL; girq->num_parents = 0; girq->parents = NULL; From 7382d2f0e802077c36495e325da8d253a15fb441 Mon Sep 17 00:00:00 2001 From: Alan Borzeszkowski Date: Wed, 4 Dec 2024 09:04:10 +0200 Subject: [PATCH 105/330] gpio: graniterapids: Fix incorrect BAR assignment Base Address of vGPIO MMIO register is provided directly by the BIOS instead of using offsets. Update address assignment to reflect this change in driver. Cc: stable@vger.kernel.org Signed-off-by: Alan Borzeszkowski Signed-off-by: Mika Westerberg Acked-by: Andy Shevchenko Link: https://lore.kernel.org/r/20241204070415.1034449-3-mika.westerberg@linux.intel.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-graniterapids.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/gpio/gpio-graniterapids.c b/drivers/gpio/gpio-graniterapids.c index 9da2999dc30fb..d2b542b536b60 100644 --- a/drivers/gpio/gpio-graniterapids.c +++ b/drivers/gpio/gpio-graniterapids.c @@ -32,7 +32,7 @@ #define GNR_PINS_PER_REG 32 #define GNR_NUM_REGS DIV_ROUND_UP(GNR_NUM_PINS, GNR_PINS_PER_REG) -#define GNR_CFG_BAR 0x00 +#define GNR_CFG_PADBAR 0x00 #define GNR_CFG_LOCK_OFFSET 0x04 #define GNR_GPI_STATUS_OFFSET 0x20 #define GNR_GPI_ENABLE_OFFSET 0x24 @@ -50,6 +50,7 @@ * struct gnr_gpio - Intel Granite Rapids-D vGPIO driver state * @gc: GPIO controller interface * @reg_base: base address of the GPIO registers + * @pad_base: base address of the vGPIO pad configuration registers * @ro_bitmap: bitmap of read-only pins * @lock: guard the registers * @pad_backup: backup of the register state for suspend @@ -57,6 +58,7 @@ struct gnr_gpio { struct gpio_chip gc; void __iomem *reg_base; + void __iomem *pad_base; DECLARE_BITMAP(ro_bitmap, GNR_NUM_PINS); raw_spinlock_t lock; u32 pad_backup[]; @@ -65,7 +67,7 @@ struct gnr_gpio { static void __iomem *gnr_gpio_get_padcfg_addr(const struct gnr_gpio *priv, unsigned int gpio) { - return priv->reg_base + gpio * sizeof(u32); + return priv->pad_base + gpio * sizeof(u32); } static int gnr_gpio_configure_line(struct gpio_chip *gc, unsigned int gpio, @@ -292,6 +294,7 @@ static int gnr_gpio_probe(struct platform_device *pdev) struct gnr_gpio *priv; void __iomem *regs; int irq, ret; + u32 offset; priv = devm_kzalloc(dev, struct_size(priv, pad_backup, num_backup_pins), GFP_KERNEL); if (!priv) @@ -303,6 +306,10 @@ static int gnr_gpio_probe(struct platform_device *pdev) if (IS_ERR(regs)) return PTR_ERR(regs); + priv->reg_base = regs; + offset = readl(priv->reg_base + GNR_CFG_PADBAR); + priv->pad_base = priv->reg_base + offset; + irq = platform_get_irq(pdev, 0); if (irq < 0) return irq; @@ -312,8 +319,6 @@ static int gnr_gpio_probe(struct platform_device *pdev) if (ret) return dev_err_probe(dev, ret, "failed to request interrupt\n"); - priv->reg_base = regs + readl(regs + GNR_CFG_BAR); - gnr_gpio_init_pin_ro_bits(dev, priv->reg_base + GNR_CFG_LOCK_OFFSET, priv->ro_bitmap); From 0fe329b55231cca489f9bed1db0e778d077fdaf9 Mon Sep 17 00:00:00 2001 From: Shankar Bandal Date: Wed, 4 Dec 2024 09:04:11 +0200 Subject: [PATCH 106/330] gpio: graniterapids: Fix invalid GPI_IS register offset Update GPI Interrupt Status register offset to correct value. Cc: stable@vger.kernel.org Signed-off-by: Shankar Bandal Signed-off-by: Alan Borzeszkowski Signed-off-by: Mika Westerberg Acked-by: Andy Shevchenko Link: https://lore.kernel.org/r/20241204070415.1034449-4-mika.westerberg@linux.intel.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-graniterapids.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-graniterapids.c b/drivers/gpio/gpio-graniterapids.c index d2b542b536b60..be907784ccdb6 100644 --- a/drivers/gpio/gpio-graniterapids.c +++ b/drivers/gpio/gpio-graniterapids.c @@ -34,7 +34,7 @@ #define GNR_CFG_PADBAR 0x00 #define GNR_CFG_LOCK_OFFSET 0x04 -#define GNR_GPI_STATUS_OFFSET 0x20 +#define GNR_GPI_STATUS_OFFSET 0x14 #define GNR_GPI_ENABLE_OFFSET 0x24 #define GNR_CFG_DW_RX_MASK GENMASK(25, 22) From 15636b00a055474033426b94b6372728b2163a1e Mon Sep 17 00:00:00 2001 From: Shankar Bandal Date: Wed, 4 Dec 2024 09:04:12 +0200 Subject: [PATCH 107/330] gpio: graniterapids: Fix invalid RXEVCFG register bitmask Correct RX Level/Edge Configuration register (RXEVCFG) bitmask. Cc: stable@vger.kernel.org Signed-off-by: Shankar Bandal Signed-off-by: Alan Borzeszkowski Signed-off-by: Mika Westerberg Acked-by: Andy Shevchenko Link: https://lore.kernel.org/r/20241204070415.1034449-5-mika.westerberg@linux.intel.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-graniterapids.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-graniterapids.c b/drivers/gpio/gpio-graniterapids.c index be907784ccdb6..ec2931a65723b 100644 --- a/drivers/gpio/gpio-graniterapids.c +++ b/drivers/gpio/gpio-graniterapids.c @@ -37,7 +37,7 @@ #define GNR_GPI_STATUS_OFFSET 0x14 #define GNR_GPI_ENABLE_OFFSET 0x24 -#define GNR_CFG_DW_RX_MASK GENMASK(25, 22) +#define GNR_CFG_DW_RX_MASK GENMASK(23, 22) #define GNR_CFG_DW_RX_DISABLE FIELD_PREP(GNR_CFG_DW_RX_MASK, 2) #define GNR_CFG_DW_RX_EDGE FIELD_PREP(GNR_CFG_DW_RX_MASK, 1) #define GNR_CFG_DW_RX_LEVEL FIELD_PREP(GNR_CFG_DW_RX_MASK, 0) From 0588504d28dedde6789aec17a6ece6fa8e477725 Mon Sep 17 00:00:00 2001 From: Alan Borzeszkowski Date: Wed, 4 Dec 2024 09:04:13 +0200 Subject: [PATCH 108/330] gpio: graniterapids: Determine if GPIO pad can be used by driver Add check of HOSTSW_MODE bit to determine if GPIO pad can be used by the driver. Cc: stable@vger.kernel.org Signed-off-by: Alan Borzeszkowski Signed-off-by: Mika Westerberg Acked-by: Andy Shevchenko Link: https://lore.kernel.org/r/20241204070415.1034449-6-mika.westerberg@linux.intel.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-graniterapids.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/gpio/gpio-graniterapids.c b/drivers/gpio/gpio-graniterapids.c index ec2931a65723b..b12abe77299c0 100644 --- a/drivers/gpio/gpio-graniterapids.c +++ b/drivers/gpio/gpio-graniterapids.c @@ -37,6 +37,7 @@ #define GNR_GPI_STATUS_OFFSET 0x14 #define GNR_GPI_ENABLE_OFFSET 0x24 +#define GNR_CFG_DW_HOSTSW_MODE BIT(27) #define GNR_CFG_DW_RX_MASK GENMASK(23, 22) #define GNR_CFG_DW_RX_DISABLE FIELD_PREP(GNR_CFG_DW_RX_MASK, 2) #define GNR_CFG_DW_RX_EDGE FIELD_PREP(GNR_CFG_DW_RX_MASK, 1) @@ -90,6 +91,20 @@ static int gnr_gpio_configure_line(struct gpio_chip *gc, unsigned int gpio, return 0; } +static int gnr_gpio_request(struct gpio_chip *gc, unsigned int gpio) +{ + struct gnr_gpio *priv = gpiochip_get_data(gc); + u32 dw; + + dw = readl(gnr_gpio_get_padcfg_addr(priv, gpio)); + if (!(dw & GNR_CFG_DW_HOSTSW_MODE)) { + dev_warn(gc->parent, "GPIO %u is not owned by host", gpio); + return -EBUSY; + } + + return 0; +} + static int gnr_gpio_get(struct gpio_chip *gc, unsigned int gpio) { const struct gnr_gpio *priv = gpiochip_get_data(gc); @@ -141,6 +156,7 @@ static int gnr_gpio_direction_output(struct gpio_chip *gc, unsigned int gpio, in static const struct gpio_chip gnr_gpio_chip = { .owner = THIS_MODULE, + .request = gnr_gpio_request, .get = gnr_gpio_get, .set = gnr_gpio_set, .get_direction = gnr_gpio_get_direction, From c0ec4890d6454980c53c3cc164140115c4a671f2 Mon Sep 17 00:00:00 2001 From: Alan Borzeszkowski Date: Wed, 4 Dec 2024 09:04:14 +0200 Subject: [PATCH 109/330] gpio: graniterapids: Check if GPIO line can be used for IRQs GPIO line can only be used as interrupt if its INTSEL register is programmed by the BIOS. Cc: stable@vger.kernel.org Signed-off-by: Alan Borzeszkowski Signed-off-by: Mika Westerberg Acked-by: Andy Shevchenko Link: https://lore.kernel.org/r/20241204070415.1034449-7-mika.westerberg@linux.intel.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-graniterapids.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/gpio/gpio-graniterapids.c b/drivers/gpio/gpio-graniterapids.c index b12abe77299c0..3a972d460fe2b 100644 --- a/drivers/gpio/gpio-graniterapids.c +++ b/drivers/gpio/gpio-graniterapids.c @@ -39,6 +39,7 @@ #define GNR_CFG_DW_HOSTSW_MODE BIT(27) #define GNR_CFG_DW_RX_MASK GENMASK(23, 22) +#define GNR_CFG_DW_INTSEL_MASK GENMASK(21, 14) #define GNR_CFG_DW_RX_DISABLE FIELD_PREP(GNR_CFG_DW_RX_MASK, 2) #define GNR_CFG_DW_RX_EDGE FIELD_PREP(GNR_CFG_DW_RX_MASK, 1) #define GNR_CFG_DW_RX_LEVEL FIELD_PREP(GNR_CFG_DW_RX_MASK, 0) @@ -227,10 +228,18 @@ static void gnr_gpio_irq_unmask(struct irq_data *d) static int gnr_gpio_irq_set_type(struct irq_data *d, unsigned int type) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); - irq_hw_number_t pin = irqd_to_hwirq(d); - u32 mask = GNR_CFG_DW_RX_MASK; + struct gnr_gpio *priv = gpiochip_get_data(gc); + irq_hw_number_t hwirq = irqd_to_hwirq(d); + u32 reg; u32 set; + /* Allow interrupts only if Interrupt Select field is non-zero */ + reg = readl(gnr_gpio_get_padcfg_addr(priv, hwirq)); + if (!(reg & GNR_CFG_DW_INTSEL_MASK)) { + dev_dbg(gc->parent, "GPIO %lu cannot be used as IRQ", hwirq); + return -EPERM; + } + /* Falling edge and level low triggers not supported by the GPIO controller */ switch (type) { case IRQ_TYPE_NONE: @@ -248,7 +257,7 @@ static int gnr_gpio_irq_set_type(struct irq_data *d, unsigned int type) return -EINVAL; } - return gnr_gpio_configure_line(gc, pin, mask, set); + return gnr_gpio_configure_line(gc, hwirq, GNR_CFG_DW_RX_MASK, set); } static const struct irq_chip gnr_gpio_irq_chip = { From 0bb18e34abdde7bf58fca8542e2dcf621924ea19 Mon Sep 17 00:00:00 2001 From: Alan Borzeszkowski Date: Wed, 4 Dec 2024 09:04:15 +0200 Subject: [PATCH 110/330] gpio: graniterapids: Fix GPIO Ack functionality Interrupt status (GPI_IS) register is cleared by writing 1 to it, not 0. Cc: stable@vger.kernel.org Signed-off-by: Alan Borzeszkowski Signed-off-by: Mika Westerberg Acked-by: Andy Shevchenko Link: https://lore.kernel.org/r/20241204070415.1034449-8-mika.westerberg@linux.intel.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-graniterapids.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-graniterapids.c b/drivers/gpio/gpio-graniterapids.c index 3a972d460fe2b..ad6a045fd3d2d 100644 --- a/drivers/gpio/gpio-graniterapids.c +++ b/drivers/gpio/gpio-graniterapids.c @@ -185,7 +185,7 @@ static void gnr_gpio_irq_ack(struct irq_data *d) guard(raw_spinlock_irqsave)(&priv->lock); reg = readl(addr); - reg &= ~BIT(bit_idx); + reg |= BIT(bit_idx); writel(reg, addr); } From 9ac4b58fcef0f9fc03fa6e126a5f53c1c71ada8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 3 Dec 2024 18:26:30 +0100 Subject: [PATCH 111/330] gpio: idio-16: Actually make use of the GPIO_IDIO_16 symbol namespace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DEFAULT_SYMBOL_NAMESPACE must already be defined when is included. So move the define above the include block. Fixes: b9b1fc1ae119 ("gpio: idio-16: Introduce the ACCES IDIO-16 GPIO library module") Signed-off-by: Uwe Kleine-König Acked-by: William Breathitt Gray Link: https://lore.kernel.org/r/20241203172631.1647792-2-u.kleine-koenig@baylibre.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-idio-16.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpio/gpio-idio-16.c b/drivers/gpio/gpio-idio-16.c index 2c95125892972..0103be977c66b 100644 --- a/drivers/gpio/gpio-idio-16.c +++ b/drivers/gpio/gpio-idio-16.c @@ -3,6 +3,9 @@ * GPIO library for the ACCES IDIO-16 family * Copyright (C) 2022 William Breathitt Gray */ + +#define DEFAULT_SYMBOL_NAMESPACE "GPIO_IDIO_16" + #include #include #include @@ -14,8 +17,6 @@ #include "gpio-idio-16.h" -#define DEFAULT_SYMBOL_NAMESPACE "GPIO_IDIO_16" - #define IDIO_16_DAT_BASE 0x0 #define IDIO_16_OUT_BASE IDIO_16_DAT_BASE #define IDIO_16_IN_BASE (IDIO_16_DAT_BASE + 1) From b2e538a9827dd04ab5273bf4be8eb2edb84357b0 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 9 Dec 2024 10:56:12 +0100 Subject: [PATCH 112/330] ALSA: control: Avoid WARN() for symlink errors Using WARN() for showing the error of symlink creations don't give more information than telling that something goes wrong, since the usual code path is a lregister callback from each control element creation. More badly, the use of WARN() rather confuses fuzzer as if it were serious issues. This patch downgrades the warning messages to use the normal dev_err() instead of WARN(). For making it clearer, add the function name to the prefix, too. Fixes: a135dfb5de15 ("ALSA: led control - add sysfs kcontrol LED marking layer") Reported-by: syzbot+4e7919b09c67ffd198ae@syzkaller.appspotmail.com Closes: https://lore.kernel.org/675664c7.050a0220.a30f1.018c.GAE@google.com Link: https://patch.msgid.link/20241209095614.4273-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/control_led.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/sound/core/control_led.c b/sound/core/control_led.c index 65a1ebe877768..e33dfcf863cf1 100644 --- a/sound/core/control_led.c +++ b/sound/core/control_led.c @@ -668,10 +668,16 @@ static void snd_ctl_led_sysfs_add(struct snd_card *card) goto cerr; led->cards[card->number] = led_card; snprintf(link_name, sizeof(link_name), "led-%s", led->name); - WARN(sysfs_create_link(&card->ctl_dev->kobj, &led_card->dev.kobj, link_name), - "can't create symlink to controlC%i device\n", card->number); - WARN(sysfs_create_link(&led_card->dev.kobj, &card->card_dev.kobj, "card"), - "can't create symlink to card%i\n", card->number); + if (sysfs_create_link(&card->ctl_dev->kobj, &led_card->dev.kobj, + link_name)) + dev_err(card->dev, + "%s: can't create symlink to controlC%i device\n", + __func__, card->number); + if (sysfs_create_link(&led_card->dev.kobj, &card->card_dev.kobj, + "card")) + dev_err(card->dev, + "%s: can't create symlink to card%i\n", + __func__, card->number); continue; cerr: From a412f04070e52e6d6b5f6f964b9d9644de16bb81 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 2 Dec 2024 15:28:22 +0900 Subject: [PATCH 113/330] openrisc: place exception table at the head of vmlinux Since commit 0043ecea2399 ("vmlinux.lds.h: Adjust symbol ordering in text output section"), the exception table in arch/openrisc/kernel/head.S is no longer positioned at the very beginning of the kernel image, which causes a boot failure. Currently, the exception table resides in the regular .text section. Previously, it was placed at the head by relying on the linker receiving arch/openrisc/kernel/head.o as the first object. However, this behavior has changed because sections like .text.{asan,unknown,unlikely,hot} now precede the regular .text section. The .head.text section is intended for entry points requiring special placement. However, in OpenRISC, this section has been misused: instead of the entry points, it contains boot code meant to be discarded after booting. This feature is typically handled by the .init.text section. This commit addresses the issue by replacing the current __HEAD marker with __INIT and re-annotating the entry points with __HEAD. Additionally, it adds __REF to entry.S to suppress the following modpost warning: WARNING: modpost: vmlinux: section mismatch in reference: _tng_kernel_start+0x70 (section: .text) -> _start (section: .init.text) Fixes: 0043ecea2399 ("vmlinux.lds.h: Adjust symbol ordering in text output section") Reported-by: Guenter Roeck Closes: https://lore.kernel.org/all/5e032233-5b65-4ad5-ac50-d2eb6c00171c@roeck-us.net/#t Signed-off-by: Masahiro Yamada Tested-by: Guenter Roeck Reviewed-by: Rong Xu Signed-off-by: Stafford Horne --- arch/openrisc/kernel/entry.S | 2 ++ arch/openrisc/kernel/head.S | 6 ++++-- arch/openrisc/kernel/vmlinux.lds.S | 3 +-- scripts/head-object-list.txt | 1 - 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/openrisc/kernel/entry.S b/arch/openrisc/kernel/entry.S index 440711d7bf40e..ce6f2b08a35ed 100644 --- a/arch/openrisc/kernel/entry.S +++ b/arch/openrisc/kernel/entry.S @@ -239,6 +239,8 @@ handler: ;\ /* =====================================================[ exceptions] === */ + __REF + /* ---[ 0x100: RESET exception ]----------------------------------------- */ EXCEPTION_ENTRY(_tng_kernel_start) diff --git a/arch/openrisc/kernel/head.S b/arch/openrisc/kernel/head.S index 439e00f81e5dd..ec6d2a7d5b928 100644 --- a/arch/openrisc/kernel/head.S +++ b/arch/openrisc/kernel/head.S @@ -357,6 +357,8 @@ /* =====================================================[ exceptions] === */ + __HEAD + /* ---[ 0x100: RESET exception ]----------------------------------------- */ .org 0x100 /* Jump to .init code at _start which lives in the .head section @@ -506,10 +508,10 @@ _dispatch_do_ipage_fault: /* .text*/ -/* This early stuff belongs in HEAD, but some of the functions below definitely +/* This early stuff belongs in the .init.text section, but some of the functions below definitely * don't... */ - __HEAD + __INIT .global _start _start: /* Init r0 to zero as per spec */ diff --git a/arch/openrisc/kernel/vmlinux.lds.S b/arch/openrisc/kernel/vmlinux.lds.S index bc13060478373..049bff45f6126 100644 --- a/arch/openrisc/kernel/vmlinux.lds.S +++ b/arch/openrisc/kernel/vmlinux.lds.S @@ -50,6 +50,7 @@ SECTIONS .text : AT(ADDR(.text) - LOAD_OFFSET) { _stext = .; + HEAD_TEXT TEXT_TEXT SCHED_TEXT LOCK_TEXT @@ -83,8 +84,6 @@ SECTIONS . = ALIGN(PAGE_SIZE); __init_begin = .; - HEAD_TEXT_SECTION - /* Page aligned */ INIT_TEXT_SECTION(PAGE_SIZE) diff --git a/scripts/head-object-list.txt b/scripts/head-object-list.txt index f12b4a7b8406b..7274dfc65af60 100644 --- a/scripts/head-object-list.txt +++ b/scripts/head-object-list.txt @@ -24,7 +24,6 @@ arch/m68k/kernel/head.o arch/m68k/kernel/sun3-head.o arch/microblaze/kernel/head.o arch/nios2/kernel/head.o -arch/openrisc/kernel/head.o arch/parisc/kernel/head.o arch/powerpc/kernel/head_44x.o arch/powerpc/kernel/head_64.o From c8f8d4344d50d72181207ee73175bba567c25f58 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 5 Dec 2024 14:04:26 +0100 Subject: [PATCH 114/330] openrisc: Fix misalignments in head.S Align all line continuations and (sub)section headers in a consistent way. Signed-off-by: Geert Uytterhoeven Signed-off-by: Stafford Horne --- arch/openrisc/kernel/head.S | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/openrisc/kernel/head.S b/arch/openrisc/kernel/head.S index ec6d2a7d5b928..bd760066f1cdc 100644 --- a/arch/openrisc/kernel/head.S +++ b/arch/openrisc/kernel/head.S @@ -26,15 +26,15 @@ #include #include -#define tophys(rd,rs) \ - l.movhi rd,hi(-KERNELBASE) ;\ +#define tophys(rd,rs) \ + l.movhi rd,hi(-KERNELBASE) ;\ l.add rd,rd,rs -#define CLEAR_GPR(gpr) \ +#define CLEAR_GPR(gpr) \ l.movhi gpr,0x0 -#define LOAD_SYMBOL_2_GPR(gpr,symbol) \ - l.movhi gpr,hi(symbol) ;\ +#define LOAD_SYMBOL_2_GPR(gpr,symbol) \ + l.movhi gpr,hi(symbol) ;\ l.ori gpr,gpr,lo(symbol) @@ -326,21 +326,21 @@ l.addi r1,r1,-(INT_FRAME_SIZE) ;\ /* r1 is KSP, r30 is __pa(KSP) */ ;\ tophys (r30,r1) ;\ - l.sw PT_GPR12(r30),r12 ;\ + l.sw PT_GPR12(r30),r12 ;\ l.mfspr r12,r0,SPR_EPCR_BASE ;\ l.sw PT_PC(r30),r12 ;\ l.mfspr r12,r0,SPR_ESR_BASE ;\ l.sw PT_SR(r30),r12 ;\ /* save r31 */ ;\ EXCEPTION_T_LOAD_GPR30(r12) ;\ - l.sw PT_GPR30(r30),r12 ;\ + l.sw PT_GPR30(r30),r12 ;\ /* save r10 as was prior to exception */ ;\ EXCEPTION_T_LOAD_GPR10(r12) ;\ - l.sw PT_GPR10(r30),r12 ;\ - /* save PT_SP as was prior to exception */ ;\ + l.sw PT_GPR10(r30),r12 ;\ + /* save PT_SP as was prior to exception */ ;\ EXCEPTION_T_LOAD_SP(r12) ;\ l.sw PT_SP(r30),r12 ;\ - l.sw PT_GPR13(r30),r13 ;\ + l.sw PT_GPR13(r30),r13 ;\ /* --> */ ;\ /* save exception r4, set r4 = EA */ ;\ l.sw PT_GPR4(r30),r4 ;\ @@ -396,7 +396,7 @@ _dispatch_do_ipage_fault: .org 0x500 EXCEPTION_HANDLE(_timer_handler) -/* ---[ 0x600: Alignment exception ]-------------------------------------- */ +/* ---[ 0x600: Alignment exception ]------------------------------------- */ .org 0x600 EXCEPTION_HANDLE(_alignment_handler) @@ -426,7 +426,7 @@ _dispatch_do_ipage_fault: .org 0xc00 EXCEPTION_HANDLE(_sys_call_handler) -/* ---[ 0xd00: Floating point exception ]--------------------------------- */ +/* ---[ 0xd00: Floating point exception ]-------------------------------- */ .org 0xd00 EXCEPTION_HANDLE(_fpe_trap_handler) @@ -818,7 +818,7 @@ secondary_start: #endif -/* ========================================[ cache ]=== */ +/* ==========================================================[ cache ]=== */ /* alignment here so we don't change memory offsets with * memory controller defined From 984795e76def5c903724b8d6a8228e356bbdf2af Mon Sep 17 00:00:00 2001 From: Venkata Prasad Potturu Date: Tue, 10 Dec 2024 14:40:25 +0530 Subject: [PATCH 115/330] ASoC: amd: yc: Fix the wrong return value With the current implementation, when ACP driver fails to read ACPI _WOV entry then the DMI overrides code won't invoke, may cause regressions for some BIOS versions. Add a condition check to jump to check the DMI entries incase of ACP driver fail to read ACPI _WOV method. Fixes: 4095cf872084 (ASoC: amd: yc: Fix for enabling DMIC on acp6x via _DSD entry) Signed-off-by: Venkata Prasad Potturu Link: https://patch.msgid.link/20241210091026.996860-1-venkataprasad.potturu@amd.com Signed-off-by: Mark Brown --- sound/soc/amd/yc/acp6x-mach.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/sound/soc/amd/yc/acp6x-mach.c b/sound/soc/amd/yc/acp6x-mach.c index e38c5885dadfb..ecf57a6cb7c37 100644 --- a/sound/soc/amd/yc/acp6x-mach.c +++ b/sound/soc/amd/yc/acp6x-mach.c @@ -578,14 +578,19 @@ static int acp6x_probe(struct platform_device *pdev) handle = ACPI_HANDLE(pdev->dev.parent); ret = acpi_evaluate_integer(handle, "_WOV", NULL, &dmic_status); - if (!ACPI_FAILURE(ret)) + if (!ACPI_FAILURE(ret)) { wov_en = dmic_status; + if (!wov_en) + return -ENODEV; + } else { + /* Incase of ACPI method read failure then jump to check_dmi_entry */ + goto check_dmi_entry; + } - if (is_dmic_enable && wov_en) + if (is_dmic_enable) platform_set_drvdata(pdev, &acp6x_card); - else - return 0; +check_dmi_entry: /* check for any DMI overrides */ dmi_id = dmi_first_match(yc_acp_quirk_table); if (dmi_id) From 687630aa582acf674120c87350beb01d836c837c Mon Sep 17 00:00:00 2001 From: Stephen Gordon Date: Sat, 7 Dec 2024 23:22:56 +1100 Subject: [PATCH 116/330] ASoC: audio-graph-card: Call of_node_put() on correct node Signed-off-by: Stephen Gordon Acked-by: Kuninori Morimoto Link: https://patch.msgid.link/20241207122257.165096-1-gordoste@iinet.net.au Signed-off-by: Mark Brown --- sound/soc/generic/audio-graph-card2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/generic/audio-graph-card2.c b/sound/soc/generic/audio-graph-card2.c index 5280c1b20d85e..1f5c4e8ff1b90 100644 --- a/sound/soc/generic/audio-graph-card2.c +++ b/sound/soc/generic/audio-graph-card2.c @@ -771,7 +771,7 @@ static void graph_link_init(struct simple_util_priv *priv, of_node_get(port_codec); if (graph_lnk_is_multi(port_codec)) { ep_codec = graph_get_next_multi_ep(&port_codec); - of_node_put(port_cpu); + of_node_put(port_codec); port_codec = ep_to_port(ep_codec); } else { ep_codec = of_graph_get_next_port_endpoint(port_codec, NULL); From c1043cdb019ed4d053d673e62b553a5cea1a287d Mon Sep 17 00:00:00 2001 From: Kurt Borja Date: Sat, 7 Dec 2024 21:26:55 -0300 Subject: [PATCH 117/330] alienware-wmi: Fix X Series and G Series quirks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Devices that are known to support the WMI thermal interface do not support the legacy LED control interface. Make `.num_zones = 0` and avoid calling alienware_zone_init() if that's the case. Fixes: 9f6c43041552 ("alienware-wmi: added platform profile support") Fixes: 1c1eb70e7d23 ("alienware-wmi: extends the list of supported models") Suggested-by: Armin Wolf Reviewed-by: Armin Wolf Signed-off-by: Kurt Borja Link: https://lore.kernel.org/r/20241208002652.5885-4-kuurtb@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/dell/alienware-wmi.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/platform/x86/dell/alienware-wmi.c b/drivers/platform/x86/dell/alienware-wmi.c index 77465ed9b449b..e69bf9a7b6c84 100644 --- a/drivers/platform/x86/dell/alienware-wmi.c +++ b/drivers/platform/x86/dell/alienware-wmi.c @@ -190,7 +190,7 @@ static struct quirk_entry quirk_asm201 = { }; static struct quirk_entry quirk_g_series = { - .num_zones = 2, + .num_zones = 0, .hdmi_mux = 0, .amplifier = 0, .deepslp = 0, @@ -199,7 +199,7 @@ static struct quirk_entry quirk_g_series = { }; static struct quirk_entry quirk_x_series = { - .num_zones = 2, + .num_zones = 0, .hdmi_mux = 0, .amplifier = 0, .deepslp = 0, @@ -687,6 +687,9 @@ static void alienware_zone_exit(struct platform_device *dev) { u8 zone; + if (!quirks->num_zones) + return; + sysfs_remove_group(&dev->dev.kobj, &zone_attribute_group); led_classdev_unregister(&global_led); if (zone_dev_attrs) { @@ -1229,9 +1232,11 @@ static int __init alienware_wmi_init(void) goto fail_prep_thermal_profile; } - ret = alienware_zone_init(platform_device); - if (ret) - goto fail_prep_zones; + if (quirks->num_zones > 0) { + ret = alienware_zone_init(platform_device); + if (ret) + goto fail_prep_zones; + } return 0; From 54a8cada2f3d7efb4a7920807473d89c442d9c45 Mon Sep 17 00:00:00 2001 From: Kurt Borja Date: Sat, 7 Dec 2024 21:30:15 -0300 Subject: [PATCH 118/330] alienware-wmi: Adds support to Alienware m16 R1 AMD MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds support to Alienware m16 R1 AMD. Tested-by: Cihan Ozakca Signed-off-by: Kurt Borja Reviewed-by: Armin Wolf Link: https://lore.kernel.org/r/20241208003013.6490-3-kuurtb@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/dell/alienware-wmi.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/platform/x86/dell/alienware-wmi.c b/drivers/platform/x86/dell/alienware-wmi.c index e69bf9a7b6c84..341d01d3e3e4c 100644 --- a/drivers/platform/x86/dell/alienware-wmi.c +++ b/drivers/platform/x86/dell/alienware-wmi.c @@ -241,6 +241,15 @@ static const struct dmi_system_id alienware_quirks[] __initconst = { }, .driver_data = &quirk_asm201, }, + { + .callback = dmi_matched, + .ident = "Alienware m16 R1 AMD", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Alienware"), + DMI_MATCH(DMI_PRODUCT_NAME, "Alienware m16 R1 AMD"), + }, + .driver_data = &quirk_x_series, + }, { .callback = dmi_matched, .ident = "Alienware m17 R5", From 9244524d60ddea55f4df54c51200e8fef2032447 Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Thu, 28 Nov 2024 09:28:33 +0900 Subject: [PATCH 119/330] p2sb: Factor out p2sb_read_from_cache() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To prepare for the following fix, factor out the code to read the P2SB resource from the cache to the new function p2sb_read_from_cache(). Signed-off-by: Shin'ichiro Kawasaki Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/20241128002836.373745-2-shinichiro.kawasaki@wdc.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/p2sb.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/drivers/platform/x86/p2sb.c b/drivers/platform/x86/p2sb.c index d51eb0db06264..a685781d1272c 100644 --- a/drivers/platform/x86/p2sb.c +++ b/drivers/platform/x86/p2sb.c @@ -172,6 +172,22 @@ static int p2sb_cache_resources(void) return ret; } +static int p2sb_read_from_cache(struct pci_bus *bus, unsigned int devfn, + struct resource *mem) +{ + struct p2sb_res_cache *cache = &p2sb_resources[PCI_FUNC(devfn)]; + + if (cache->bus_dev_id != bus->dev.id) + return -ENODEV; + + if (!p2sb_valid_resource(&cache->res)) + return -ENOENT; + + memcpy(mem, &cache->res, sizeof(*mem)); + + return 0; +} + /** * p2sb_bar - Get Primary to Sideband (P2SB) bridge device BAR * @bus: PCI bus to communicate with @@ -188,8 +204,6 @@ static int p2sb_cache_resources(void) */ int p2sb_bar(struct pci_bus *bus, unsigned int devfn, struct resource *mem) { - struct p2sb_res_cache *cache; - bus = p2sb_get_bus(bus); if (!bus) return -ENODEV; @@ -197,15 +211,7 @@ int p2sb_bar(struct pci_bus *bus, unsigned int devfn, struct resource *mem) if (!devfn) p2sb_get_devfn(&devfn); - cache = &p2sb_resources[PCI_FUNC(devfn)]; - if (cache->bus_dev_id != bus->dev.id) - return -ENODEV; - - if (!p2sb_valid_resource(&cache->res)) - return -ENOENT; - - memcpy(mem, &cache->res, sizeof(*mem)); - return 0; + return p2sb_read_from_cache(bus, devfn, mem); } EXPORT_SYMBOL_GPL(p2sb_bar); From ae3e6ebc5ab046d434c05c58a3e3f7e94441fec2 Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Thu, 28 Nov 2024 09:28:34 +0900 Subject: [PATCH 120/330] p2sb: Introduce the global flag p2sb_hidden_by_bios MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To prepare for the following fix, introduce the global flag p2sb_hidden_by_bios. Check if the BIOS hides the P2SB device and store the result in the flag. This allows to refer to the check result across functions. Signed-off-by: Shin'ichiro Kawasaki Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/20241128002836.373745-3-shinichiro.kawasaki@wdc.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/p2sb.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/platform/x86/p2sb.c b/drivers/platform/x86/p2sb.c index a685781d1272c..630068e01f7e0 100644 --- a/drivers/platform/x86/p2sb.c +++ b/drivers/platform/x86/p2sb.c @@ -43,6 +43,7 @@ struct p2sb_res_cache { }; static struct p2sb_res_cache p2sb_resources[NR_P2SB_RES_CACHE]; +static bool p2sb_hidden_by_bios; static void p2sb_get_devfn(unsigned int *devfn) { @@ -158,13 +159,14 @@ static int p2sb_cache_resources(void) * Unhide the P2SB device here, if needed. */ pci_bus_read_config_dword(bus, devfn_p2sb, P2SBC, &value); - if (value & P2SBC_HIDE) + p2sb_hidden_by_bios = value & P2SBC_HIDE; + if (p2sb_hidden_by_bios) pci_bus_write_config_dword(bus, devfn_p2sb, P2SBC, 0); ret = p2sb_scan_and_cache(bus, devfn_p2sb); /* Hide the P2SB device, if it was hidden */ - if (value & P2SBC_HIDE) + if (p2sb_hidden_by_bios) pci_bus_write_config_dword(bus, devfn_p2sb, P2SBC, P2SBC_HIDE); pci_unlock_rescan_remove(); From 0286070c74ee48391fc07f7f617460479472d221 Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Thu, 28 Nov 2024 09:28:35 +0900 Subject: [PATCH 121/330] p2sb: Move P2SB hide and unhide code to p2sb_scan_and_cache() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To prepare for the following fix, move the code to hide and unhide the P2SB device from p2sb_cache_resources() to p2sb_scan_and_cache(). Signed-off-by: Shin'ichiro Kawasaki Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/20241128002836.373745-4-shinichiro.kawasaki@wdc.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/p2sb.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/drivers/platform/x86/p2sb.c b/drivers/platform/x86/p2sb.c index 630068e01f7e0..46c108bbcbbac 100644 --- a/drivers/platform/x86/p2sb.c +++ b/drivers/platform/x86/p2sb.c @@ -98,6 +98,14 @@ static void p2sb_scan_and_cache_devfn(struct pci_bus *bus, unsigned int devfn) static int p2sb_scan_and_cache(struct pci_bus *bus, unsigned int devfn) { + /* + * The BIOS prevents the P2SB device from being enumerated by the PCI + * subsystem, so we need to unhide and hide it back to lookup the BAR. + * Unhide the P2SB device here, if needed. + */ + if (p2sb_hidden_by_bios) + pci_bus_write_config_dword(bus, devfn, P2SBC, 0); + /* Scan the P2SB device and cache its BAR0 */ p2sb_scan_and_cache_devfn(bus, devfn); @@ -105,6 +113,10 @@ static int p2sb_scan_and_cache(struct pci_bus *bus, unsigned int devfn) if (devfn == P2SB_DEVFN_GOLDMONT) p2sb_scan_and_cache_devfn(bus, SPI_DEVFN_GOLDMONT); + /* Hide the P2SB device, if it was hidden */ + if (p2sb_hidden_by_bios) + pci_bus_write_config_dword(bus, devfn, P2SBC, P2SBC_HIDE); + if (!p2sb_valid_resource(&p2sb_resources[PCI_FUNC(devfn)].res)) return -ENOENT; @@ -153,22 +165,11 @@ static int p2sb_cache_resources(void) */ pci_lock_rescan_remove(); - /* - * The BIOS prevents the P2SB device from being enumerated by the PCI - * subsystem, so we need to unhide and hide it back to lookup the BAR. - * Unhide the P2SB device here, if needed. - */ pci_bus_read_config_dword(bus, devfn_p2sb, P2SBC, &value); p2sb_hidden_by_bios = value & P2SBC_HIDE; - if (p2sb_hidden_by_bios) - pci_bus_write_config_dword(bus, devfn_p2sb, P2SBC, 0); ret = p2sb_scan_and_cache(bus, devfn_p2sb); - /* Hide the P2SB device, if it was hidden */ - if (p2sb_hidden_by_bios) - pci_bus_write_config_dword(bus, devfn_p2sb, P2SBC, P2SBC_HIDE); - pci_unlock_rescan_remove(); return ret; From 360c400d0f568636c1b98d1d5f9f49aa3d420c70 Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Thu, 28 Nov 2024 09:28:36 +0900 Subject: [PATCH 122/330] p2sb: Do not scan and remove the P2SB device when it is unhidden MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When drivers access P2SB device resources, it calls p2sb_bar(). Before the commit 5913320eb0b3 ("platform/x86: p2sb: Allow p2sb_bar() calls during PCI device probe"), p2sb_bar() obtained the resources and then called pci_stop_and_remove_bus_device() for clean up. Then the P2SB device disappeared. The commit 5913320eb0b3 introduced the P2SB device resource cache feature in the boot process. During the resource cache, pci_stop_and_remove_bus_device() is called for the P2SB device, then the P2SB device disappears regardless of whether p2sb_bar() is called or not. Such P2SB device disappearance caused a confusion [1]. To avoid the confusion, avoid the pci_stop_and_remove_bus_device() call when the BIOS does not hide the P2SB device. For that purpose, cache the P2SB device resources only if the BIOS hides the P2SB device. Call p2sb_scan_and_cache() only if p2sb_hidden_by_bios is true. This allows removing two branches from p2sb_scan_and_cache(). When p2sb_bar() is called, get the resources from the cache if the P2SB device is hidden. Otherwise, read the resources from the unhidden P2SB device. Reported-by: Daniel Walker (danielwa) Closes: https://lore.kernel.org/lkml/ZzTI+biIUTvFT6NC@goliath/ [1] Fixes: 5913320eb0b3 ("platform/x86: p2sb: Allow p2sb_bar() calls during PCI device probe") Signed-off-by: Shin'ichiro Kawasaki Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/20241128002836.373745-5-shinichiro.kawasaki@wdc.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/p2sb.c | 42 +++++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/drivers/platform/x86/p2sb.c b/drivers/platform/x86/p2sb.c index 46c108bbcbbac..cbbb0f809704e 100644 --- a/drivers/platform/x86/p2sb.c +++ b/drivers/platform/x86/p2sb.c @@ -101,10 +101,8 @@ static int p2sb_scan_and_cache(struct pci_bus *bus, unsigned int devfn) /* * The BIOS prevents the P2SB device from being enumerated by the PCI * subsystem, so we need to unhide and hide it back to lookup the BAR. - * Unhide the P2SB device here, if needed. */ - if (p2sb_hidden_by_bios) - pci_bus_write_config_dword(bus, devfn, P2SBC, 0); + pci_bus_write_config_dword(bus, devfn, P2SBC, 0); /* Scan the P2SB device and cache its BAR0 */ p2sb_scan_and_cache_devfn(bus, devfn); @@ -113,9 +111,7 @@ static int p2sb_scan_and_cache(struct pci_bus *bus, unsigned int devfn) if (devfn == P2SB_DEVFN_GOLDMONT) p2sb_scan_and_cache_devfn(bus, SPI_DEVFN_GOLDMONT); - /* Hide the P2SB device, if it was hidden */ - if (p2sb_hidden_by_bios) - pci_bus_write_config_dword(bus, devfn, P2SBC, P2SBC_HIDE); + pci_bus_write_config_dword(bus, devfn, P2SBC, P2SBC_HIDE); if (!p2sb_valid_resource(&p2sb_resources[PCI_FUNC(devfn)].res)) return -ENOENT; @@ -142,7 +138,7 @@ static int p2sb_cache_resources(void) u32 value = P2SBC_HIDE; struct pci_bus *bus; u16 class; - int ret; + int ret = 0; /* Get devfn for P2SB device itself */ p2sb_get_devfn(&devfn_p2sb); @@ -168,7 +164,12 @@ static int p2sb_cache_resources(void) pci_bus_read_config_dword(bus, devfn_p2sb, P2SBC, &value); p2sb_hidden_by_bios = value & P2SBC_HIDE; - ret = p2sb_scan_and_cache(bus, devfn_p2sb); + /* + * If the BIOS does not hide the P2SB device then its resources + * are accesilble. Cache them only if the P2SB device is hidden. + */ + if (p2sb_hidden_by_bios) + ret = p2sb_scan_and_cache(bus, devfn_p2sb); pci_unlock_rescan_remove(); @@ -191,6 +192,26 @@ static int p2sb_read_from_cache(struct pci_bus *bus, unsigned int devfn, return 0; } +static int p2sb_read_from_dev(struct pci_bus *bus, unsigned int devfn, + struct resource *mem) +{ + struct pci_dev *pdev; + int ret = 0; + + pdev = pci_get_slot(bus, devfn); + if (!pdev) + return -ENODEV; + + if (p2sb_valid_resource(pci_resource_n(pdev, 0))) + p2sb_read_bar0(pdev, mem); + else + ret = -ENOENT; + + pci_dev_put(pdev); + + return ret; +} + /** * p2sb_bar - Get Primary to Sideband (P2SB) bridge device BAR * @bus: PCI bus to communicate with @@ -214,7 +235,10 @@ int p2sb_bar(struct pci_bus *bus, unsigned int devfn, struct resource *mem) if (!devfn) p2sb_get_devfn(&devfn); - return p2sb_read_from_cache(bus, devfn, mem); + if (p2sb_hidden_by_bios) + return p2sb_read_from_cache(bus, devfn, mem); + + return p2sb_read_from_dev(bus, devfn, mem); } EXPORT_SYMBOL_GPL(p2sb_bar); From 41856638e6c4ed51d8aa9e54f70059d1e357b46e Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 29 Nov 2024 17:39:27 +0100 Subject: [PATCH 123/330] s390/mm: Fix DirectMap accounting With uncoupling of physical and virtual address spaces population of the identity mapping was changed to use the type POPULATE_IDENTITY instead of POPULATE_DIRECT. This breaks DirectMap accounting: > cat /proc/meminfo DirectMap4k: 55296 kB DirectMap1M: 18446744073709496320 kB Adjust all locations of update_page_count() in vmem.c to use POPULATE_IDENTITY instead of POPULATE_DIRECT as well. With this accounting is correct again: > cat /proc/meminfo DirectMap4k: 54264 kB DirectMap1M: 8334336 kB Fixes: c98d2ecae08f ("s390/mm: Uncouple physical vs virtual address spaces") Cc: stable@vger.kernel.org Reviewed-by: Alexander Gordeev Signed-off-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/boot/vmem.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c index 145035f84a0e3..3fa28db2fe59f 100644 --- a/arch/s390/boot/vmem.c +++ b/arch/s390/boot/vmem.c @@ -306,7 +306,7 @@ static void pgtable_pte_populate(pmd_t *pmd, unsigned long addr, unsigned long e pages++; } } - if (mode == POPULATE_DIRECT) + if (mode == POPULATE_IDENTITY) update_page_count(PG_DIRECT_MAP_4K, pages); } @@ -339,7 +339,7 @@ static void pgtable_pmd_populate(pud_t *pud, unsigned long addr, unsigned long e } pgtable_pte_populate(pmd, addr, next, mode); } - if (mode == POPULATE_DIRECT) + if (mode == POPULATE_IDENTITY) update_page_count(PG_DIRECT_MAP_1M, pages); } @@ -372,7 +372,7 @@ static void pgtable_pud_populate(p4d_t *p4d, unsigned long addr, unsigned long e } pgtable_pmd_populate(pud, addr, next, mode); } - if (mode == POPULATE_DIRECT) + if (mode == POPULATE_IDENTITY) update_page_count(PG_DIRECT_MAP_2G, pages); } From 7363f2d4c18557c99c536b70489187bb4e05c412 Mon Sep 17 00:00:00 2001 From: Vladimir Riabchun Date: Sat, 7 Dec 2024 00:19:34 +0100 Subject: [PATCH 124/330] i2c: pnx: Fix timeout in wait functions Since commit f63b94be6942 ("i2c: pnx: Fix potential deadlock warning from del_timer_sync() call in isr") jiffies are stored in i2c_pnx_algo_data.timeout, but wait_timeout and wait_reset are still using it as milliseconds. Convert jiffies back to milliseconds to wait for the expected amount of time. Fixes: f63b94be6942 ("i2c: pnx: Fix potential deadlock warning from del_timer_sync() call in isr") Signed-off-by: Vladimir Riabchun Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-pnx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-pnx.c b/drivers/i2c/busses/i2c-pnx.c index d4d139b975131..9a1af5bbd604c 100644 --- a/drivers/i2c/busses/i2c-pnx.c +++ b/drivers/i2c/busses/i2c-pnx.c @@ -95,7 +95,7 @@ enum { static inline int wait_timeout(struct i2c_pnx_algo_data *data) { - long timeout = data->timeout; + long timeout = jiffies_to_msecs(data->timeout); while (timeout > 0 && (ioread32(I2C_REG_STS(data)) & mstatus_active)) { mdelay(1); @@ -106,7 +106,7 @@ static inline int wait_timeout(struct i2c_pnx_algo_data *data) static inline int wait_reset(struct i2c_pnx_algo_data *data) { - long timeout = data->timeout; + long timeout = jiffies_to_msecs(data->timeout); while (timeout > 0 && (ioread32(I2C_REG_CTL(data)) & mcntrl_reset)) { mdelay(1); From a56335c85b592cb2833db0a71f7112b7d9f0d56b Mon Sep 17 00:00:00 2001 From: Prathamesh Shete Date: Mon, 9 Dec 2024 15:40:09 +0530 Subject: [PATCH 125/330] mmc: sdhci-tegra: Remove SDHCI_QUIRK_BROKEN_ADMA_ZEROLEN_DESC quirk Value 0 in ADMA length descriptor is interpreted as 65536 on new Tegra chips, remove SDHCI_QUIRK_BROKEN_ADMA_ZEROLEN_DESC quirk to make sure max ADMA2 length is 65536. Fixes: 4346b7c7941d ("mmc: tegra: Add Tegra186 support") Cc: stable@vger.kernel.org Signed-off-by: Prathamesh Shete Acked-by: Thierry Reding Acked-by: Adrian Hunter Message-ID: <20241209101009.22710-1-pshete@nvidia.com> Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-tegra.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/mmc/host/sdhci-tegra.c b/drivers/mmc/host/sdhci-tegra.c index 4d402b6018836..b2f5c3f8b8396 100644 --- a/drivers/mmc/host/sdhci-tegra.c +++ b/drivers/mmc/host/sdhci-tegra.c @@ -1525,7 +1525,6 @@ static const struct sdhci_pltfm_data sdhci_tegra186_pdata = { .quirks = SDHCI_QUIRK_BROKEN_TIMEOUT_VAL | SDHCI_QUIRK_SINGLE_POWER_WRITE | SDHCI_QUIRK_NO_HISPD_BIT | - SDHCI_QUIRK_BROKEN_ADMA_ZEROLEN_DESC | SDHCI_QUIRK_CAP_CLOCK_BASE_BROKEN, .quirks2 = SDHCI_QUIRK2_PRESET_VALUE_BROKEN | SDHCI_QUIRK2_ISSUE_CMD_DAT_RESET_TOGETHER, From f3d87abe11ed04d1b23a474a212f0e5deeb50892 Mon Sep 17 00:00:00 2001 From: Joe Hattori Date: Tue, 3 Dec 2024 11:34:42 +0900 Subject: [PATCH 126/330] mmc: mtk-sd: disable wakeup in .remove() and in the error path of .probe() Current implementation leaves pdev->dev as a wakeup source. Add a device_init_wakeup(&pdev->dev, false) call in the .remove() function and in the error path of the .probe() function. Signed-off-by: Joe Hattori Fixes: 527f36f5efa4 ("mmc: mediatek: add support for SDIO eint wakup IRQ") Cc: stable@vger.kernel.org Message-ID: <20241203023442.2434018-1-joe@pf.is.s.u-tokyo.ac.jp> Signed-off-by: Ulf Hansson --- drivers/mmc/host/mtk-sd.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/mmc/host/mtk-sd.c b/drivers/mmc/host/mtk-sd.c index efb0d2d5716b9..af445d3f8e2ae 100644 --- a/drivers/mmc/host/mtk-sd.c +++ b/drivers/mmc/host/mtk-sd.c @@ -3070,6 +3070,7 @@ static int msdc_drv_probe(struct platform_device *pdev) msdc_gate_clock(host); platform_set_drvdata(pdev, NULL); release_mem: + device_init_wakeup(&pdev->dev, false); if (host->dma.gpd) dma_free_coherent(&pdev->dev, 2 * sizeof(struct mt_gpdma_desc), @@ -3103,6 +3104,7 @@ static void msdc_drv_remove(struct platform_device *pdev) host->dma.gpd, host->dma.gpd_addr); dma_free_coherent(&pdev->dev, MAX_BD_NUM * sizeof(struct mt_bdma_desc), host->dma.bd, host->dma.bd_addr); + device_init_wakeup(&pdev->dev, false); } static void msdc_save_reg(struct msdc_host *host) From 5751eee5c620c7e47a277ca929c4010caf24654c Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 6 Dec 2024 09:28:06 +0100 Subject: [PATCH 127/330] i2c: nomadik: Add missing sentinel to match table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The OF match table is not NULL-terminated. Fix this by adding a sentinel to nmk_i2c_eyeq_match_table[]. Fixes: a0d15cc47f29be6d ("i2c: nomadik: switch from of_device_is_compatible() to of_match_device()") Signed-off-by: Geert Uytterhoeven Reviewed-by: Théo Lebrun Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-nomadik.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/i2c/busses/i2c-nomadik.c b/drivers/i2c/busses/i2c-nomadik.c index efb33802804fd..d2877e4cc28d4 100644 --- a/drivers/i2c/busses/i2c-nomadik.c +++ b/drivers/i2c/busses/i2c-nomadik.c @@ -1075,6 +1075,7 @@ static const struct of_device_id nmk_i2c_eyeq_match_table[] = { .compatible = "mobileye,eyeq6h-i2c", .data = (void *)NMK_I2C_EYEQ_FLAG_32B_BUS, }, + { /* sentinel */ } }; static int nmk_i2c_probe(struct amba_device *adev, const struct amba_id *id) From a592bb19abdc2072875c87da606461bfd7821b08 Mon Sep 17 00:00:00 2001 From: Andrew Martin Date: Tue, 26 Nov 2024 12:10:59 -0500 Subject: [PATCH 128/330] drm/amdkfd: Dereference null return value In the function pqm_uninit there is a call-assignment of "pdd = kfd_get_process_device_data" which could be null, and this value was later dereferenced without checking. Fixes: fb91065851cd ("drm/amdkfd: Refactor queue wptr_bo GART mapping") Signed-off-by: Andrew Martin Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c index c76db22a10005..59b92d66e9589 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -212,13 +212,17 @@ static void pqm_clean_queue_resource(struct process_queue_manager *pqm, void pqm_uninit(struct process_queue_manager *pqm) { struct process_queue_node *pqn, *next; - struct kfd_process_device *pdd; list_for_each_entry_safe(pqn, next, &pqm->queues, process_queue_list) { if (pqn->q) { - pdd = kfd_get_process_device_data(pqn->q->device, pqm->process); - kfd_queue_unref_bo_vas(pdd, &pqn->q->properties); - kfd_queue_release_buffers(pdd, &pqn->q->properties); + struct kfd_process_device *pdd = kfd_get_process_device_data(pqn->q->device, + pqm->process); + if (pdd) { + kfd_queue_unref_bo_vas(pdd, &pqn->q->properties); + kfd_queue_release_buffers(pdd, &pqn->q->properties); + } else { + WARN_ON(!pdd); + } pqm_clean_queue_resource(pqm, pqn); } From 321048c4a3e375416b51b4093978f9ce2aa4d391 Mon Sep 17 00:00:00 2001 From: Harish Kasiviswanathan Date: Wed, 27 Nov 2024 14:01:35 -0500 Subject: [PATCH 129/330] drm/amdkfd: hard-code cacheline size for gfx11 This information is not available in ip discovery table. Signed-off-by: Harish Kasiviswanathan Reviewed-by: David Belanger Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c index 7b826a136ceb9..d73510c522868 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c @@ -1423,6 +1423,7 @@ int kfd_parse_crat_table(void *crat_image, struct list_head *device_list, static int kfd_fill_gpu_cache_info_from_gfx_config(struct kfd_dev *kdev, + bool cache_line_size_missing, struct kfd_gpu_cache_info *pcache_info) { struct amdgpu_device *adev = kdev->adev; @@ -1437,6 +1438,8 @@ static int kfd_fill_gpu_cache_info_from_gfx_config(struct kfd_dev *kdev, CRAT_CACHE_FLAGS_SIMD_CACHE); pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_tcp_per_wpg / 2; pcache_info[i].cache_line_size = adev->gfx.config.gc_tcp_cache_line_size; + if (cache_line_size_missing && !pcache_info[i].cache_line_size) + pcache_info[i].cache_line_size = 128; i++; } /* Scalar L1 Instruction Cache per SQC */ @@ -1449,6 +1452,8 @@ static int kfd_fill_gpu_cache_info_from_gfx_config(struct kfd_dev *kdev, CRAT_CACHE_FLAGS_SIMD_CACHE); pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_sqc_per_wgp * 2; pcache_info[i].cache_line_size = adev->gfx.config.gc_instruction_cache_line_size; + if (cache_line_size_missing && !pcache_info[i].cache_line_size) + pcache_info[i].cache_line_size = 128; i++; } /* Scalar L1 Data Cache per SQC */ @@ -1460,6 +1465,8 @@ static int kfd_fill_gpu_cache_info_from_gfx_config(struct kfd_dev *kdev, CRAT_CACHE_FLAGS_SIMD_CACHE); pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_sqc_per_wgp * 2; pcache_info[i].cache_line_size = adev->gfx.config.gc_scalar_data_cache_line_size; + if (cache_line_size_missing && !pcache_info[i].cache_line_size) + pcache_info[i].cache_line_size = 64; i++; } /* GL1 Data Cache per SA */ @@ -1472,7 +1479,8 @@ static int kfd_fill_gpu_cache_info_from_gfx_config(struct kfd_dev *kdev, CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE); pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh; - pcache_info[i].cache_line_size = 0; + if (cache_line_size_missing) + pcache_info[i].cache_line_size = 128; i++; } /* L2 Data Cache per GPU (Total Tex Cache) */ @@ -1484,6 +1492,8 @@ static int kfd_fill_gpu_cache_info_from_gfx_config(struct kfd_dev *kdev, CRAT_CACHE_FLAGS_SIMD_CACHE); pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh; pcache_info[i].cache_line_size = adev->gfx.config.gc_tcc_cache_line_size; + if (cache_line_size_missing && !pcache_info[i].cache_line_size) + pcache_info[i].cache_line_size = 128; i++; } /* L3 Data Cache per GPU */ @@ -1569,6 +1579,7 @@ static int kfd_fill_gpu_cache_info_from_gfx_config_v2(struct kfd_dev *kdev, int kfd_get_gpu_cache_info(struct kfd_node *kdev, struct kfd_gpu_cache_info **pcache_info) { int num_of_cache_types = 0; + bool cache_line_size_missing = false; switch (kdev->adev->asic_type) { case CHIP_KAVERI: @@ -1692,10 +1703,17 @@ int kfd_get_gpu_cache_info(struct kfd_node *kdev, struct kfd_gpu_cache_info **pc case IP_VERSION(11, 5, 0): case IP_VERSION(11, 5, 1): case IP_VERSION(11, 5, 2): + /* Cacheline size not available in IP discovery for gc11. + * kfd_fill_gpu_cache_info_from_gfx_config to hard code it + */ + cache_line_size_missing = true; + fallthrough; case IP_VERSION(12, 0, 0): case IP_VERSION(12, 0, 1): num_of_cache_types = - kfd_fill_gpu_cache_info_from_gfx_config(kdev->kfd, *pcache_info); + kfd_fill_gpu_cache_info_from_gfx_config(kdev->kfd, + cache_line_size_missing, + *pcache_info); break; default: *pcache_info = dummy_cache_info; From d50bf3f0fab636574c163ba8b5863e12b1ed19bd Mon Sep 17 00:00:00 2001 From: Harish Kasiviswanathan Date: Thu, 28 Nov 2024 11:07:57 -0500 Subject: [PATCH 130/330] drm/amdkfd: hard-code MALL cacheline size for gfx11, gfx12 This information is not available in ip discovery table. Signed-off-by: Harish Kasiviswanathan Reviewed-by: David Belanger Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c index d73510c522868..e5324c5bc6c71 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c @@ -1504,7 +1504,7 @@ static int kfd_fill_gpu_cache_info_from_gfx_config(struct kfd_dev *kdev, CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE); pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh; - pcache_info[i].cache_line_size = 0; + pcache_info[i].cache_line_size = 64; i++; } return i; From ee2003d5fd139f5c881b87615c216c0053b69093 Mon Sep 17 00:00:00 2001 From: Pratap Nirujogi Date: Thu, 5 Dec 2024 11:27:36 -0500 Subject: [PATCH 131/330] drm/amdgpu: Fix ISP HW init issue ISP hw_init is not called with the recent changes related to hw init levels. AMDGPU_INIT_LEVEL_DEFAULT is ignoring the ISP IP block as AMDGPU_IP_BLK_MASK_ALL is derived using incorrect max number of IP blocks. Update AMDGPU_IP_BLK_MASK_ALL to use AMD_IP_BLOCK_TYPE_NUM instead of AMDGPU_MAX_IP_NUM to fix the issue. Fixes: 14f2fe34f5c6 ("drm/amdgpu: Add init levels") Reviewed-by: Lijo Lazar Signed-off-by: Pratap Nirujogi Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 23ad65e336d51..d272d95dd5b2f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -145,7 +145,7 @@ const char *amdgpu_asic_name[] = { "LAST", }; -#define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMDGPU_MAX_IP_NUM - 1, 0) +#define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0) /* * Default init level where all blocks are expected to be initialized. This is * the level of initialization expected by default and also after a full reset From f4df208177d02f1c90f3644da3a2453080b8c24f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Fri, 6 Dec 2024 14:46:06 +0100 Subject: [PATCH 132/330] drm/amdgpu: fix when the cleaner shader is emitted MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Emitting the cleaner shader must come after the check if a VM switch is necessary or not. Otherwise we will emit the cleaner shader every time and not just when it is necessary because we switched between applications. This can otherwise crash on gang submit and probably decreases performance quite a bit. v2: squash in fix from Srini (Alex) Signed-off-by: Christian König Fixes: ee7a846ea27b ("drm/amdgpu: Emit cleaner shader at end of IB submission") Acked-by: Alex Deucher Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 8d9bf7a0857fd..ddd7f05e4db9f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -674,12 +674,8 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, pasid_mapping_needed &= adev->gmc.gmc_funcs->emit_pasid_mapping && ring->funcs->emit_wreg; - if (adev->gfx.enable_cleaner_shader && - ring->funcs->emit_cleaner_shader && - job->enforce_isolation) - ring->funcs->emit_cleaner_shader(ring); - - if (!vm_flush_needed && !gds_switch_needed && !need_pipe_sync) + if (!vm_flush_needed && !gds_switch_needed && !need_pipe_sync && + !(job->enforce_isolation && !job->vmid)) return 0; amdgpu_ring_ib_begin(ring); @@ -690,6 +686,11 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, if (need_pipe_sync) amdgpu_ring_emit_pipeline_sync(ring); + if (adev->gfx.enable_cleaner_shader && + ring->funcs->emit_cleaner_shader && + job->enforce_isolation) + ring->funcs->emit_cleaner_shader(ring); + if (vm_flush_needed) { trace_amdgpu_vm_flush(ring, job->vmid, job->vm_pd_addr); amdgpu_ring_emit_vm_flush(ring, job->vmid, job->vm_pd_addr); From 438b39ac74e2a9dc0a5c9d653b7d8066877e86b1 Mon Sep 17 00:00:00 2001 From: "Jesse.zhang@amd.com" Date: Thu, 5 Dec 2024 17:41:26 +0800 Subject: [PATCH 133/330] drm/amdkfd: pause autosuspend when creating pdd When using MES creating a pdd will require talking to the GPU to setup the relevant context. The code here forgot to wake up the GPU in case it was in suspend, this causes KVM to EFAULT for passthrough GPU for example. This issue can be masked if the GPU was woken up by other things (e.g. opening the KMS node) first and have not yet gone to sleep. v4: do the allocation of proc_ctx_bo in a lazy fashion when the first queue is created in a process (Felix) Signed-off-by: Jesse Zhang Reviewed-by: Yunxiang Li Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- .../drm/amd/amdkfd/kfd_device_queue_manager.c | 15 ++++++++++++ drivers/gpu/drm/amd/amdkfd/kfd_process.c | 23 ++----------------- 2 files changed, 17 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index c79fe9069e220..16b5daaa272f1 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -207,6 +207,21 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q, if (!down_read_trylock(&adev->reset_domain->sem)) return -EIO; + if (!pdd->proc_ctx_cpu_ptr) { + r = amdgpu_amdkfd_alloc_gtt_mem(adev, + AMDGPU_MES_PROC_CTX_SIZE, + &pdd->proc_ctx_bo, + &pdd->proc_ctx_gpu_addr, + &pdd->proc_ctx_cpu_ptr, + false); + if (r) { + dev_err(adev->dev, + "failed to allocate process context bo\n"); + return r; + } + memset(pdd->proc_ctx_cpu_ptr, 0, AMDGPU_MES_PROC_CTX_SIZE); + } + memset(&queue_input, 0x0, sizeof(struct mes_add_queue_input)); queue_input.process_id = qpd->pqm->process->pasid; queue_input.page_table_base_addr = qpd->page_table_base; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 87cd52cf4ee99..d0ee173acf824 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -1076,7 +1076,8 @@ static void kfd_process_destroy_pdds(struct kfd_process *p) kfd_free_process_doorbells(pdd->dev->kfd, pdd); - if (pdd->dev->kfd->shared_resources.enable_mes) + if (pdd->dev->kfd->shared_resources.enable_mes && + pdd->proc_ctx_cpu_ptr) amdgpu_amdkfd_free_gtt_mem(pdd->dev->adev, &pdd->proc_ctx_bo); /* @@ -1608,7 +1609,6 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_node *dev, struct kfd_process *p) { struct kfd_process_device *pdd = NULL; - int retval = 0; if (WARN_ON_ONCE(p->n_pdds >= MAX_GPU_INSTANCE)) return NULL; @@ -1632,21 +1632,6 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_node *dev, pdd->user_gpu_id = dev->id; atomic64_set(&pdd->evict_duration_counter, 0); - if (dev->kfd->shared_resources.enable_mes) { - retval = amdgpu_amdkfd_alloc_gtt_mem(dev->adev, - AMDGPU_MES_PROC_CTX_SIZE, - &pdd->proc_ctx_bo, - &pdd->proc_ctx_gpu_addr, - &pdd->proc_ctx_cpu_ptr, - false); - if (retval) { - dev_err(dev->adev->dev, - "failed to allocate process context bo\n"); - goto err_free_pdd; - } - memset(pdd->proc_ctx_cpu_ptr, 0, AMDGPU_MES_PROC_CTX_SIZE); - } - p->pdds[p->n_pdds++] = pdd; if (kfd_dbg_is_per_vmid_supported(pdd->dev)) pdd->spi_dbg_override = pdd->dev->kfd2kgd->disable_debug_trap( @@ -1658,10 +1643,6 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_node *dev, idr_init(&pdd->alloc_idr); return pdd; - -err_free_pdd: - kfree(pdd); - return NULL; } /** From cae005670887cb07ceafc25bb32e221e56286488 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 9 Dec 2024 21:23:54 +0900 Subject: [PATCH 134/330] block: Use a zone write plug BIO work for REQ_NOWAIT BIOs For zoned block devices, a write BIO issued to a zone that has no on-going writes will be prepared for execution and allowed to execute immediately by blk_zone_wplug_handle_write() (called from blk_zone_plug_bio()). However, if this BIO specifies REQ_NOWAIT, the allocation of a request for its execution in blk_mq_submit_bio() may fail after blk_zone_plug_bio() completed, marking the target zone of the BIO as plugged. When this BIO is retried later on, it will be blocked as the zone write plug of the target zone is in a plugged state without any on-going write operation (completion of write operations trigger unplugging of the next write BIOs for a zone). This leads to a BIO that is stuck in a zone write plug and never completes, which results in various issues such as hung tasks. Avoid this problem by always executing REQ_NOWAIT write BIOs using the BIO work of a zone write plug. This ensure that we never block the BIO issuer and can thus safely ignore the REQ_NOWAIT flag when executing the BIO from the zone write plug BIO work. Since such BIO may be the first write BIO issued to a zone with no on-going write, modify disk_zone_wplug_add_bio() to schedule the zone write plug BIO work if the write plug is not already marked with the BLK_ZONE_WPLUG_PLUGGED flag. This scheduling is otherwise not necessary as the completion of the on-going write for the zone will schedule the execution of the next plugged BIOs. blk_zone_wplug_handle_write() is also fixed to better handle zone write plug allocation failures for REQ_NOWAIT BIOs by failing a write BIO using bio_wouldblock_error() instead of bio_io_error(). Reported-by: Bart Van Assche Fixes: dd291d77cc90 ("block: Introduce zone write plugging") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20241209122357.47838-2-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 62 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 42 insertions(+), 20 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 263e28b720538..7982b9494d633 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -746,9 +746,25 @@ static bool blk_zone_wplug_handle_reset_all(struct bio *bio) return false; } -static inline void blk_zone_wplug_add_bio(struct blk_zone_wplug *zwplug, - struct bio *bio, unsigned int nr_segs) +static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk, + struct blk_zone_wplug *zwplug) +{ + /* + * Take a reference on the zone write plug and schedule the submission + * of the next plugged BIO. blk_zone_wplug_bio_work() will release the + * reference we take here. + */ + WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)); + refcount_inc(&zwplug->ref); + queue_work(disk->zone_wplugs_wq, &zwplug->bio_work); +} + +static inline void disk_zone_wplug_add_bio(struct gendisk *disk, + struct blk_zone_wplug *zwplug, + struct bio *bio, unsigned int nr_segs) { + bool schedule_bio_work = false; + /* * Grab an extra reference on the BIO request queue usage counter. * This reference will be reused to submit a request for the BIO for @@ -764,6 +780,16 @@ static inline void blk_zone_wplug_add_bio(struct blk_zone_wplug *zwplug, */ bio_clear_polled(bio); + /* + * REQ_NOWAIT BIOs are always handled using the zone write plug BIO + * work, which can block. So clear the REQ_NOWAIT flag and schedule the + * work if this is the first BIO we are plugging. + */ + if (bio->bi_opf & REQ_NOWAIT) { + schedule_bio_work = !(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED); + bio->bi_opf &= ~REQ_NOWAIT; + } + /* * Reuse the poll cookie field to store the number of segments when * split to the hardware limits. @@ -777,6 +803,11 @@ static inline void blk_zone_wplug_add_bio(struct blk_zone_wplug *zwplug, * at the tail of the list to preserve the sequential write order. */ bio_list_add(&zwplug->bio_list, bio); + + zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED; + + if (schedule_bio_work) + disk_zone_wplug_schedule_bio_work(disk, zwplug); } /* @@ -970,7 +1001,10 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs) zwplug = disk_get_and_lock_zone_wplug(disk, sector, gfp_mask, &flags); if (!zwplug) { - bio_io_error(bio); + if (bio->bi_opf & REQ_NOWAIT) + bio_wouldblock_error(bio); + else + bio_io_error(bio); return true; } @@ -979,9 +1013,11 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs) /* * If the zone is already plugged or has a pending error, add the BIO - * to the plug BIO list. Otherwise, plug and let the BIO execute. + * to the plug BIO list. Do the same for REQ_NOWAIT BIOs to ensure that + * we will not see a BLK_STS_AGAIN failure if we let the BIO execute. + * Otherwise, plug and let the BIO execute. */ - if (zwplug->flags & BLK_ZONE_WPLUG_BUSY) + if (zwplug->flags & BLK_ZONE_WPLUG_BUSY || (bio->bi_opf & REQ_NOWAIT)) goto plug; /* @@ -998,8 +1034,7 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs) return false; plug: - zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED; - blk_zone_wplug_add_bio(zwplug, bio, nr_segs); + disk_zone_wplug_add_bio(disk, zwplug, bio, nr_segs); spin_unlock_irqrestore(&zwplug->lock, flags); @@ -1083,19 +1118,6 @@ bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs) } EXPORT_SYMBOL_GPL(blk_zone_plug_bio); -static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk, - struct blk_zone_wplug *zwplug) -{ - /* - * Take a reference on the zone write plug and schedule the submission - * of the next plugged BIO. blk_zone_wplug_bio_work() will release the - * reference we take here. - */ - WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)); - refcount_inc(&zwplug->ref); - queue_work(disk->zone_wplugs_wq, &zwplug->bio_work); -} - static void disk_zone_wplug_unplug_bio(struct gendisk *disk, struct blk_zone_wplug *zwplug) { From 5eb3317aa5a2ffe4574ab1a12cf9bc9447ca26c0 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 9 Dec 2024 21:23:55 +0900 Subject: [PATCH 135/330] block: Ignore REQ_NOWAIT for zone reset and zone finish operations There are currently any issuer of REQ_OP_ZONE_RESET and REQ_OP_ZONE_FINISH operations that set REQ_NOWAIT. However, as we cannot handle this flag correctly due to the potential request allocation failure that may happen in blk_mq_submit_bio() after blk_zone_plug_bio() has handled the zone write plug write pointer updates for the targeted zones, modify blk_zone_wplug_handle_reset_or_finish() to warn if this flag is set and ignore it. Fixes: dd291d77cc90 ("block: Introduce zone write plugging") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20241209122357.47838-3-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 7982b9494d633..ee9c67121c6c8 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -707,6 +707,15 @@ static bool blk_zone_wplug_handle_reset_or_finish(struct bio *bio, return true; } + /* + * No-wait reset or finish BIOs do not make much sense as the callers + * issue these as blocking operations in most cases. To avoid issues + * the BIO execution potentially failing with BLK_STS_AGAIN, warn about + * REQ_NOWAIT being set and ignore that flag. + */ + if (WARN_ON_ONCE(bio->bi_opf & REQ_NOWAIT)) + bio->bi_opf &= ~REQ_NOWAIT; + /* * If we have a zone write plug, set its write pointer offset to 0 * (reset case) or to the zone size (finish case). This will abort all From b76b840fd93374240b59825f1ab8e2f5c9907acb Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 9 Dec 2024 21:23:56 +0900 Subject: [PATCH 136/330] dm: Fix dm-zoned-reclaim zone write pointer alignment The zone reclaim processing of the dm-zoned device mapper uses blkdev_issue_zeroout() to align the write pointer of a zone being used for reclaiming another zone, to write the valid data blocks from the zone being reclaimed at the same position relative to the zone start in the reclaim target zone. The first call to blkdev_issue_zeroout() will try to use hardware offload using a REQ_OP_WRITE_ZEROES operation if the device reports a non-zero max_write_zeroes_sectors queue limit. If this operation fails because of the lack of hardware support, blkdev_issue_zeroout() falls back to using a regular write operation with the zero-page as buffer. Currently, such REQ_OP_WRITE_ZEROES failure is automatically handled by the block layer zone write plugging code which will execute a report zones operation to ensure that the write pointer of the target zone of the failed operation has not changed and to "rewind" the zone write pointer offset of the target zone as it was advanced when the write zero operation was submitted. So the REQ_OP_WRITE_ZEROES failure does not cause any issue and blkdev_issue_zeroout() works as expected. However, since the automatic recovery of zone write pointers by the zone write plugging code can potentially cause deadlocks with queue freeze operations, a different recovery must be implemented in preparation for the removal of zone write plugging report zones based recovery. Do this by introducing the new function blk_zone_issue_zeroout(). This function first calls blkdev_issue_zeroout() with the flag BLKDEV_ZERO_NOFALLBACK to intercept failures on the first execution which attempt to use the device hardware offload with the REQ_OP_WRITE_ZEROES operation. If this attempt fails, a report zone operation is issued to restore the zone write pointer offset of the target zone to the correct position and blkdev_issue_zeroout() is called again without the BLKDEV_ZERO_NOFALLBACK flag. The report zones operation performing this recovery is implemented using the helper function disk_zone_sync_wp_offset() which calls the gendisk report_zones file operation with the callback disk_report_zones_cb(). This callback updates the target write pointer offset of the target zone using the new function disk_zone_wplug_sync_wp_offset(). dmz_reclaim_align_wp() is modified to change its call to blkdev_issue_zeroout() to a call to blk_zone_issue_zeroout() without any other change needed as the two functions are functionnally equivalent. Fixes: dd291d77cc90 ("block: Introduce zone write plugging") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Acked-by: Mike Snitzer Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20241209122357.47838-4-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 141 ++++++++++++++++++++++++++++------ drivers/md/dm-zoned-reclaim.c | 6 +- include/linux/blkdev.h | 3 + 3 files changed, 124 insertions(+), 26 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index ee9c67121c6c8..781caca0381e0 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -115,6 +115,30 @@ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond) } EXPORT_SYMBOL_GPL(blk_zone_cond_str); +struct disk_report_zones_cb_args { + struct gendisk *disk; + report_zones_cb user_cb; + void *user_data; +}; + +static void disk_zone_wplug_sync_wp_offset(struct gendisk *disk, + struct blk_zone *zone); + +static int disk_report_zones_cb(struct blk_zone *zone, unsigned int idx, + void *data) +{ + struct disk_report_zones_cb_args *args = data; + struct gendisk *disk = args->disk; + + if (disk->zone_wplugs_hash) + disk_zone_wplug_sync_wp_offset(disk, zone); + + if (!args->user_cb) + return 0; + + return args->user_cb(zone, idx, args->user_data); +} + /** * blkdev_report_zones - Get zones information * @bdev: Target block device @@ -694,6 +718,58 @@ static void disk_zone_wplug_set_wp_offset(struct gendisk *disk, spin_unlock_irqrestore(&zwplug->lock, flags); } +static unsigned int blk_zone_wp_offset(struct blk_zone *zone) +{ + switch (zone->cond) { + case BLK_ZONE_COND_IMP_OPEN: + case BLK_ZONE_COND_EXP_OPEN: + case BLK_ZONE_COND_CLOSED: + return zone->wp - zone->start; + case BLK_ZONE_COND_FULL: + return zone->len; + case BLK_ZONE_COND_EMPTY: + return 0; + case BLK_ZONE_COND_NOT_WP: + case BLK_ZONE_COND_OFFLINE: + case BLK_ZONE_COND_READONLY: + default: + /* + * Conventional, offline and read-only zones do not have a valid + * write pointer. + */ + return UINT_MAX; + } +} + +static void disk_zone_wplug_sync_wp_offset(struct gendisk *disk, + struct blk_zone *zone) +{ + struct blk_zone_wplug *zwplug; + unsigned long flags; + + zwplug = disk_get_zone_wplug(disk, zone->start); + if (!zwplug) + return; + + spin_lock_irqsave(&zwplug->lock, flags); + if (zwplug->flags & BLK_ZONE_WPLUG_ERROR) + disk_zone_wplug_set_wp_offset(disk, zwplug, + blk_zone_wp_offset(zone)); + spin_unlock_irqrestore(&zwplug->lock, flags); + + disk_put_zone_wplug(zwplug); +} + +static int disk_zone_sync_wp_offset(struct gendisk *disk, sector_t sector) +{ + struct disk_report_zones_cb_args args = { + .disk = disk, + }; + + return disk->fops->report_zones(disk, sector, 1, + disk_report_zones_cb, &args); +} + static bool blk_zone_wplug_handle_reset_or_finish(struct bio *bio, unsigned int wp_offset) { @@ -1280,29 +1356,6 @@ static void blk_zone_wplug_bio_work(struct work_struct *work) disk_put_zone_wplug(zwplug); } -static unsigned int blk_zone_wp_offset(struct blk_zone *zone) -{ - switch (zone->cond) { - case BLK_ZONE_COND_IMP_OPEN: - case BLK_ZONE_COND_EXP_OPEN: - case BLK_ZONE_COND_CLOSED: - return zone->wp - zone->start; - case BLK_ZONE_COND_FULL: - return zone->len; - case BLK_ZONE_COND_EMPTY: - return 0; - case BLK_ZONE_COND_NOT_WP: - case BLK_ZONE_COND_OFFLINE: - case BLK_ZONE_COND_READONLY: - default: - /* - * Conventional, offline and read-only zones do not have a valid - * write pointer. - */ - return UINT_MAX; - } -} - static int blk_zone_wplug_report_zone_cb(struct blk_zone *zone, unsigned int idx, void *data) { @@ -1866,6 +1919,48 @@ int blk_revalidate_disk_zones(struct gendisk *disk) } EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones); +/** + * blk_zone_issue_zeroout - zero-fill a block range in a zone + * @bdev: blockdev to write + * @sector: start sector + * @nr_sects: number of sectors to write + * @gfp_mask: memory allocation flags (for bio_alloc) + * + * Description: + * Zero-fill a block range in a zone (@sector must be equal to the zone write + * pointer), handling potential errors due to the (initially unknown) lack of + * hardware offload (See blkdev_issue_zeroout()). + */ +int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask) +{ + int ret; + + if (WARN_ON_ONCE(!bdev_is_zoned(bdev))) + return -EIO; + + ret = blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, + BLKDEV_ZERO_NOFALLBACK); + if (ret != -EOPNOTSUPP) + return ret; + + /* + * The failed call to blkdev_issue_zeroout() advanced the zone write + * pointer. Undo this using a report zone to update the zone write + * pointer to the correct current value. + */ + ret = disk_zone_sync_wp_offset(bdev->bd_disk, sector); + if (ret != 1) + return ret < 0 ? ret : -EIO; + + /* + * Retry without BLKDEV_ZERO_NOFALLBACK to force the fallback to a + * regular write with zero-pages. + */ + return blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, 0); +} +EXPORT_SYMBOL_GPL(blk_zone_issue_zeroout); + #ifdef CONFIG_BLK_DEBUG_FS int queue_zone_wplugs_show(void *data, struct seq_file *m) diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index d58db9a27e6cf..76e2c68685487 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -76,9 +76,9 @@ static int dmz_reclaim_align_wp(struct dmz_reclaim *zrc, struct dm_zone *zone, * pointer and the requested position. */ nr_blocks = block - wp_block; - ret = blkdev_issue_zeroout(dev->bdev, - dmz_start_sect(zmd, zone) + dmz_blk2sect(wp_block), - dmz_blk2sect(nr_blocks), GFP_NOIO, 0); + ret = blk_zone_issue_zeroout(dev->bdev, + dmz_start_sect(zmd, zone) + dmz_blk2sect(wp_block), + dmz_blk2sect(nr_blocks), GFP_NOIO); if (ret) { dmz_dev_err(dev, "Align zone %u wp %llu to %llu (wp+%u) blocks failed %d", diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 08a727b408164..4dd698dad2d65 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1421,6 +1421,9 @@ static inline bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector) return is_seq; } +int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask); + static inline unsigned int queue_dma_alignment(const struct request_queue *q) { return q->limits.dma_alignment; From fe0418eb9bd69a19a948b297c8de815e05f3cde1 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 9 Dec 2024 21:23:57 +0900 Subject: [PATCH 137/330] block: Prevent potential deadlocks in zone write plug error recovery Zone write plugging for handling writes to zones of a zoned block device always execute a zone report whenever a write BIO to a zone fails. The intent of this is to ensure that the tracking of a zone write pointer is always correct to ensure that the alignment to a zone write pointer of write BIOs can be checked on submission and that we can always correctly emulate zone append operations using regular write BIOs. However, this error recovery scheme introduces a potential deadlock if a device queue freeze is initiated while BIOs are still plugged in a zone write plug and one of these write operation fails. In such case, the disk zone write plug error recovery work is scheduled and executes a report zone. This in turn can result in a request allocation in the underlying driver to issue the report zones command to the device. But with the device queue freeze already started, this allocation will block, preventing the report zone execution and the continuation of the processing of the plugged BIOs. As plugged BIOs hold a queue usage reference, the queue freeze itself will never complete, resulting in a deadlock. Avoid this problem by completely removing from the zone write plugging code the use of report zones operations after a failed write operation, instead relying on the device user to either execute a report zones, reset the zone, finish the zone, or give up writing to the device (which is a fairly common pattern for file systems which degrade to read-only after write failures). This is not an unreasonnable requirement as all well-behaved applications, FSes and device mapper already use report zones to recover from write errors whenever possible by comparing the current position of a zone write pointer with what their assumption about the position is. The changes to remove the automatic error recovery are as follows: - Completely remove the error recovery work and its associated resources (zone write plug list head, disk error list, and disk zone_wplugs_work work struct). This also removes the functions disk_zone_wplug_set_error() and disk_zone_wplug_clear_error(). - Change the BLK_ZONE_WPLUG_ERROR zone write plug flag into BLK_ZONE_WPLUG_NEED_WP_UPDATE. This new flag is set for a zone write plug whenever a write opration targetting the zone of the zone write plug fails. This flag indicates that the zone write pointer offset is not reliable and that it must be updated when the next report zone, reset zone, finish zone or disk revalidation is executed. - Modify blk_zone_write_plug_bio_endio() to set the BLK_ZONE_WPLUG_NEED_WP_UPDATE flag for the target zone of a failed write BIO. - Modify the function disk_zone_wplug_set_wp_offset() to clear this new flag, thus implementing recovery of a correct write pointer offset with the reset (all) zone and finish zone operations. - Modify blkdev_report_zones() to always use the disk_report_zones_cb() callback so that disk_zone_wplug_sync_wp_offset() can be called for any zone marked with the BLK_ZONE_WPLUG_NEED_WP_UPDATE flag. This implements recovery of a correct write pointer offset for zone write plugs marked with BLK_ZONE_WPLUG_NEED_WP_UPDATE and within the range of the report zones operation executed by the user. - Modify blk_revalidate_seq_zone() to call disk_zone_wplug_sync_wp_offset() for all sequential write required zones when a zoned block device is revalidated, thus always resolving any inconsistency between the write pointer offset of zone write plugs and the actual write pointer position of sequential zones. Fixes: dd291d77cc90 ("block: Introduce zone write plugging") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20241209122357.47838-5-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 308 ++++++++--------------------------------- include/linux/blkdev.h | 2 - 2 files changed, 61 insertions(+), 249 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 781caca0381e0..77f12fc2086fa 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -41,7 +41,6 @@ static const char *const zone_cond_name[] = { /* * Per-zone write plug. * @node: hlist_node structure for managing the plug using a hash table. - * @link: To list the plug in the zone write plug error list of the disk. * @ref: Zone write plug reference counter. A zone write plug reference is * always at least 1 when the plug is hashed in the disk plug hash table. * The reference is incremented whenever a new BIO needing plugging is @@ -63,7 +62,6 @@ static const char *const zone_cond_name[] = { */ struct blk_zone_wplug { struct hlist_node node; - struct list_head link; refcount_t ref; spinlock_t lock; unsigned int flags; @@ -80,8 +78,8 @@ struct blk_zone_wplug { * - BLK_ZONE_WPLUG_PLUGGED: Indicates that the zone write plug is plugged, * that is, that write BIOs are being throttled due to a write BIO already * being executed or the zone write plug bio list is not empty. - * - BLK_ZONE_WPLUG_ERROR: Indicates that a write error happened which will be - * recovered with a report zone to update the zone write pointer offset. + * - BLK_ZONE_WPLUG_NEED_WP_UPDATE: Indicates that we lost track of a zone + * write pointer offset and need to update it. * - BLK_ZONE_WPLUG_UNHASHED: Indicates that the zone write plug was removed * from the disk hash table and that the initial reference to the zone * write plug set when the plug was first added to the hash table has been @@ -91,11 +89,9 @@ struct blk_zone_wplug { * freed once all remaining references from BIOs or functions are dropped. */ #define BLK_ZONE_WPLUG_PLUGGED (1U << 0) -#define BLK_ZONE_WPLUG_ERROR (1U << 1) +#define BLK_ZONE_WPLUG_NEED_WP_UPDATE (1U << 1) #define BLK_ZONE_WPLUG_UNHASHED (1U << 2) -#define BLK_ZONE_WPLUG_BUSY (BLK_ZONE_WPLUG_PLUGGED | BLK_ZONE_WPLUG_ERROR) - /** * blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX. * @zone_cond: BLK_ZONE_COND_XXX. @@ -163,6 +159,11 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector, { struct gendisk *disk = bdev->bd_disk; sector_t capacity = get_capacity(disk); + struct disk_report_zones_cb_args args = { + .disk = disk, + .user_cb = cb, + .user_data = data, + }; if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones)) return -EOPNOTSUPP; @@ -170,7 +171,8 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector, if (!nr_zones || sector >= capacity) return 0; - return disk->fops->report_zones(disk, sector, nr_zones, cb, data); + return disk->fops->report_zones(disk, sector, nr_zones, + disk_report_zones_cb, &args); } EXPORT_SYMBOL_GPL(blkdev_report_zones); @@ -451,7 +453,7 @@ static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug) { if (refcount_dec_and_test(&zwplug->ref)) { WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list)); - WARN_ON_ONCE(!list_empty(&zwplug->link)); + WARN_ON_ONCE(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED); WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)); call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu); @@ -465,8 +467,8 @@ static inline bool disk_should_remove_zone_wplug(struct gendisk *disk, if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) return false; - /* If the zone write plug is still busy, it cannot be removed. */ - if (zwplug->flags & BLK_ZONE_WPLUG_BUSY) + /* If the zone write plug is still plugged, it cannot be removed. */ + if (zwplug->flags & BLK_ZONE_WPLUG_PLUGGED) return false; /* @@ -549,7 +551,6 @@ static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk, return NULL; INIT_HLIST_NODE(&zwplug->node); - INIT_LIST_HEAD(&zwplug->link); refcount_set(&zwplug->ref, 2); spin_lock_init(&zwplug->lock); zwplug->flags = 0; @@ -598,115 +599,22 @@ static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug) } /* - * Abort (fail) all plugged BIOs of a zone write plug that are not aligned - * with the assumed write pointer location of the zone when the BIO will - * be unplugged. - */ -static void disk_zone_wplug_abort_unaligned(struct gendisk *disk, - struct blk_zone_wplug *zwplug) -{ - unsigned int wp_offset = zwplug->wp_offset; - struct bio_list bl = BIO_EMPTY_LIST; - struct bio *bio; - - while ((bio = bio_list_pop(&zwplug->bio_list))) { - if (disk_zone_is_full(disk, zwplug->zone_no, wp_offset) || - (bio_op(bio) != REQ_OP_ZONE_APPEND && - bio_offset_from_zone_start(bio) != wp_offset)) { - blk_zone_wplug_bio_io_error(zwplug, bio); - continue; - } - - wp_offset += bio_sectors(bio); - bio_list_add(&bl, bio); - } - - bio_list_merge(&zwplug->bio_list, &bl); -} - -static inline void disk_zone_wplug_set_error(struct gendisk *disk, - struct blk_zone_wplug *zwplug) -{ - unsigned long flags; - - if (zwplug->flags & BLK_ZONE_WPLUG_ERROR) - return; - - /* - * At this point, we already have a reference on the zone write plug. - * However, since we are going to add the plug to the disk zone write - * plugs work list, increase its reference count. This reference will - * be dropped in disk_zone_wplugs_work() once the error state is - * handled, or in disk_zone_wplug_clear_error() if the zone is reset or - * finished. - */ - zwplug->flags |= BLK_ZONE_WPLUG_ERROR; - refcount_inc(&zwplug->ref); - - spin_lock_irqsave(&disk->zone_wplugs_lock, flags); - list_add_tail(&zwplug->link, &disk->zone_wplugs_err_list); - spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); -} - -static inline void disk_zone_wplug_clear_error(struct gendisk *disk, - struct blk_zone_wplug *zwplug) -{ - unsigned long flags; - - if (!(zwplug->flags & BLK_ZONE_WPLUG_ERROR)) - return; - - /* - * We are racing with the error handling work which drops the reference - * on the zone write plug after handling the error state. So remove the - * plug from the error list and drop its reference count only if the - * error handling has not yet started, that is, if the zone write plug - * is still listed. - */ - spin_lock_irqsave(&disk->zone_wplugs_lock, flags); - if (!list_empty(&zwplug->link)) { - list_del_init(&zwplug->link); - zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR; - disk_put_zone_wplug(zwplug); - } - spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); -} - -/* - * Set a zone write plug write pointer offset to either 0 (zone reset case) - * or to the zone size (zone finish case). This aborts all plugged BIOs, which - * is fine to do as doing a zone reset or zone finish while writes are in-flight - * is a mistake from the user which will most likely cause all plugged BIOs to - * fail anyway. + * Set a zone write plug write pointer offset to the specified value. + * This aborts all plugged BIOs, which is fine as this function is called for + * a zone reset operation, a zone finish operation or if the zone needs a wp + * update from a report zone after a write error. */ static void disk_zone_wplug_set_wp_offset(struct gendisk *disk, struct blk_zone_wplug *zwplug, unsigned int wp_offset) { - unsigned long flags; - - spin_lock_irqsave(&zwplug->lock, flags); - - /* - * Make sure that a BIO completion or another zone reset or finish - * operation has not already removed the plug from the hash table. - */ - if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) { - spin_unlock_irqrestore(&zwplug->lock, flags); - return; - } + lockdep_assert_held(&zwplug->lock); /* Update the zone write pointer and abort all plugged BIOs. */ + zwplug->flags &= ~BLK_ZONE_WPLUG_NEED_WP_UPDATE; zwplug->wp_offset = wp_offset; disk_zone_wplug_abort(zwplug); - /* - * Updating the write pointer offset puts back the zone - * in a good state. So clear the error flag and decrement the - * error count if we were in error state. - */ - disk_zone_wplug_clear_error(disk, zwplug); - /* * The zone write plug now has no BIO plugged: remove it from the * hash table so that it cannot be seen. The plug will be freed @@ -714,8 +622,6 @@ static void disk_zone_wplug_set_wp_offset(struct gendisk *disk, */ if (disk_should_remove_zone_wplug(disk, zwplug)) disk_remove_zone_wplug(disk, zwplug); - - spin_unlock_irqrestore(&zwplug->lock, flags); } static unsigned int blk_zone_wp_offset(struct blk_zone *zone) @@ -752,7 +658,7 @@ static void disk_zone_wplug_sync_wp_offset(struct gendisk *disk, return; spin_lock_irqsave(&zwplug->lock, flags); - if (zwplug->flags & BLK_ZONE_WPLUG_ERROR) + if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE) disk_zone_wplug_set_wp_offset(disk, zwplug, blk_zone_wp_offset(zone)); spin_unlock_irqrestore(&zwplug->lock, flags); @@ -776,6 +682,7 @@ static bool blk_zone_wplug_handle_reset_or_finish(struct bio *bio, struct gendisk *disk = bio->bi_bdev->bd_disk; sector_t sector = bio->bi_iter.bi_sector; struct blk_zone_wplug *zwplug; + unsigned long flags; /* Conventional zones cannot be reset nor finished. */ if (!bdev_zone_is_seq(bio->bi_bdev, sector)) { @@ -801,7 +708,9 @@ static bool blk_zone_wplug_handle_reset_or_finish(struct bio *bio, */ zwplug = disk_get_zone_wplug(disk, sector); if (zwplug) { + spin_lock_irqsave(&zwplug->lock, flags); disk_zone_wplug_set_wp_offset(disk, zwplug, wp_offset); + spin_unlock_irqrestore(&zwplug->lock, flags); disk_put_zone_wplug(zwplug); } @@ -812,6 +721,7 @@ static bool blk_zone_wplug_handle_reset_all(struct bio *bio) { struct gendisk *disk = bio->bi_bdev->bd_disk; struct blk_zone_wplug *zwplug; + unsigned long flags; sector_t sector; /* @@ -823,7 +733,9 @@ static bool blk_zone_wplug_handle_reset_all(struct bio *bio) sector += disk->queue->limits.chunk_sectors) { zwplug = disk_get_zone_wplug(disk, sector); if (zwplug) { + spin_lock_irqsave(&zwplug->lock, flags); disk_zone_wplug_set_wp_offset(disk, zwplug, 0); + spin_unlock_irqrestore(&zwplug->lock, flags); disk_put_zone_wplug(zwplug); } } @@ -1005,13 +917,23 @@ static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug, { struct gendisk *disk = bio->bi_bdev->bd_disk; + /* + * If we lost track of the zone write pointer due to a write error, + * the user must either execute a report zones, reset the zone or finish + * the to recover a reliable write pointer position. Fail BIOs if the + * user did not do that as we cannot handle emulated zone append + * otherwise. + */ + if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE) + return false; + /* * Check that the user is not attempting to write to a full zone. * We know such BIO will fail, and that would potentially overflow our * write pointer offset beyond the end of the zone. */ if (disk_zone_wplug_is_full(disk, zwplug)) - goto err; + return false; if (bio_op(bio) == REQ_OP_ZONE_APPEND) { /* @@ -1030,24 +952,18 @@ static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug, bio_set_flag(bio, BIO_EMULATES_ZONE_APPEND); } else { /* - * Check for non-sequential writes early because we avoid a - * whole lot of error handling trouble if we don't send it off - * to the driver. + * Check for non-sequential writes early as we know that BIOs + * with a start sector not unaligned to the zone write pointer + * will fail. */ if (bio_offset_from_zone_start(bio) != zwplug->wp_offset) - goto err; + return false; } /* Advance the zone write pointer offset. */ zwplug->wp_offset += bio_sectors(bio); return true; - -err: - /* We detected an invalid write BIO: schedule error recovery. */ - disk_zone_wplug_set_error(disk, zwplug); - kblockd_schedule_work(&disk->zone_wplugs_work); - return false; } static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs) @@ -1097,20 +1013,20 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs) bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING); /* - * If the zone is already plugged or has a pending error, add the BIO - * to the plug BIO list. Do the same for REQ_NOWAIT BIOs to ensure that - * we will not see a BLK_STS_AGAIN failure if we let the BIO execute. + * If the zone is already plugged, add the BIO to the plug BIO list. + * Do the same for REQ_NOWAIT BIOs to ensure that we will not see a + * BLK_STS_AGAIN failure if we let the BIO execute. * Otherwise, plug and let the BIO execute. */ - if (zwplug->flags & BLK_ZONE_WPLUG_BUSY || (bio->bi_opf & REQ_NOWAIT)) + if ((zwplug->flags & BLK_ZONE_WPLUG_PLUGGED) || + (bio->bi_opf & REQ_NOWAIT)) goto plug; - /* - * If an error is detected when preparing the BIO, add it to the BIO - * list so that error recovery can deal with it. - */ - if (!blk_zone_wplug_prepare_bio(zwplug, bio)) - goto plug; + if (!blk_zone_wplug_prepare_bio(zwplug, bio)) { + spin_unlock_irqrestore(&zwplug->lock, flags); + bio_io_error(bio); + return true; + } zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED; @@ -1210,16 +1126,6 @@ static void disk_zone_wplug_unplug_bio(struct gendisk *disk, spin_lock_irqsave(&zwplug->lock, flags); - /* - * If we had an error, schedule error recovery. The recovery work - * will restart submission of plugged BIOs. - */ - if (zwplug->flags & BLK_ZONE_WPLUG_ERROR) { - spin_unlock_irqrestore(&zwplug->lock, flags); - kblockd_schedule_work(&disk->zone_wplugs_work); - return; - } - /* Schedule submission of the next plugged BIO if we have one. */ if (!bio_list_empty(&zwplug->bio_list)) { disk_zone_wplug_schedule_bio_work(disk, zwplug); @@ -1262,12 +1168,13 @@ void blk_zone_write_plug_bio_endio(struct bio *bio) } /* - * If the BIO failed, mark the plug as having an error to trigger - * recovery. + * If the BIO failed, abort all plugged BIOs and mark the plug as + * needing a write pointer update. */ if (bio->bi_status != BLK_STS_OK) { spin_lock_irqsave(&zwplug->lock, flags); - disk_zone_wplug_set_error(disk, zwplug); + disk_zone_wplug_abort(zwplug); + zwplug->flags |= BLK_ZONE_WPLUG_NEED_WP_UPDATE; spin_unlock_irqrestore(&zwplug->lock, flags); } @@ -1323,6 +1230,7 @@ static void blk_zone_wplug_bio_work(struct work_struct *work) */ spin_lock_irqsave(&zwplug->lock, flags); +again: bio = bio_list_pop(&zwplug->bio_list); if (!bio) { zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; @@ -1331,10 +1239,8 @@ static void blk_zone_wplug_bio_work(struct work_struct *work) } if (!blk_zone_wplug_prepare_bio(zwplug, bio)) { - /* Error recovery will decide what to do with the BIO. */ - bio_list_add_head(&zwplug->bio_list, bio); - spin_unlock_irqrestore(&zwplug->lock, flags); - goto put_zwplug; + blk_zone_wplug_bio_io_error(zwplug, bio); + goto again; } spin_unlock_irqrestore(&zwplug->lock, flags); @@ -1356,97 +1262,6 @@ static void blk_zone_wplug_bio_work(struct work_struct *work) disk_put_zone_wplug(zwplug); } -static int blk_zone_wplug_report_zone_cb(struct blk_zone *zone, - unsigned int idx, void *data) -{ - struct blk_zone *zonep = data; - - *zonep = *zone; - return 0; -} - -static void disk_zone_wplug_handle_error(struct gendisk *disk, - struct blk_zone_wplug *zwplug) -{ - sector_t zone_start_sector = - bdev_zone_sectors(disk->part0) * zwplug->zone_no; - unsigned int noio_flag; - struct blk_zone zone; - unsigned long flags; - int ret; - - /* Get the current zone information from the device. */ - noio_flag = memalloc_noio_save(); - ret = disk->fops->report_zones(disk, zone_start_sector, 1, - blk_zone_wplug_report_zone_cb, &zone); - memalloc_noio_restore(noio_flag); - - spin_lock_irqsave(&zwplug->lock, flags); - - /* - * A zone reset or finish may have cleared the error already. In such - * case, do nothing as the report zones may have seen the "old" write - * pointer value before the reset/finish operation completed. - */ - if (!(zwplug->flags & BLK_ZONE_WPLUG_ERROR)) - goto unlock; - - zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR; - - if (ret != 1) { - /* - * We failed to get the zone information, meaning that something - * is likely really wrong with the device. Abort all remaining - * plugged BIOs as otherwise we could endup waiting forever on - * plugged BIOs to complete if there is a queue freeze on-going. - */ - disk_zone_wplug_abort(zwplug); - goto unplug; - } - - /* Update the zone write pointer offset. */ - zwplug->wp_offset = blk_zone_wp_offset(&zone); - disk_zone_wplug_abort_unaligned(disk, zwplug); - - /* Restart BIO submission if we still have any BIO left. */ - if (!bio_list_empty(&zwplug->bio_list)) { - disk_zone_wplug_schedule_bio_work(disk, zwplug); - goto unlock; - } - -unplug: - zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; - if (disk_should_remove_zone_wplug(disk, zwplug)) - disk_remove_zone_wplug(disk, zwplug); - -unlock: - spin_unlock_irqrestore(&zwplug->lock, flags); -} - -static void disk_zone_wplugs_work(struct work_struct *work) -{ - struct gendisk *disk = - container_of(work, struct gendisk, zone_wplugs_work); - struct blk_zone_wplug *zwplug; - unsigned long flags; - - spin_lock_irqsave(&disk->zone_wplugs_lock, flags); - - while (!list_empty(&disk->zone_wplugs_err_list)) { - zwplug = list_first_entry(&disk->zone_wplugs_err_list, - struct blk_zone_wplug, link); - list_del_init(&zwplug->link); - spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); - - disk_zone_wplug_handle_error(disk, zwplug); - disk_put_zone_wplug(zwplug); - - spin_lock_irqsave(&disk->zone_wplugs_lock, flags); - } - - spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); -} - static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk) { return 1U << disk->zone_wplugs_hash_bits; @@ -1455,8 +1270,6 @@ static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk) void disk_init_zone_resources(struct gendisk *disk) { spin_lock_init(&disk->zone_wplugs_lock); - INIT_LIST_HEAD(&disk->zone_wplugs_err_list); - INIT_WORK(&disk->zone_wplugs_work, disk_zone_wplugs_work); } /* @@ -1555,8 +1368,6 @@ void disk_free_zone_resources(struct gendisk *disk) if (!disk->zone_wplugs_pool) return; - cancel_work_sync(&disk->zone_wplugs_work); - if (disk->zone_wplugs_wq) { destroy_workqueue(disk->zone_wplugs_wq); disk->zone_wplugs_wq = NULL; @@ -1753,6 +1564,8 @@ static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx, if (!disk->zone_wplugs_hash) return 0; + disk_zone_wplug_sync_wp_offset(disk, zone); + wp_offset = blk_zone_wp_offset(zone); if (!wp_offset || wp_offset >= zone->capacity) return 0; @@ -1883,6 +1696,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk) memalloc_noio_restore(noio_flag); return ret; } + ret = disk->fops->report_zones(disk, 0, UINT_MAX, blk_revalidate_zone_cb, &args); if (!ret) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4dd698dad2d65..378d3a1a22fca 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -200,8 +200,6 @@ struct gendisk { spinlock_t zone_wplugs_lock; struct mempool_s *zone_wplugs_pool; struct hlist_head *zone_wplugs_hash; - struct list_head zone_wplugs_err_list; - struct work_struct zone_wplugs_work; struct workqueue_struct *zone_wplugs_wq; #endif /* CONFIG_BLK_DEV_ZONED */ From 6f604b59c2841168131523e25f5e36afa17306f4 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Sun, 8 Dec 2024 19:53:50 +0800 Subject: [PATCH 138/330] MAINTAINERS: update Coly Li's email address This patch updates Coly Li's email addres to colyli@kernel.org. Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20241208115350.85103-1-colyli@kernel.org Signed-off-by: Jens Axboe --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 1e930c7a58b13..c15e9e57906a1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3891,7 +3891,7 @@ W: http://www.baycom.org/~tom/ham/ham.html F: drivers/net/hamradio/baycom* BCACHE (BLOCK LAYER CACHE) -M: Coly Li +M: Coly Li M: Kent Overstreet L: linux-bcache@vger.kernel.org S: Maintained From 86e6ca55b83c575ab0f2e105cf08f98e58d3d7af Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Dec 2024 07:59:51 -1000 Subject: [PATCH 139/330] blk-cgroup: Fix UAF in blkcg_unpin_online() blkcg_unpin_online() walks up the blkcg hierarchy putting the online pin. To walk up, it uses blkcg_parent(blkcg) but it was calling that after blkcg_destroy_blkgs(blkcg) which could free the blkcg, leading to the following UAF: ================================================================== BUG: KASAN: slab-use-after-free in blkcg_unpin_online+0x15a/0x270 Read of size 8 at addr ffff8881057678c0 by task kworker/9:1/117 CPU: 9 UID: 0 PID: 117 Comm: kworker/9:1 Not tainted 6.13.0-rc1-work-00182-gb8f52214c61a-dirty #48 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS unknown 02/02/2022 Workqueue: cgwb_release cgwb_release_workfn Call Trace: dump_stack_lvl+0x27/0x80 print_report+0x151/0x710 kasan_report+0xc0/0x100 blkcg_unpin_online+0x15a/0x270 cgwb_release_workfn+0x194/0x480 process_scheduled_works+0x71b/0xe20 worker_thread+0x82a/0xbd0 kthread+0x242/0x2c0 ret_from_fork+0x33/0x70 ret_from_fork_asm+0x1a/0x30 ... Freed by task 1944: kasan_save_track+0x2b/0x70 kasan_save_free_info+0x3c/0x50 __kasan_slab_free+0x33/0x50 kfree+0x10c/0x330 css_free_rwork_fn+0xe6/0xb30 process_scheduled_works+0x71b/0xe20 worker_thread+0x82a/0xbd0 kthread+0x242/0x2c0 ret_from_fork+0x33/0x70 ret_from_fork_asm+0x1a/0x30 Note that the UAF is not easy to trigger as the free path is indirected behind a couple RCU grace periods and a work item execution. I could only trigger it with artifical msleep() injected in blkcg_unpin_online(). Fix it by reading the parent pointer before destroying the blkcg's blkg's. Signed-off-by: Tejun Heo Reported-by: Abagail ren Suggested-by: Linus Torvalds Fixes: 4308a434e5e0 ("blkcg: don't offline parent blkcg first") Cc: stable@vger.kernel.org # v5.7+ Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index e68c725cf8d97..45a395862fbc8 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1324,10 +1324,14 @@ void blkcg_unpin_online(struct cgroup_subsys_state *blkcg_css) struct blkcg *blkcg = css_to_blkcg(blkcg_css); do { + struct blkcg *parent; + if (!refcount_dec_and_test(&blkcg->online_pin)) break; + + parent = blkcg_parent(blkcg); blkcg_destroy_blkgs(blkcg); - blkcg = blkcg_parent(blkcg); + blkcg = parent; } while (blkcg); } From 75e072a390da9a22e7ae4a4e8434dfca5da499fb Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Mon, 2 Dec 2024 12:29:23 +0100 Subject: [PATCH 140/330] bpf, sockmap: Fix update element with same Consider a sockmap entry being updated with the same socket: osk = stab->sks[idx]; sock_map_add_link(psock, link, map, &stab->sks[idx]); stab->sks[idx] = sk; if (osk) sock_map_unref(osk, &stab->sks[idx]); Due to sock_map_unref(), which invokes sock_map_del_link(), all the psock's links for stab->sks[idx] are torn: list_for_each_entry_safe(link, tmp, &psock->link, list) { if (link->link_raw == link_raw) { ... list_del(&link->list); sk_psock_free_link(link); } } And that includes the new link sock_map_add_link() added just before the unref. This results in a sockmap holding a socket, but without the respective link. This in turn means that close(sock) won't trigger the cleanup, i.e. a closed socket will not be automatically removed from the sockmap. Stop tearing the links when a matching link_raw is found. Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Michal Luczaj Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Link: https://lore.kernel.org/bpf/20241202-sockmap-replace-v1-1-1e88579e7bd5@rbox.co --- net/core/sock_map.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 78347d7d25ef3..20b348b1964a1 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -159,6 +159,7 @@ static void sock_map_del_link(struct sock *sk, verdict_stop = true; list_del(&link->list); sk_psock_free_link(link); + break; } } spin_unlock_bh(&psock->link_lock); From ed1fc5d76b81a4d681211333c026202cad4d5649 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Mon, 2 Dec 2024 12:29:25 +0100 Subject: [PATCH 141/330] bpf, sockmap: Fix race between element replace and close() Element replace (with a socket different from the one stored) may race with socket's close() link popping & unlinking. __sock_map_delete() unconditionally unrefs the (wrong) element: // set map[0] = s0 map_update_elem(map, 0, s0) // drop fd of s0 close(s0) sock_map_close() lock_sock(sk) (s0!) sock_map_remove_links(sk) link = sk_psock_link_pop() sock_map_unlink(sk, link) sock_map_delete_from_link // replace map[0] with s1 map_update_elem(map, 0, s1) sock_map_update_elem (s1!) lock_sock(sk) sock_map_update_common psock = sk_psock(sk) spin_lock(&stab->lock) osk = stab->sks[idx] sock_map_add_link(..., &stab->sks[idx]) sock_map_unref(osk, &stab->sks[idx]) psock = sk_psock(osk) sk_psock_put(sk, psock) if (refcount_dec_and_test(&psock)) sk_psock_drop(sk, psock) spin_unlock(&stab->lock) unlock_sock(sk) __sock_map_delete spin_lock(&stab->lock) sk = *psk // s1 replaced s0; sk == s1 if (!sk_test || sk_test == sk) // sk_test (s0) != sk (s1); no branch sk = xchg(psk, NULL) if (sk) sock_map_unref(sk, psk) // unref s1; sks[idx] will dangle psock = sk_psock(sk) sk_psock_put(sk, psock) if (refcount_dec_and_test()) sk_psock_drop(sk, psock) spin_unlock(&stab->lock) release_sock(sk) Then close(map) enqueues bpf_map_free_deferred, which finally calls sock_map_free(). This results in some refcount_t warnings along with a KASAN splat [1]. Fix __sock_map_delete(), do not allow sock_map_unref() on elements that may have been replaced. [1]: BUG: KASAN: slab-use-after-free in sock_map_free+0x10e/0x330 Write of size 4 at addr ffff88811f5b9100 by task kworker/u64:12/1063 CPU: 14 UID: 0 PID: 1063 Comm: kworker/u64:12 Not tainted 6.12.0+ #125 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Arch Linux 1.16.3-1-1 04/01/2014 Workqueue: events_unbound bpf_map_free_deferred Call Trace: dump_stack_lvl+0x68/0x90 print_report+0x174/0x4f6 kasan_report+0xb9/0x190 kasan_check_range+0x10f/0x1e0 sock_map_free+0x10e/0x330 bpf_map_free_deferred+0x173/0x320 process_one_work+0x846/0x1420 worker_thread+0x5b3/0xf80 kthread+0x29e/0x360 ret_from_fork+0x2d/0x70 ret_from_fork_asm+0x1a/0x30 Allocated by task 1202: kasan_save_stack+0x1e/0x40 kasan_save_track+0x10/0x30 __kasan_slab_alloc+0x85/0x90 kmem_cache_alloc_noprof+0x131/0x450 sk_prot_alloc+0x5b/0x220 sk_alloc+0x2c/0x870 unix_create1+0x88/0x8a0 unix_create+0xc5/0x180 __sock_create+0x241/0x650 __sys_socketpair+0x1ce/0x420 __x64_sys_socketpair+0x92/0x100 do_syscall_64+0x93/0x180 entry_SYSCALL_64_after_hwframe+0x76/0x7e Freed by task 46: kasan_save_stack+0x1e/0x40 kasan_save_track+0x10/0x30 kasan_save_free_info+0x37/0x60 __kasan_slab_free+0x4b/0x70 kmem_cache_free+0x1a1/0x590 __sk_destruct+0x388/0x5a0 sk_psock_destroy+0x73e/0xa50 process_one_work+0x846/0x1420 worker_thread+0x5b3/0xf80 kthread+0x29e/0x360 ret_from_fork+0x2d/0x70 ret_from_fork_asm+0x1a/0x30 The buggy address belongs to the object at ffff88811f5b9080 which belongs to the cache UNIX-STREAM of size 1984 The buggy address is located 128 bytes inside of freed 1984-byte region [ffff88811f5b9080, ffff88811f5b9840) The buggy address belongs to the physical page: page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x11f5b8 head: order:3 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0 memcg:ffff888127d49401 flags: 0x17ffffc0000040(head|node=0|zone=2|lastcpupid=0x1fffff) page_type: f5(slab) raw: 0017ffffc0000040 ffff8881042e4500 dead000000000122 0000000000000000 raw: 0000000000000000 00000000800f000f 00000001f5000000 ffff888127d49401 head: 0017ffffc0000040 ffff8881042e4500 dead000000000122 0000000000000000 head: 0000000000000000 00000000800f000f 00000001f5000000 ffff888127d49401 head: 0017ffffc0000003 ffffea00047d6e01 ffffffffffffffff 0000000000000000 head: 0000000000000008 0000000000000000 00000000ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff88811f5b9000: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff88811f5b9080: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff88811f5b9180: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88811f5b9200: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb Disabling lock debugging due to kernel taint refcount_t: addition on 0; use-after-free. WARNING: CPU: 14 PID: 1063 at lib/refcount.c:25 refcount_warn_saturate+0xce/0x150 CPU: 14 UID: 0 PID: 1063 Comm: kworker/u64:12 Tainted: G B 6.12.0+ #125 Tainted: [B]=BAD_PAGE Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Arch Linux 1.16.3-1-1 04/01/2014 Workqueue: events_unbound bpf_map_free_deferred RIP: 0010:refcount_warn_saturate+0xce/0x150 Code: 34 73 eb 03 01 e8 82 53 ad fe 0f 0b eb b1 80 3d 27 73 eb 03 00 75 a8 48 c7 c7 80 bd 95 84 c6 05 17 73 eb 03 01 e8 62 53 ad fe <0f> 0b eb 91 80 3d 06 73 eb 03 00 75 88 48 c7 c7 e0 bd 95 84 c6 05 RSP: 0018:ffff88815c49fc70 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffff88811f5b9100 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000004 RDI: 0000000000000001 RBP: 0000000000000002 R08: 0000000000000001 R09: ffffed10bcde6349 R10: ffff8885e6f31a4b R11: 0000000000000000 R12: ffff88813be0b000 R13: ffff88811f5b9100 R14: ffff88811f5b9080 R15: ffff88813be0b024 FS: 0000000000000000(0000) GS:ffff8885e6f00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055dda99b0250 CR3: 000000015dbac000 CR4: 0000000000752ef0 PKRU: 55555554 Call Trace: ? __warn.cold+0x5f/0x1ff ? refcount_warn_saturate+0xce/0x150 ? report_bug+0x1ec/0x390 ? handle_bug+0x58/0x90 ? exc_invalid_op+0x13/0x40 ? asm_exc_invalid_op+0x16/0x20 ? refcount_warn_saturate+0xce/0x150 sock_map_free+0x2e5/0x330 bpf_map_free_deferred+0x173/0x320 process_one_work+0x846/0x1420 worker_thread+0x5b3/0xf80 kthread+0x29e/0x360 ret_from_fork+0x2d/0x70 ret_from_fork_asm+0x1a/0x30 irq event stamp: 10741 hardirqs last enabled at (10741): [] asm_sysvec_apic_timer_interrupt+0x16/0x20 hardirqs last disabled at (10740): [] handle_softirqs+0x60d/0x770 softirqs last enabled at (10506): [] __irq_exit_rcu+0x109/0x210 softirqs last disabled at (10301): [] __irq_exit_rcu+0x109/0x210 refcount_t: underflow; use-after-free. WARNING: CPU: 14 PID: 1063 at lib/refcount.c:28 refcount_warn_saturate+0xee/0x150 CPU: 14 UID: 0 PID: 1063 Comm: kworker/u64:12 Tainted: G B W 6.12.0+ #125 Tainted: [B]=BAD_PAGE, [W]=WARN Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Arch Linux 1.16.3-1-1 04/01/2014 Workqueue: events_unbound bpf_map_free_deferred RIP: 0010:refcount_warn_saturate+0xee/0x150 Code: 17 73 eb 03 01 e8 62 53 ad fe 0f 0b eb 91 80 3d 06 73 eb 03 00 75 88 48 c7 c7 e0 bd 95 84 c6 05 f6 72 eb 03 01 e8 42 53 ad fe <0f> 0b e9 6e ff ff ff 80 3d e6 72 eb 03 00 0f 85 61 ff ff ff 48 c7 RSP: 0018:ffff88815c49fc70 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffff88811f5b9100 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000004 RDI: 0000000000000001 RBP: 0000000000000003 R08: 0000000000000001 R09: ffffed10bcde6349 R10: ffff8885e6f31a4b R11: 0000000000000000 R12: ffff88813be0b000 R13: ffff88811f5b9100 R14: ffff88811f5b9080 R15: ffff88813be0b024 FS: 0000000000000000(0000) GS:ffff8885e6f00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055dda99b0250 CR3: 000000015dbac000 CR4: 0000000000752ef0 PKRU: 55555554 Call Trace: ? __warn.cold+0x5f/0x1ff ? refcount_warn_saturate+0xee/0x150 ? report_bug+0x1ec/0x390 ? handle_bug+0x58/0x90 ? exc_invalid_op+0x13/0x40 ? asm_exc_invalid_op+0x16/0x20 ? refcount_warn_saturate+0xee/0x150 sock_map_free+0x2d3/0x330 bpf_map_free_deferred+0x173/0x320 process_one_work+0x846/0x1420 worker_thread+0x5b3/0xf80 kthread+0x29e/0x360 ret_from_fork+0x2d/0x70 ret_from_fork_asm+0x1a/0x30 irq event stamp: 10741 hardirqs last enabled at (10741): [] asm_sysvec_apic_timer_interrupt+0x16/0x20 hardirqs last disabled at (10740): [] handle_softirqs+0x60d/0x770 softirqs last enabled at (10506): [] __irq_exit_rcu+0x109/0x210 softirqs last disabled at (10301): [] __irq_exit_rcu+0x109/0x210 Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Michal Luczaj Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Link: https://lore.kernel.org/bpf/20241202-sockmap-replace-v1-3-1e88579e7bd5@rbox.co --- net/core/sock_map.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 20b348b1964a1..f1b9b3958792c 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -412,12 +412,11 @@ static void *sock_map_lookup_sys(struct bpf_map *map, void *key) static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test, struct sock **psk) { - struct sock *sk; + struct sock *sk = NULL; int err = 0; spin_lock_bh(&stab->lock); - sk = *psk; - if (!sk_test || sk_test == sk) + if (!sk_test || sk_test == *psk) sk = xchg(psk, NULL); if (likely(sk)) From 11d5245f608f8ac01c97b93f31497cef7b96e457 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Mon, 2 Dec 2024 12:29:24 +0100 Subject: [PATCH 142/330] selftests/bpf: Extend test for sockmap update with same Verify that the sockmap link was not severed, and socket's entry is indeed removed from the map when the corresponding descriptor gets closed. Signed-off-by: Michal Luczaj Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Link: https://lore.kernel.org/bpf/20241202-sockmap-replace-v1-2-1e88579e7bd5@rbox.co --- tools/testing/selftests/bpf/prog_tests/sockmap_basic.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index fdff0652d7ef5..248754296d972 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -934,8 +934,10 @@ static void test_sockmap_same_sock(void) err = socketpair(AF_UNIX, SOCK_STREAM, 0, stream); ASSERT_OK(err, "socketpair(af_unix, sock_stream)"); - if (err) + if (err) { + close(tcp); goto out; + } for (i = 0; i < 2; i++) { err = bpf_map_update_elem(map, &zero, &stream[0], BPF_ANY); @@ -954,14 +956,14 @@ static void test_sockmap_same_sock(void) ASSERT_OK(err, "bpf_map_update_elem(tcp)"); } + close(tcp); err = bpf_map_delete_elem(map, &zero); - ASSERT_OK(err, "bpf_map_delete_elem(entry)"); + ASSERT_ERR(err, "bpf_map_delete_elem(entry)"); close(stream[0]); close(stream[1]); out: close(dgram); - close(tcp); close(udp); test_sockmap_pass_prog__destroy(skel); } From c0cd2941bceca784864dd21199cd8e6e7ce9e906 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Benjamin=20Sz=C5=91ke?= Date: Mon, 20 May 2024 16:26:47 +0200 Subject: [PATCH 143/330] arc: rename aux.h to arc_aux.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The goal is to clean-up Linux repository from AUX file names, because the use of such file names is prohibited on other operating systems such as Windows, so the Linux repository cannot be cloned and edited on them. Reviewed-by: Shahab Vahedi Signed-off-by: Benjamin Szőke Signed-off-by: Vineet Gupta --- arch/arc/include/asm/arcregs.h | 2 +- arch/arc/include/asm/mmu-arcv2.h | 2 +- include/soc/arc/{aux.h => arc_aux.h} | 0 include/soc/arc/mcip.h | 2 +- include/soc/arc/timers.h | 2 +- 5 files changed, 4 insertions(+), 4 deletions(-) rename include/soc/arc/{aux.h => arc_aux.h} (100%) diff --git a/arch/arc/include/asm/arcregs.h b/arch/arc/include/asm/arcregs.h index 4b13f60fe7cae..005d9e4d187a0 100644 --- a/arch/arc/include/asm/arcregs.h +++ b/arch/arc/include/asm/arcregs.h @@ -146,7 +146,7 @@ #ifndef __ASSEMBLY__ -#include +#include /* Helpers */ #define TO_KB(bytes) ((bytes) >> 10) diff --git a/arch/arc/include/asm/mmu-arcv2.h b/arch/arc/include/asm/mmu-arcv2.h index d85dc07219071..41412642f2796 100644 --- a/arch/arc/include/asm/mmu-arcv2.h +++ b/arch/arc/include/asm/mmu-arcv2.h @@ -9,7 +9,7 @@ #ifndef _ASM_ARC_MMU_ARCV2_H #define _ASM_ARC_MMU_ARCV2_H -#include +#include /* * TLB Management regs diff --git a/include/soc/arc/aux.h b/include/soc/arc/arc_aux.h similarity index 100% rename from include/soc/arc/aux.h rename to include/soc/arc/arc_aux.h diff --git a/include/soc/arc/mcip.h b/include/soc/arc/mcip.h index d1a93c73f0062..a78dacd149f12 100644 --- a/include/soc/arc/mcip.h +++ b/include/soc/arc/mcip.h @@ -8,7 +8,7 @@ #ifndef __SOC_ARC_MCIP_H #define __SOC_ARC_MCIP_H -#include +#include #define ARC_REG_MCIP_BCR 0x0d0 #define ARC_REG_MCIP_IDU_BCR 0x0D5 diff --git a/include/soc/arc/timers.h b/include/soc/arc/timers.h index ae99d3e855f1d..51a74166296c6 100644 --- a/include/soc/arc/timers.h +++ b/include/soc/arc/timers.h @@ -6,7 +6,7 @@ #ifndef __SOC_ARC_TIMERS_H #define __SOC_ARC_TIMERS_H -#include +#include /* Timer related Aux registers */ #define ARC_REG_TIMER0_LIMIT 0x23 /* timer 0 limit */ From 8871331b1769978ecece205a430338a2581e5050 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Wed, 9 Oct 2024 10:33:22 -0700 Subject: [PATCH 144/330] ARC: build: disallow invalid PAE40 + 4K page config The config option being built was | CONFIG_ARC_MMU_V4=y | CONFIG_ARC_PAGE_SIZE_4K=y | CONFIG_HIGHMEM=y | CONFIG_ARC_HAS_PAE40=y This was hitting a BUILD_BUG_ON() since a 4K page can't hoist 1k, 8-byte PTE entries (8 byte due to PAE40). BUILD_BUG_ON() is a good last ditch resort, but such a config needs to be disallowed explicitly in Kconfig. Side-note: the actual fix is single liner dependency, but while at it cleaned out a few things: - 4K dependency on MMU v3 or v4 is always true, since 288ff7de62af09 ("ARC: retire MMUv1 and MMUv2 support") - PAE40 dependency in on MMU ver not really ISA, although that follows eventually. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202409160223.xydgucbY-lkp@intel.com/ Signed-off-by: Vineet Gupta --- arch/arc/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index 5b24881420414..69c6e71fa1e6b 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -297,7 +297,6 @@ config ARC_PAGE_SIZE_16K config ARC_PAGE_SIZE_4K bool "4KB" select HAVE_PAGE_SIZE_4KB - depends on ARC_MMU_V3 || ARC_MMU_V4 endchoice @@ -474,7 +473,8 @@ config HIGHMEM config ARC_HAS_PAE40 bool "Support for the 40-bit Physical Address Extension" - depends on ISA_ARCV2 + depends on MMU_V4 + depends on !ARC_PAGE_SIZE_4K select HIGHMEM select PHYS_ADDR_T_64BIT help From dd2b2302efffab751d1c2d9a436365db0ee55a42 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Mon, 28 Oct 2024 10:59:07 +0100 Subject: [PATCH 145/330] ARC: fix reference of dependency for PAE40 config Commit d71e629bed5b ("ARC: build: disallow invalid PAE40 + 4K page config") reworks the build dependencies for ARC_HAS_PAE40, and accidentally refers to the non-existing config option MMU_V4 rather than the intended option ARC_MMU_V4. Note the missing prefix in the name here. Refer to the intended config option in the dependency of the ARC_HAS_PAE40 config. Fixes: d71e629bed5b ("ARC: build: disallow invalid PAE40 + 4K page config") Signed-off-by: Lukas Bulwahn Signed-off-by: Vineet Gupta --- arch/arc/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index 69c6e71fa1e6b..ea5a1dcb133b4 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -473,7 +473,7 @@ config HIGHMEM config ARC_HAS_PAE40 bool "Support for the 40-bit Physical Address Extension" - depends on MMU_V4 + depends on ARC_MMU_V4 depends on !ARC_PAGE_SIZE_4K select HIGHMEM select PHYS_ADDR_T_64BIT From 1e8af9f04346ecc0bccf0c53b728fc8eb3490a28 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 9 Oct 2024 10:55:13 -0700 Subject: [PATCH 146/330] ARC: build: Use __force to suppress per-CPU cmpxchg warnings Currently, the cast of the first argument to cmpxchg_emu_u8() drops the __percpu address-space designator, which results in sparse complaints when applying cmpxchg() to per-CPU variables in ARC. Therefore, use __force to suppress these complaints, given that this does not pertain to cmpxchg() semantics, which are plently well-defined on variables in general, whether per-CPU or otherwise. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202409251336.ToC0TvWB-lkp@intel.com/ Signed-off-by: Paul E. McKenney Cc: Signed-off-by: Vineet Gupta --- arch/arc/include/asm/cmpxchg.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arc/include/asm/cmpxchg.h b/arch/arc/include/asm/cmpxchg.h index 58045c8983404..76f43db0890fc 100644 --- a/arch/arc/include/asm/cmpxchg.h +++ b/arch/arc/include/asm/cmpxchg.h @@ -48,7 +48,7 @@ \ switch(sizeof((_p_))) { \ case 1: \ - _prev_ = (__typeof__(*(ptr)))cmpxchg_emu_u8((volatile u8 *)_p_, (uintptr_t)_o_, (uintptr_t)_n_); \ + _prev_ = (__typeof__(*(ptr)))cmpxchg_emu_u8((volatile u8 *__force)_p_, (uintptr_t)_o_, (uintptr_t)_n_); \ break; \ case 4: \ _prev_ = __cmpxchg(_p_, _o_, _n_); \ From 4d93ffe661ff59d2d66231b4054fe9c9658bd16c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 22 Oct 2024 11:14:27 +0200 Subject: [PATCH 147/330] ARC: dts: Replace deprecated snps,nr-gpios property for snps,dw-apb-gpio-port devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit snps,dw-apb-gpio-port is deprecated since commit ef42a8da3cf3 ("dt-bindings: gpio: dwapb: Add ngpios property support"). The respective driver supports this since commit 7569486d79ae ("gpio: dwapb: Add ngpios DT-property support") which is included in Linux v5.10-rc1. This change was created using git grep -l snps,nr-gpios arch/arc/boot/dts | xargs perl -p -i -e 's/\bsnps,nr-gpios\b/ngpios/ . Signed-off-by: Uwe Kleine-König Signed-off-by: Vineet Gupta --- arch/arc/boot/dts/axc001.dtsi | 2 +- arch/arc/boot/dts/axc003.dtsi | 2 +- arch/arc/boot/dts/axc003_idu.dtsi | 2 +- arch/arc/boot/dts/axs10x_mb.dtsi | 12 ++++++------ arch/arc/boot/dts/hsdk.dts | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/arc/boot/dts/axc001.dtsi b/arch/arc/boot/dts/axc001.dtsi index 2a151607b0805..88bcc7ab6f5af 100644 --- a/arch/arc/boot/dts/axc001.dtsi +++ b/arch/arc/boot/dts/axc001.dtsi @@ -54,7 +54,7 @@ compatible = "snps,dw-apb-gpio-port"; gpio-controller; #gpio-cells = <2>; - snps,nr-gpios = <30>; + ngpios = <30>; reg = <0>; interrupt-controller; #interrupt-cells = <2>; diff --git a/arch/arc/boot/dts/axc003.dtsi b/arch/arc/boot/dts/axc003.dtsi index c0a812674ce9e..9a2dc39a5cff4 100644 --- a/arch/arc/boot/dts/axc003.dtsi +++ b/arch/arc/boot/dts/axc003.dtsi @@ -62,7 +62,7 @@ compatible = "snps,dw-apb-gpio-port"; gpio-controller; #gpio-cells = <2>; - snps,nr-gpios = <30>; + ngpios = <30>; reg = <0>; interrupt-controller; #interrupt-cells = <2>; diff --git a/arch/arc/boot/dts/axc003_idu.dtsi b/arch/arc/boot/dts/axc003_idu.dtsi index 67556f4b70574..f31382cb8be4b 100644 --- a/arch/arc/boot/dts/axc003_idu.dtsi +++ b/arch/arc/boot/dts/axc003_idu.dtsi @@ -69,7 +69,7 @@ compatible = "snps,dw-apb-gpio-port"; gpio-controller; #gpio-cells = <2>; - snps,nr-gpios = <30>; + ngpios = <30>; reg = <0>; interrupt-controller; #interrupt-cells = <2>; diff --git a/arch/arc/boot/dts/axs10x_mb.dtsi b/arch/arc/boot/dts/axs10x_mb.dtsi index b644353853049..3add2fe257f8b 100644 --- a/arch/arc/boot/dts/axs10x_mb.dtsi +++ b/arch/arc/boot/dts/axs10x_mb.dtsi @@ -250,7 +250,7 @@ compatible = "snps,dw-apb-gpio-port"; gpio-controller; #gpio-cells = <2>; - snps,nr-gpios = <32>; + ngpios = <32>; reg = <0>; }; @@ -258,7 +258,7 @@ compatible = "snps,dw-apb-gpio-port"; gpio-controller; #gpio-cells = <2>; - snps,nr-gpios = <8>; + ngpios = <8>; reg = <1>; }; @@ -266,7 +266,7 @@ compatible = "snps,dw-apb-gpio-port"; gpio-controller; #gpio-cells = <2>; - snps,nr-gpios = <8>; + ngpios = <8>; reg = <2>; }; }; @@ -281,7 +281,7 @@ compatible = "snps,dw-apb-gpio-port"; gpio-controller; #gpio-cells = <2>; - snps,nr-gpios = <30>; + ngpios = <30>; reg = <0>; }; @@ -289,7 +289,7 @@ compatible = "snps,dw-apb-gpio-port"; gpio-controller; #gpio-cells = <2>; - snps,nr-gpios = <10>; + ngpios = <10>; reg = <1>; }; @@ -297,7 +297,7 @@ compatible = "snps,dw-apb-gpio-port"; gpio-controller; #gpio-cells = <2>; - snps,nr-gpios = <8>; + ngpios = <8>; reg = <2>; }; }; diff --git a/arch/arc/boot/dts/hsdk.dts b/arch/arc/boot/dts/hsdk.dts index 41b980df862b1..98bb850722a4b 100644 --- a/arch/arc/boot/dts/hsdk.dts +++ b/arch/arc/boot/dts/hsdk.dts @@ -308,7 +308,7 @@ compatible = "snps,dw-apb-gpio-port"; gpio-controller; #gpio-cells = <2>; - snps,nr-gpios = <24>; + ngpios = <24>; reg = <0>; }; }; From 7dd9eb6ba88964b091b89855ce7d2a12405013af Mon Sep 17 00:00:00 2001 From: Hardevsinh Palaniya Date: Wed, 13 Nov 2024 19:11:18 +0530 Subject: [PATCH 148/330] ARC: bpf: Correct conditional check in 'check_jmp_32' The original code checks 'if (ARC_CC_AL)', which is always true since ARC_CC_AL is a constant. This makes the check redundant and likely obscures the intention of verifying whether the jump is conditional. Updates the code to check cond == ARC_CC_AL instead, reflecting the intent to differentiate conditional from unconditional jumps. Suggested-by: Vadim Fedorenko Reviewed-by: Vadim Fedorenko Acked-by: Shahab Vahedi Signed-off-by: Hardevsinh Palaniya Signed-off-by: Vineet Gupta --- arch/arc/net/bpf_jit_arcv2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arc/net/bpf_jit_arcv2.c b/arch/arc/net/bpf_jit_arcv2.c index 4458e409ca0a8..6d989b6d88c69 100644 --- a/arch/arc/net/bpf_jit_arcv2.c +++ b/arch/arc/net/bpf_jit_arcv2.c @@ -2916,7 +2916,7 @@ bool check_jmp_32(u32 curr_off, u32 targ_off, u8 cond) addendum = (cond == ARC_CC_AL) ? 0 : INSN_len_normal; disp = get_displacement(curr_off + addendum, targ_off); - if (ARC_CC_AL) + if (cond == ARC_CC_AL) return is_valid_far_disp(disp); else return is_valid_near_disp(disp); From ef1b808e3b7c98612feceedf985c2fbbeb28f956 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Tue, 10 Dec 2024 17:32:13 +0100 Subject: [PATCH 149/330] bpf: Fix UAF via mismatching bpf_prog/attachment RCU flavors Uprobes always use bpf_prog_run_array_uprobe() under tasks-trace-RCU protection. But it is possible to attach a non-sleepable BPF program to a uprobe, and non-sleepable BPF programs are freed via normal RCU (see __bpf_prog_put_noref()). This leads to UAF of the bpf_prog because a normal RCU grace period does not imply a tasks-trace-RCU grace period. Fix it by explicitly waiting for a tasks-trace-RCU grace period after removing the attachment of a bpf_prog to a perf_event. Fixes: 8c7dcb84e3b7 ("bpf: implement sleepable uprobes by chaining gps") Suggested-by: Andrii Nakryiko Suggested-by: Alexei Starovoitov Signed-off-by: Jann Horn Signed-off-by: Andrii Nakryiko Cc: stable@vger.kernel.org Link: https://lore.kernel.org/bpf/20241210-bpf-fix-actual-uprobe-uaf-v1-1-19439849dd44@google.com --- kernel/trace/bpf_trace.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 949a3870946c3..a403b05a70913 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2258,6 +2258,13 @@ void perf_event_detach_bpf_prog(struct perf_event *event) bpf_prog_array_free_sleepable(old_array); } + /* + * It could be that the bpf_prog is not sleepable (and will be freed + * via normal RCU), but is called from a point that supports sleepable + * programs and uses tasks-trace-RCU. + */ + synchronize_rcu_tasks_trace(); + bpf_prog_put(event->prog); event->prog = NULL; From 978c4486cca5c7b9253d3ab98a88c8e769cb9bbd Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 8 Dec 2024 15:25:07 +0100 Subject: [PATCH 150/330] bpf,perf: Fix invalid prog_array access in perf_event_detach_bpf_prog Syzbot reported [1] crash that happens for following tracing scenario: - create tracepoint perf event with attr.inherit=1, attach it to the process and set bpf program to it - attached process forks -> chid creates inherited event the new child event shares the parent's bpf program and tp_event (hence prog_array) which is global for tracepoint - exit both process and its child -> release both events - first perf_event_detach_bpf_prog call will release tp_event->prog_array and second perf_event_detach_bpf_prog will crash, because tp_event->prog_array is NULL The fix makes sure the perf_event_detach_bpf_prog checks prog_array is valid before it tries to remove the bpf program from it. [1] https://lore.kernel.org/bpf/Z1MR6dCIKajNS6nU@krava/T/#m91dbf0688221ec7a7fc95e896a7ef9ff93b0b8ad Fixes: 0ee288e69d03 ("bpf,perf: Fix perf_event_detach_bpf_prog error handling") Reported-by: syzbot+2e0d2840414ce817aaac@syzkaller.appspotmail.com Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20241208142507.1207698-1-jolsa@kernel.org --- kernel/trace/bpf_trace.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index a403b05a70913..1b8db5aee9d38 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2250,6 +2250,9 @@ void perf_event_detach_bpf_prog(struct perf_event *event) goto unlock; old_array = bpf_event_rcu_dereference(event->tp_event->prog_array); + if (!old_array) + goto put; + ret = bpf_prog_array_copy(old_array, event->prog, NULL, 0, &new_array); if (ret < 0) { bpf_prog_array_delete_safe(old_array, event->prog); @@ -2258,6 +2261,7 @@ void perf_event_detach_bpf_prog(struct perf_event *event) bpf_prog_array_free_sleepable(old_array); } +put: /* * It could be that the bpf_prog is not sleepable (and will be freed * via normal RCU), but is called from a point that supports sleepable From 27e88bc4df1d80888fe1aaca786a7cc6e69587e2 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 9 Dec 2024 20:10:53 -0800 Subject: [PATCH 151/330] bpf: add find_containing_subprog() utility function Add a utility function, looking for a subprogram containing a given instruction index, rewrite find_subprog() to use this function. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20241210041100.1898468-2-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 01fbef9576e00..277c1892bb9a8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2597,16 +2597,36 @@ static int cmp_subprogs(const void *a, const void *b) ((struct bpf_subprog_info *)b)->start; } +/* Find subprogram that contains instruction at 'off' */ +static struct bpf_subprog_info *find_containing_subprog(struct bpf_verifier_env *env, int off) +{ + struct bpf_subprog_info *vals = env->subprog_info; + int l, r, m; + + if (off >= env->prog->len || off < 0 || env->subprog_cnt == 0) + return NULL; + + l = 0; + r = env->subprog_cnt - 1; + while (l < r) { + m = l + (r - l + 1) / 2; + if (vals[m].start <= off) + l = m; + else + r = m - 1; + } + return &vals[l]; +} + +/* Find subprogram that starts exactly at 'off' */ static int find_subprog(struct bpf_verifier_env *env, int off) { struct bpf_subprog_info *p; - p = bsearch(&off, env->subprog_info, env->subprog_cnt, - sizeof(env->subprog_info[0]), cmp_subprogs); - if (!p) + p = find_containing_subprog(env, off); + if (!p || p->start != off) return -ENOENT; return p - env->subprog_info; - } static int add_subprog(struct bpf_verifier_env *env, int off) From b238e187b4a2d3b54d80aec05a9cab6466b79dde Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 9 Dec 2024 20:10:54 -0800 Subject: [PATCH 152/330] bpf: refactor bpf_helper_changes_pkt_data to use helper number Use BPF helper number instead of function pointer in bpf_helper_changes_pkt_data(). This would simplify usage of this function in verifier.c:check_cfg() (in a follow-up patch), where only helper number is easily available and there is no real need to lookup helper proto. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20241210041100.1898468-3-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 2 +- kernel/bpf/core.c | 2 +- kernel/bpf/verifier.c | 2 +- net/core/filter.c | 63 +++++++++++++++++++----------------------- 4 files changed, 31 insertions(+), 38 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 3a21947f2fd44..0477254bc2d30 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1122,7 +1122,7 @@ bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena); bool bpf_jit_supports_private_stack(void); u64 bpf_arch_uaddress_limit(void); void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie); -bool bpf_helper_changes_pkt_data(void *func); +bool bpf_helper_changes_pkt_data(enum bpf_func_id func_id); static inline bool bpf_dump_raw_ok(const struct cred *cred) { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index a2327c4fdc8bc..6fa8041d48318 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2936,7 +2936,7 @@ void __weak bpf_jit_compile(struct bpf_prog *prog) { } -bool __weak bpf_helper_changes_pkt_data(void *func) +bool __weak bpf_helper_changes_pkt_data(enum bpf_func_id func_id) { return false; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 277c1892bb9a8..ad3f6d28e8e4f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10728,7 +10728,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } /* With LD_ABS/IND some JITs save/restore skb from r1. */ - changes_data = bpf_helper_changes_pkt_data(fn->func); + changes_data = bpf_helper_changes_pkt_data(func_id); if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) { verbose(env, "kernel subsystem misconfigured func %s#%d: r1 != ctx\n", func_id_name(func_id), func_id); diff --git a/net/core/filter.c b/net/core/filter.c index 6625b3f563a4a..efb75eed2e353 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7899,42 +7899,35 @@ static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv6_proto = { #endif /* CONFIG_INET */ -bool bpf_helper_changes_pkt_data(void *func) -{ - if (func == bpf_skb_vlan_push || - func == bpf_skb_vlan_pop || - func == bpf_skb_store_bytes || - func == bpf_skb_change_proto || - func == bpf_skb_change_head || - func == sk_skb_change_head || - func == bpf_skb_change_tail || - func == sk_skb_change_tail || - func == bpf_skb_adjust_room || - func == sk_skb_adjust_room || - func == bpf_skb_pull_data || - func == sk_skb_pull_data || - func == bpf_clone_redirect || - func == bpf_l3_csum_replace || - func == bpf_l4_csum_replace || - func == bpf_xdp_adjust_head || - func == bpf_xdp_adjust_meta || - func == bpf_msg_pull_data || - func == bpf_msg_push_data || - func == bpf_msg_pop_data || - func == bpf_xdp_adjust_tail || -#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) - func == bpf_lwt_seg6_store_bytes || - func == bpf_lwt_seg6_adjust_srh || - func == bpf_lwt_seg6_action || -#endif -#ifdef CONFIG_INET - func == bpf_sock_ops_store_hdr_opt || -#endif - func == bpf_lwt_in_push_encap || - func == bpf_lwt_xmit_push_encap) +bool bpf_helper_changes_pkt_data(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_clone_redirect: + case BPF_FUNC_l3_csum_replace: + case BPF_FUNC_l4_csum_replace: + case BPF_FUNC_lwt_push_encap: + case BPF_FUNC_lwt_seg6_action: + case BPF_FUNC_lwt_seg6_adjust_srh: + case BPF_FUNC_lwt_seg6_store_bytes: + case BPF_FUNC_msg_pop_data: + case BPF_FUNC_msg_pull_data: + case BPF_FUNC_msg_push_data: + case BPF_FUNC_skb_adjust_room: + case BPF_FUNC_skb_change_head: + case BPF_FUNC_skb_change_proto: + case BPF_FUNC_skb_change_tail: + case BPF_FUNC_skb_pull_data: + case BPF_FUNC_skb_store_bytes: + case BPF_FUNC_skb_vlan_pop: + case BPF_FUNC_skb_vlan_push: + case BPF_FUNC_store_hdr_opt: + case BPF_FUNC_xdp_adjust_head: + case BPF_FUNC_xdp_adjust_meta: + case BPF_FUNC_xdp_adjust_tail: return true; - - return false; + default: + return false; + } } const struct bpf_func_proto bpf_event_output_data_proto __weak; From 51081a3f25c742da5a659d7fc6fd77ebfdd555be Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 9 Dec 2024 20:10:55 -0800 Subject: [PATCH 153/330] bpf: track changes_pkt_data property for global functions When processing calls to certain helpers, verifier invalidates all packet pointers in a current state. For example, consider the following program: __attribute__((__noinline__)) long skb_pull_data(struct __sk_buff *sk, __u32 len) { return bpf_skb_pull_data(sk, len); } SEC("tc") int test_invalidate_checks(struct __sk_buff *sk) { int *p = (void *)(long)sk->data; if ((void *)(p + 1) > (void *)(long)sk->data_end) return TCX_DROP; skb_pull_data(sk, 0); *p = 42; return TCX_PASS; } After a call to bpf_skb_pull_data() the pointer 'p' can't be used safely. See function filter.c:bpf_helper_changes_pkt_data() for a list of such helpers. At the moment verifier invalidates packet pointers when processing helper function calls, and does not traverse global sub-programs when processing calls to global sub-programs. This means that calls to helpers done from global sub-programs do not invalidate pointers in the caller state. E.g. the program above is unsafe, but is not rejected by verifier. This commit fixes the omission by computing field bpf_subprog_info->changes_pkt_data for each sub-program before main verification pass. changes_pkt_data should be set if: - subprogram calls helper for which bpf_helper_changes_pkt_data returns true; - subprogram calls a global function, for which bpf_subprog_info->changes_pkt_data should be set. The verifier.c:check_cfg() pass is modified to compute this information. The commit relies on depth first instruction traversal done by check_cfg() and absence of recursive function calls: - check_cfg() would eventually visit every call to subprogram S in a state when S is fully explored; - when S is fully explored: - every direct helper call within S is explored (and thus changes_pkt_data is set if needed); - every call to subprogram S1 called by S was visited with S1 fully explored (and thus S inherits changes_pkt_data from S1). The downside of such approach is that dead code elimination is not taken into account: if a helper call inside global function is dead because of current configuration, verifier would conservatively assume that the call occurs for the purpose of the changes_pkt_data computation. Reported-by: Nick Zavaritsky Closes: https://lore.kernel.org/bpf/0498CA22-5779-4767-9C0C-A9515CEA711F@gmail.com/ Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20241210041100.1898468-4-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 32 +++++++++++++++++++++++++++++++- 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index f4290c179bee0..48b7b2eeb7e21 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -659,6 +659,7 @@ struct bpf_subprog_info { bool args_cached: 1; /* true if bpf_fastcall stack region is used by functions that can't be inlined */ bool keep_fastcall_stack: 1; + bool changes_pkt_data: 1; enum priv_stack_mode priv_stack_mode; u8 arg_cnt; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ad3f6d28e8e4f..6a29b68cebd6f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10042,6 +10042,8 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, verbose(env, "Func#%d ('%s') is global and assumed valid.\n", subprog, sub_name); + if (env->subprog_info[subprog].changes_pkt_data) + clear_all_pkt_pointers(env); /* mark global subprog for verifying after main prog */ subprog_aux(env, subprog)->called = true; clear_caller_saved_regs(env, caller->regs); @@ -16246,6 +16248,29 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char return 0; } +static void mark_subprog_changes_pkt_data(struct bpf_verifier_env *env, int off) +{ + struct bpf_subprog_info *subprog; + + subprog = find_containing_subprog(env, off); + subprog->changes_pkt_data = true; +} + +/* 't' is an index of a call-site. + * 'w' is a callee entry point. + * Eventually this function would be called when env->cfg.insn_state[w] == EXPLORED. + * Rely on DFS traversal order and absence of recursive calls to guarantee that + * callee's change_pkt_data marks would be correct at that moment. + */ +static void merge_callee_effects(struct bpf_verifier_env *env, int t, int w) +{ + struct bpf_subprog_info *caller, *callee; + + caller = find_containing_subprog(env, t); + callee = find_containing_subprog(env, w); + caller->changes_pkt_data |= callee->changes_pkt_data; +} + /* non-recursive DFS pseudo code * 1 procedure DFS-iterative(G,v): * 2 label v as discovered @@ -16379,6 +16404,7 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, bool visit_callee) { int ret, insn_sz; + int w; insn_sz = bpf_is_ldimm64(&insns[t]) ? 2 : 1; ret = push_insn(t, t + insn_sz, FALLTHROUGH, env); @@ -16390,8 +16416,10 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, mark_jmp_point(env, t + insn_sz); if (visit_callee) { + w = t + insns[t].imm + 1; mark_prune_point(env, t); - ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env); + merge_callee_effects(env, t, w); + ret = push_insn(t, w, BRANCH, env); } return ret; } @@ -16708,6 +16736,8 @@ static int visit_insn(int t, struct bpf_verifier_env *env) mark_prune_point(env, t); mark_jmp_point(env, t); } + if (bpf_helper_call(insn) && bpf_helper_changes_pkt_data(insn->imm)) + mark_subprog_changes_pkt_data(env, t); if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { struct bpf_kfunc_call_arg_meta meta; From 3f23ee5590d9605dbde9a5e1d4b97637a4803329 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 9 Dec 2024 20:10:56 -0800 Subject: [PATCH 154/330] selftests/bpf: test for changing packet data from global functions Check if verifier is aware of packet pointers invalidation done in global functions. Based on a test shared by Nick Zavaritsky in [0]. [0] https://lore.kernel.org/bpf/0498CA22-5779-4767-9C0C-A9515CEA711F@gmail.com/ Suggested-by: Nick Zavaritsky Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20241210041100.1898468-5-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/verifier_sock.c | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_sock.c b/tools/testing/selftests/bpf/progs/verifier_sock.c index d3e70e38e4420..51826379a1aae 100644 --- a/tools/testing/selftests/bpf/progs/verifier_sock.c +++ b/tools/testing/selftests/bpf/progs/verifier_sock.c @@ -1037,4 +1037,32 @@ __naked void sock_create_read_src_port(void) : __clobber_all); } +__noinline +long skb_pull_data2(struct __sk_buff *sk, __u32 len) +{ + return bpf_skb_pull_data(sk, len); +} + +__noinline +long skb_pull_data1(struct __sk_buff *sk, __u32 len) +{ + return skb_pull_data2(sk, len); +} + +/* global function calls bpf_skb_pull_data(), which invalidates packet + * pointers established before global function call. + */ +SEC("tc") +__failure __msg("invalid mem access") +int invalidate_pkt_pointers_from_global_func(struct __sk_buff *sk) +{ + int *p = (void *)(long)sk->data; + + if ((void *)(p + 1) > (void *)(long)sk->data_end) + return TCX_DROP; + skb_pull_data1(sk, 0); + *p = 42; /* this is unsafe */ + return TCX_PASS; +} + char _license[] SEC("license") = "GPL"; From 81f6d0530ba031b5f038a091619bf2ff29568852 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 9 Dec 2024 20:10:57 -0800 Subject: [PATCH 155/330] bpf: check changes_pkt_data property for extension programs When processing calls to global sub-programs, verifier decides whether to invalidate all packet pointers in current state depending on the changes_pkt_data property of the global sub-program. Because of this, an extension program replacing a global sub-program must be compatible with changes_pkt_data property of the sub-program being replaced. This commit: - adds changes_pkt_data flag to struct bpf_prog_aux: - this flag is set in check_cfg() for main sub-program; - in jit_subprogs() for other sub-programs; - modifies bpf_check_attach_btf_id() to check changes_pkt_data flag; - moves call to check_attach_btf_id() after the call to check_cfg(), because it needs changes_pkt_data flag to be set: bpf_check: ... ... - check_attach_btf_id resolve_pseudo_ldimm64 resolve_pseudo_ldimm64 --> bpf_prog_is_offloaded bpf_prog_is_offloaded check_cfg check_cfg + check_attach_btf_id ... ... The following fields are set by check_attach_btf_id(): - env->ops - prog->aux->attach_btf_trace - prog->aux->attach_func_name - prog->aux->attach_func_proto - prog->aux->dst_trampoline - prog->aux->mod - prog->aux->saved_dst_attach_type - prog->aux->saved_dst_prog_type - prog->expected_attach_type Neither of these fields are used by resolve_pseudo_ldimm64() or bpf_prog_offload_verifier_prep() (for netronome and netdevsim drivers), so the reordering is safe. Suggested-by: Alexei Starovoitov Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20241210041100.1898468-6-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + kernel/bpf/verifier.c | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index eaee2a819f4c1..fe392d0749730 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1527,6 +1527,7 @@ struct bpf_prog_aux { bool is_extended; /* true if extended by freplace program */ bool jits_use_priv_stack; bool priv_stack_requested; + bool changes_pkt_data; u64 prog_array_member_cnt; /* counts how many times as member of prog_array */ struct mutex ext_mutex; /* mutex for is_extended and prog_array_member_cnt */ struct bpf_arena *arena; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6a29b68cebd6f..c2e5d0e6e3d01 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -16872,6 +16872,7 @@ static int check_cfg(struct bpf_verifier_env *env) } } ret = 0; /* cfg looks good */ + env->prog->aux->changes_pkt_data = env->subprog_info[0].changes_pkt_data; err_free: kvfree(insn_state); @@ -20361,6 +20362,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->num_exentries = num_exentries; func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable; func[i]->aux->exception_cb = env->subprog_info[i].is_exception_cb; + func[i]->aux->changes_pkt_data = env->subprog_info[i].changes_pkt_data; if (!i) func[i]->aux->exception_boundary = env->seen_exception; func[i] = bpf_int_jit_compile(func[i]); @@ -22225,6 +22227,12 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, "Extension programs should be JITed\n"); return -EINVAL; } + if (prog->aux->changes_pkt_data && + !aux->func[subprog]->aux->changes_pkt_data) { + bpf_log(log, + "Extension program changes packet data, while original does not\n"); + return -EINVAL; + } } if (!tgt_prog->jited) { bpf_log(log, "Can attach to only JITed progs\n"); @@ -22690,10 +22698,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (ret < 0) goto skip_full_check; - ret = check_attach_btf_id(env); - if (ret) - goto skip_full_check; - ret = resolve_pseudo_ldimm64(env); if (ret < 0) goto skip_full_check; @@ -22708,6 +22712,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (ret < 0) goto skip_full_check; + ret = check_attach_btf_id(env); + if (ret) + goto skip_full_check; + ret = mark_fastcall_patterns(env); if (ret < 0) goto skip_full_check; From 89ff40890d8f12a7d7e93fb602cc27562f3834f0 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 9 Dec 2024 20:10:58 -0800 Subject: [PATCH 156/330] selftests/bpf: freplace tests for tracking of changes_packet_data Try different combinations of global functions replacement: - replace function that changes packet data with one that doesn't; - replace function that changes packet data with one that does; - replace function that doesn't change packet data with one that does; - replace function that doesn't change packet data with one that doesn't; Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20241210041100.1898468-7-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- .../bpf/prog_tests/changes_pkt_data.c | 76 +++++++++++++++++++ .../selftests/bpf/progs/changes_pkt_data.c | 26 +++++++ .../bpf/progs/changes_pkt_data_freplace.c | 18 +++++ 3 files changed, 120 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/changes_pkt_data.c create mode 100644 tools/testing/selftests/bpf/progs/changes_pkt_data.c create mode 100644 tools/testing/selftests/bpf/progs/changes_pkt_data_freplace.c diff --git a/tools/testing/selftests/bpf/prog_tests/changes_pkt_data.c b/tools/testing/selftests/bpf/prog_tests/changes_pkt_data.c new file mode 100644 index 0000000000000..c0c7202f6c5c5 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/changes_pkt_data.c @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bpf/libbpf.h" +#include "changes_pkt_data_freplace.skel.h" +#include "changes_pkt_data.skel.h" +#include + +static void print_verifier_log(const char *log) +{ + if (env.verbosity >= VERBOSE_VERY) + fprintf(stdout, "VERIFIER LOG:\n=============\n%s=============\n", log); +} + +static void test_aux(const char *main_prog_name, const char *freplace_prog_name, bool expect_load) +{ + struct changes_pkt_data_freplace *freplace = NULL; + struct bpf_program *freplace_prog = NULL; + LIBBPF_OPTS(bpf_object_open_opts, opts); + struct changes_pkt_data *main = NULL; + char log[16*1024]; + int err; + + opts.kernel_log_buf = log; + opts.kernel_log_size = sizeof(log); + if (env.verbosity >= VERBOSE_SUPER) + opts.kernel_log_level = 1 | 2 | 4; + main = changes_pkt_data__open_opts(&opts); + if (!ASSERT_OK_PTR(main, "changes_pkt_data__open")) + goto out; + err = changes_pkt_data__load(main); + print_verifier_log(log); + if (!ASSERT_OK(err, "changes_pkt_data__load")) + goto out; + freplace = changes_pkt_data_freplace__open_opts(&opts); + if (!ASSERT_OK_PTR(freplace, "changes_pkt_data_freplace__open")) + goto out; + freplace_prog = bpf_object__find_program_by_name(freplace->obj, freplace_prog_name); + if (!ASSERT_OK_PTR(freplace_prog, "freplace_prog")) + goto out; + bpf_program__set_autoload(freplace_prog, true); + bpf_program__set_autoattach(freplace_prog, true); + bpf_program__set_attach_target(freplace_prog, + bpf_program__fd(main->progs.dummy), + main_prog_name); + err = changes_pkt_data_freplace__load(freplace); + print_verifier_log(log); + if (expect_load) { + ASSERT_OK(err, "changes_pkt_data_freplace__load"); + } else { + ASSERT_ERR(err, "changes_pkt_data_freplace__load"); + ASSERT_HAS_SUBSTR(log, "Extension program changes packet data", "error log"); + } + +out: + changes_pkt_data_freplace__destroy(freplace); + changes_pkt_data__destroy(main); +} + +/* There are two global subprograms in both changes_pkt_data.skel.h: + * - one changes packet data; + * - another does not. + * It is ok to freplace subprograms that change packet data with those + * that either do or do not. It is only ok to freplace subprograms + * that do not change packet data with those that do not as well. + * The below tests check outcomes for each combination of such freplace. + */ +void test_changes_pkt_data_freplace(void) +{ + if (test__start_subtest("changes_with_changes")) + test_aux("changes_pkt_data", "changes_pkt_data", true); + if (test__start_subtest("changes_with_doesnt_change")) + test_aux("changes_pkt_data", "does_not_change_pkt_data", true); + if (test__start_subtest("doesnt_change_with_changes")) + test_aux("does_not_change_pkt_data", "changes_pkt_data", false); + if (test__start_subtest("doesnt_change_with_doesnt_change")) + test_aux("does_not_change_pkt_data", "does_not_change_pkt_data", true); +} diff --git a/tools/testing/selftests/bpf/progs/changes_pkt_data.c b/tools/testing/selftests/bpf/progs/changes_pkt_data.c new file mode 100644 index 0000000000000..f87da8e9d6b33 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/changes_pkt_data.c @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +__noinline +long changes_pkt_data(struct __sk_buff *sk, __u32 len) +{ + return bpf_skb_pull_data(sk, len); +} + +__noinline __weak +long does_not_change_pkt_data(struct __sk_buff *sk, __u32 len) +{ + return 0; +} + +SEC("tc") +int dummy(struct __sk_buff *sk) +{ + changes_pkt_data(sk, 0); + does_not_change_pkt_data(sk, 0); + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/changes_pkt_data_freplace.c b/tools/testing/selftests/bpf/progs/changes_pkt_data_freplace.c new file mode 100644 index 0000000000000..0e525beb8603c --- /dev/null +++ b/tools/testing/selftests/bpf/progs/changes_pkt_data_freplace.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +SEC("?freplace") +long changes_pkt_data(struct __sk_buff *sk, __u32 len) +{ + return bpf_skb_pull_data(sk, len); +} + +SEC("?freplace") +long does_not_change_pkt_data(struct __sk_buff *sk, __u32 len) +{ + return 0; +} + +char _license[] SEC("license") = "GPL"; From 1a4607ffba35bf2a630aab299e34dd3f6e658d70 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 9 Dec 2024 20:10:59 -0800 Subject: [PATCH 157/330] bpf: consider that tail calls invalidate packet pointers Tail-called programs could execute any of the helpers that invalidate packet pointers. Hence, conservatively assume that each tail call invalidates packet pointers. Making the change in bpf_helper_changes_pkt_data() automatically makes use of check_cfg() logic that computes 'changes_pkt_data' effect for global sub-programs, such that the following program could be rejected: int tail_call(struct __sk_buff *sk) { bpf_tail_call_static(sk, &jmp_table, 0); return 0; } SEC("tc") int not_safe(struct __sk_buff *sk) { int *p = (void *)(long)sk->data; ... make p valid ... tail_call(sk); *p = 42; /* this is unsafe */ ... } The tc_bpf2bpf.c:subprog_tc() needs change: mark it as a function that can invalidate packet pointers. Otherwise, it can't be freplaced with tailcall_freplace.c:entry_freplace() that does a tail call. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20241210041100.1898468-8-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 2 ++ tools/testing/selftests/bpf/progs/tc_bpf2bpf.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index efb75eed2e353..21131ec25f24a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7924,6 +7924,8 @@ bool bpf_helper_changes_pkt_data(enum bpf_func_id func_id) case BPF_FUNC_xdp_adjust_head: case BPF_FUNC_xdp_adjust_meta: case BPF_FUNC_xdp_adjust_tail: + /* tail-called program could call any of the above */ + case BPF_FUNC_tail_call: return true; default: return false; diff --git a/tools/testing/selftests/bpf/progs/tc_bpf2bpf.c b/tools/testing/selftests/bpf/progs/tc_bpf2bpf.c index d1a57f7d09bd8..fe6249d99b315 100644 --- a/tools/testing/selftests/bpf/progs/tc_bpf2bpf.c +++ b/tools/testing/selftests/bpf/progs/tc_bpf2bpf.c @@ -11,6 +11,8 @@ int subprog_tc(struct __sk_buff *skb) __sink(skb); __sink(ret); + /* let verifier know that 'subprog_tc' can change pointers to skb->data */ + bpf_skb_change_proto(skb, 0, 0); return ret; } From d9706b56e13b7916461ca6b4b731e169ed44ed09 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 9 Dec 2024 20:11:00 -0800 Subject: [PATCH 158/330] selftests/bpf: validate that tail call invalidates packet pointers Add a test case with a tail call done from a global sub-program. Such tails calls should be considered as invalidating packet pointers. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20241210041100.1898468-9-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/verifier_sock.c | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_sock.c b/tools/testing/selftests/bpf/progs/verifier_sock.c index 51826379a1aae..0d5e56dffabb8 100644 --- a/tools/testing/selftests/bpf/progs/verifier_sock.c +++ b/tools/testing/selftests/bpf/progs/verifier_sock.c @@ -50,6 +50,13 @@ struct { __uint(map_flags, BPF_F_NO_PREALLOC); } sk_storage_map SEC(".maps"); +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} jmp_table SEC(".maps"); + SEC("cgroup/skb") __description("skb->sk: no NULL check") __failure __msg("invalid mem access 'sock_common_or_null'") @@ -1065,4 +1072,25 @@ int invalidate_pkt_pointers_from_global_func(struct __sk_buff *sk) return TCX_PASS; } +__noinline +int tail_call(struct __sk_buff *sk) +{ + bpf_tail_call_static(sk, &jmp_table, 0); + return 0; +} + +/* Tail calls invalidate packet pointers. */ +SEC("tc") +__failure __msg("invalid mem access") +int invalidate_pkt_pointers_by_tail_call(struct __sk_buff *sk) +{ + int *p = (void *)(long)sk->data; + + if ((void *)(p + 1) > (void *)(long)sk->data_end) + return TCX_DROP; + tail_call(sk); + *p = 42; /* this is unsafe */ + return TCX_PASS; +} + char _license[] SEC("license") = "GPL"; From ccb84dc8f4a02e7d30ffd388522996546b4d00e1 Mon Sep 17 00:00:00 2001 From: Paul Barker Date: Tue, 3 Dec 2024 14:37:29 +0000 Subject: [PATCH 159/330] Documentation: PM: Clarify pm_runtime_resume_and_get() return value Update the documentation to match the behaviour of the code. pm_runtime_resume_and_get() always returns 0 on success, even if __pm_runtime_resume() returns 1. Fixes: 2c412337cfe6 ("PM: runtime: Add documentation for pm_runtime_resume_and_get()") Signed-off-by: Paul Barker Link: https://patch.msgid.link/20241203143729.478-1-paul.barker.ct@bp.renesas.com [ rjw: Subject and changelog edits, adjusted new comment formatting ] Signed-off-by: Rafael J. Wysocki --- Documentation/power/runtime_pm.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Documentation/power/runtime_pm.rst b/Documentation/power/runtime_pm.rst index 53d1996460abf..12f429359a823 100644 --- a/Documentation/power/runtime_pm.rst +++ b/Documentation/power/runtime_pm.rst @@ -347,7 +347,9 @@ drivers/base/power/runtime.c and include/linux/pm_runtime.h: `int pm_runtime_resume_and_get(struct device *dev);` - run pm_runtime_resume(dev) and if successful, increment the device's - usage counter; return the result of pm_runtime_resume + usage counter; returns 0 on success (whether or not the device's + runtime PM status was already 'active') or the error code from + pm_runtime_resume() on failure. `int pm_request_idle(struct device *dev);` - submit a request to execute the subsystem-level idle callback for the From c4441ca86afe4814039ee1b32c39d833c1a16bbc Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Tue, 10 Dec 2024 11:42:45 +0000 Subject: [PATCH 160/330] bpf: fix potential error return The bpf_remove_insns() function returns WARN_ON_ONCE(error), where error is a result of bpf_adj_branches(), and thus should be always 0 However, if for any reason it is not 0, then it will be converted to boolean by WARN_ON_ONCE and returned to user space as 1, not an actual error value. Fix this by returning the original err after the WARN check. Signed-off-by: Anton Protopopov Acked-by: Jiri Olsa Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20241210114245.836164-1-aspsk@isovalent.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 6fa8041d48318..da729cbbaeb90 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -539,6 +539,8 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt) { + int err; + /* Branch offsets can't overflow when program is shrinking, no need * to call bpf_adj_branches(..., true) here */ @@ -546,7 +548,9 @@ int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt) sizeof(struct bpf_insn) * (prog->len - off - cnt)); prog->len -= cnt; - return WARN_ON_ONCE(bpf_adj_branches(prog, off, off + cnt, off, false)); + err = bpf_adj_branches(prog, off, off + cnt, off, false); + WARN_ON_ONCE(err); + return err; } static void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp) From 790eb09e59709a1ffc1c64fe4aae2789120851b0 Mon Sep 17 00:00:00 2001 From: LongPing Wei Date: Thu, 7 Nov 2024 10:04:41 +0800 Subject: [PATCH 161/330] block: get wp_offset by bdev_offset_from_zone_start Call bdev_offset_from_zone_start() instead of open-coding it. Fixes: dd291d77cc90 ("block: Introduce zone write plugging") Signed-off-by: LongPing Wei Reviewed-by: Damien Le Moal Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20241107020439.1644577-1-weilongping@oppo.com Signed-off-by: Jens Axboe --- block/blk-zoned.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 77f12fc2086fa..84da1eadff642 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -555,7 +555,7 @@ static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk, spin_lock_init(&zwplug->lock); zwplug->flags = 0; zwplug->zone_no = zno; - zwplug->wp_offset = sector & (disk->queue->limits.chunk_sectors - 1); + zwplug->wp_offset = bdev_offset_from_zone_start(disk->part0, sector); bio_list_init(&zwplug->bio_list); INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work); zwplug->disk = disk; From 7d0d673627e20cfa3b21a829a896ce03b58a4f1c Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Tue, 10 Dec 2024 20:08:14 +0100 Subject: [PATCH 162/330] bpf: Fix theoretical prog_array UAF in __uprobe_perf_func() Currently, the pointer stored in call->prog_array is loaded in __uprobe_perf_func(), with no RCU annotation and no immediately visible RCU protection, so it looks as if the loaded pointer can immediately be dangling. Later, bpf_prog_run_array_uprobe() starts a RCU-trace read-side critical section, but this is too late. It then uses rcu_dereference_check(), but this use of rcu_dereference_check() does not actually dereference anything. Fix it by aligning the semantics to bpf_prog_run_array(): Let the caller provide rcu_read_lock_trace() protection and then load call->prog_array with rcu_dereference_check(). This issue seems to be theoretical: I don't know of any way to reach this code without having handle_swbp() further up the stack, which is already holding a rcu_read_lock_trace() lock, so where we take rcu_read_lock_trace() in __uprobe_perf_func()/bpf_prog_run_array_uprobe() doesn't actually have any effect. Fixes: 8c7dcb84e3b7 ("bpf: implement sleepable uprobes by chaining gps") Suggested-by: Andrii Nakryiko Signed-off-by: Jann Horn Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20241210-bpf-fix-uprobe-uaf-v4-1-5fc8959b2b74@google.com --- include/linux/bpf.h | 13 +++++-------- kernel/trace/trace_uprobe.c | 6 +++++- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index fe392d0749730..805040813f5d5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2194,26 +2194,25 @@ bpf_prog_run_array(const struct bpf_prog_array *array, * rcu-protected dynamically sized maps. */ static __always_inline u32 -bpf_prog_run_array_uprobe(const struct bpf_prog_array __rcu *array_rcu, +bpf_prog_run_array_uprobe(const struct bpf_prog_array *array, const void *ctx, bpf_prog_run_fn run_prog) { const struct bpf_prog_array_item *item; const struct bpf_prog *prog; - const struct bpf_prog_array *array; struct bpf_run_ctx *old_run_ctx; struct bpf_trace_run_ctx run_ctx; u32 ret = 1; might_fault(); + RCU_LOCKDEP_WARN(!rcu_read_lock_trace_held(), "no rcu lock held"); + + if (unlikely(!array)) + return ret; - rcu_read_lock_trace(); migrate_disable(); run_ctx.is_uprobe = true; - array = rcu_dereference_check(array_rcu, rcu_read_lock_trace_held()); - if (unlikely(!array)) - goto out; old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); item = &array->items[0]; while ((prog = READ_ONCE(item->prog))) { @@ -2228,9 +2227,7 @@ bpf_prog_run_array_uprobe(const struct bpf_prog_array __rcu *array_rcu, rcu_read_unlock(); } bpf_reset_run_ctx(old_run_ctx); -out: migrate_enable(); - rcu_read_unlock_trace(); return ret; } diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index fed382b7881b8..4875e7f5de3db 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1402,9 +1402,13 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, #ifdef CONFIG_BPF_EVENTS if (bpf_prog_array_valid(call)) { + const struct bpf_prog_array *array; u32 ret; - ret = bpf_prog_run_array_uprobe(call->prog_array, regs, bpf_prog_run); + rcu_read_lock_trace(); + array = rcu_dereference_check(call->prog_array, rcu_read_lock_trace_held()); + ret = bpf_prog_run_array_uprobe(array, regs, bpf_prog_run); + rcu_read_unlock_trace(); if (!ret) return; } From da4d8c83358163df9a4addaeba0ef8bcb03b22e8 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 15 Nov 2024 09:00:32 -0800 Subject: [PATCH 163/330] cxl/pci: Fix potential bogus return value upon successful probing If cxl_pci_ras_unmask() returns non-zero, cxl_pci_probe() will end up returning that value, instead of zero. Fixes: 248529edc86f ("cxl: add RAS status unmasking for CXL") Reviewed-by: Fan Ni Signed-off-by: Davidlohr Bueso Reviewed-by: Ira Weiny Link: https://patch.msgid.link/20241115170032.108445-1-dave@stgolabs.net Signed-off-by: Dave Jiang --- drivers/cxl/pci.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index 0241d1d7133a4..26ab06c9deffb 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -1032,8 +1032,7 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (rc) return rc; - rc = cxl_pci_ras_unmask(pdev); - if (rc) + if (cxl_pci_ras_unmask(pdev)) dev_dbg(&pdev->dev, "No RAS reporting unmasked\n"); pci_save_state(pdev); From 09ceba3a93450b652ae6910b6f65be99885f4437 Mon Sep 17 00:00:00 2001 From: Li Ming Date: Fri, 29 Nov 2024 21:28:25 +0800 Subject: [PATCH 164/330] cxl/pci: Check dport->regs.rcd_pcie_cap availability before accessing RCD Upstream Port's PCI Express Capability is a component registers block stored in RCD Upstream Port RCRB. CXL PCI driver helps to map it during the RCD probing, but mapping failure is allowed for component registers blocks in CXL PCI driver. dport->regs.rcd_pcie_cap is used to store the virtual address of the RCD Upstream Port's PCI Express Capability, add a dport->regs.rcd_pcie_cap checking in rcd_pcie_cap_emit() just in case user accesses a invalid address via RCD sysfs. Fixes: c5eaec79fa43 ("cxl/pci: Add sysfs attribute for CXL 1.1 device link status") Signed-off-by: Li Ming Reviewed-by: Alison Schofield Reviewed-by: Dan Williams Link: https://patch.msgid.link/20241129132825.569237-1-ming.li@zohomail.com Signed-off-by: Dave Jiang --- drivers/cxl/pci.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index 26ab06c9deffb..6d94ff4a4f1a6 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -836,6 +836,9 @@ static ssize_t rcd_pcie_cap_emit(struct device *dev, u16 offset, char *buf, size if (!root_dev) return -ENXIO; + if (!dport->regs.rcd_pcie_cap) + return -ENXIO; + guard(device)(root_dev); if (!root_dev->driver) return -ENXIO; From 76467a94810c2aa4dd3096903291ac6df30c399e Mon Sep 17 00:00:00 2001 From: Huaisheng Ye Date: Mon, 9 Dec 2024 15:33:02 -0800 Subject: [PATCH 165/330] cxl/region: Fix region creation for greater than x2 switches The cxl_port_setup_targets() algorithm fails to identify valid target list ordering in the presence of 4-way and above switches resulting in 'cxl create-region' failures of the form: $ cxl create-region -d decoder0.0 -g 1024 -s 2G -t ram -w 8 -m mem4 mem1 mem6 mem3 mem2 mem5 mem7 mem0 cxl region: create_region: region0: failed to set target7 to mem0 cxl region: cmd_create_region: created 0 regions [kernel debug message] check_last_peer:1213: cxl region0: pci0000:0c:port1: cannot host mem6:decoder7.0 at 2 bus_remove_device:574: bus: 'cxl': remove device region0 QEMU can create this failing topology: ACPI0017:00 [root0] | HB_0 [port1] / \ RP_0 RP_1 | | USP [port2] USP [port3] / / \ \ / / \ \ DSP DSP DSP DSP DSP DSP DSP DSP | | | | | | | | mem4 mem6 mem2 mem7 mem1 mem3 mem5 mem0 Pos: 0 2 4 6 1 3 5 7 HB: Host Bridge RP: Root Port USP: Upstream Port DSP: Downstream Port ...with the following command steps: $ qemu-system-x86_64 -machine q35,cxl=on,accel=tcg \ -smp cpus=8 \ -m 8G \ -hda /home/work/vm-images/centos-stream8-02.qcow2 \ -object memory-backend-ram,size=4G,id=m0 \ -object memory-backend-ram,size=4G,id=m1 \ -object memory-backend-ram,size=2G,id=cxl-mem0 \ -object memory-backend-ram,size=2G,id=cxl-mem1 \ -object memory-backend-ram,size=2G,id=cxl-mem2 \ -object memory-backend-ram,size=2G,id=cxl-mem3 \ -object memory-backend-ram,size=2G,id=cxl-mem4 \ -object memory-backend-ram,size=2G,id=cxl-mem5 \ -object memory-backend-ram,size=2G,id=cxl-mem6 \ -object memory-backend-ram,size=2G,id=cxl-mem7 \ -numa node,memdev=m0,cpus=0-3,nodeid=0 \ -numa node,memdev=m1,cpus=4-7,nodeid=1 \ -netdev user,id=net0,hostfwd=tcp::2222-:22 \ -device virtio-net-pci,netdev=net0 \ -device pxb-cxl,bus_nr=12,bus=pcie.0,id=cxl.1 \ -device cxl-rp,port=0,bus=cxl.1,id=root_port0,chassis=0,slot=0 \ -device cxl-rp,port=1,bus=cxl.1,id=root_port1,chassis=0,slot=1 \ -device cxl-upstream,bus=root_port0,id=us0 \ -device cxl-downstream,port=0,bus=us0,id=swport0,chassis=0,slot=4 \ -device cxl-type3,bus=swport0,volatile-memdev=cxl-mem0,id=cxl-vmem0 \ -device cxl-downstream,port=1,bus=us0,id=swport1,chassis=0,slot=5 \ -device cxl-type3,bus=swport1,volatile-memdev=cxl-mem1,id=cxl-vmem1 \ -device cxl-downstream,port=2,bus=us0,id=swport2,chassis=0,slot=6 \ -device cxl-type3,bus=swport2,volatile-memdev=cxl-mem2,id=cxl-vmem2 \ -device cxl-downstream,port=3,bus=us0,id=swport3,chassis=0,slot=7 \ -device cxl-type3,bus=swport3,volatile-memdev=cxl-mem3,id=cxl-vmem3 \ -device cxl-upstream,bus=root_port1,id=us1 \ -device cxl-downstream,port=4,bus=us1,id=swport4,chassis=0,slot=8 \ -device cxl-type3,bus=swport4,volatile-memdev=cxl-mem4,id=cxl-vmem4 \ -device cxl-downstream,port=5,bus=us1,id=swport5,chassis=0,slot=9 \ -device cxl-type3,bus=swport5,volatile-memdev=cxl-mem5,id=cxl-vmem5 \ -device cxl-downstream,port=6,bus=us1,id=swport6,chassis=0,slot=10 \ -device cxl-type3,bus=swport6,volatile-memdev=cxl-mem6,id=cxl-vmem6 \ -device cxl-downstream,port=7,bus=us1,id=swport7,chassis=0,slot=11 \ -device cxl-type3,bus=swport7,volatile-memdev=cxl-mem7,id=cxl-vmem7 \ -M cxl-fmw.0.targets.0=cxl.1,cxl-fmw.0.size=32G & In Guest OS: $ cxl create-region -d decoder0.0 -g 1024 -s 2G -t ram -w 8 -m mem4 mem1 mem6 mem3 mem2 mem5 mem7 mem0 Fix the method to calculate @distance by iterativeley multiplying the number of targets per switch port. This also follows the algorithm recommended here [1]. Fixes: 27b3f8d13830 ("cxl/region: Program target lists") Link: http://lore.kernel.org/6538824b52349_7258329466@dwillia2-xfh.jf.intel.com.notmuch [1] Signed-off-by: Huaisheng Ye Tested-by: Li Zhijian [djbw: add a comment explaining 'distance'] Signed-off-by: Dan Williams Link: https://patch.msgid.link/173378716722.1270362.9546805175813426729.stgit@dwillia2-xfh.jf.intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/region.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index d778996507984..b98b1ccffd1ca 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -1295,6 +1295,7 @@ static int cxl_port_setup_targets(struct cxl_port *port, struct cxl_region_params *p = &cxlr->params; struct cxl_decoder *cxld = cxl_rr->decoder; struct cxl_switch_decoder *cxlsd; + struct cxl_port *iter = port; u16 eig, peig; u8 eiw, peiw; @@ -1311,16 +1312,26 @@ static int cxl_port_setup_targets(struct cxl_port *port, cxlsd = to_cxl_switch_decoder(&cxld->dev); if (cxl_rr->nr_targets_set) { - int i, distance; + int i, distance = 1; + struct cxl_region_ref *cxl_rr_iter; /* - * Passthrough decoders impose no distance requirements between - * peers + * The "distance" between peer downstream ports represents which + * endpoint positions in the region interleave a given port can + * host. + * + * For example, at the root of a hierarchy the distance is + * always 1 as every index targets a different host-bridge. At + * each subsequent switch level those ports map every Nth region + * position where N is the width of the switch == distance. */ - if (cxl_rr->nr_targets == 1) - distance = 0; - else - distance = p->nr_targets / cxl_rr->nr_targets; + do { + cxl_rr_iter = cxl_rr_load(iter, cxlr); + distance *= cxl_rr_iter->nr_targets; + iter = to_cxl_port(iter->dev.parent); + } while (!is_cxl_root(iter)); + distance *= cxlrd->cxlsd.cxld.interleave_ways; + for (i = 0; i < cxl_rr->nr_targets_set; i++) if (ep->dport == cxlsd->target[i]) { rc = check_last_peer(cxled, ep, cxl_rr, From 2b904d61a97e8ba79e3bc216ba290fd7e1d85028 Mon Sep 17 00:00:00 2001 From: Hobin Woo Date: Thu, 5 Dec 2024 11:31:19 +0900 Subject: [PATCH 166/330] ksmbd: retry iterate_dir in smb2_query_dir Some file systems do not ensure that the single call of iterate_dir reaches the end of the directory. For example, FUSE fetches entries from a daemon using 4KB buffer and stops fetching if entries exceed the buffer. And then an actor of caller, KSMBD, is used to fill the entries from the buffer. Thus, pattern searching on FUSE, files located after the 4KB could not be found and STATUS_NO_SUCH_FILE was returned. Signed-off-by: Hobin Woo Reviewed-by: Sungjong Seo Reviewed-by: Namjae Jeon Tested-by: Yoonho Shin Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 12 +++++++++++- fs/smb/server/vfs.h | 1 + 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 4f539eeadbb0d..558ebb2ba0fd2 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -4228,6 +4228,7 @@ static bool __query_dir(struct dir_context *ctx, const char *name, int namlen, /* dot and dotdot entries are already reserved */ if (!strcmp(".", name) || !strcmp("..", name)) return true; + d_info->num_scan++; if (ksmbd_share_veto_filename(priv->work->tcon->share_conf, name)) return true; if (!match_pattern(name, namlen, priv->search_pattern)) @@ -4390,8 +4391,17 @@ int smb2_query_dir(struct ksmbd_work *work) query_dir_private.info_level = req->FileInformationClass; dir_fp->readdir_data.private = &query_dir_private; set_ctx_actor(&dir_fp->readdir_data.ctx, __query_dir); - +again: + d_info.num_scan = 0; rc = iterate_dir(dir_fp->filp, &dir_fp->readdir_data.ctx); + /* + * num_entry can be 0 if the directory iteration stops before reaching + * the end of the directory and no file is matched with the search + * pattern. + */ + if (rc >= 0 && !d_info.num_entry && d_info.num_scan && + d_info.out_buf_len > 0) + goto again; /* * req->OutputBufferLength is too small to contain even one entry. * In this case, it immediately returns OutputBufferLength 0 to client. diff --git a/fs/smb/server/vfs.h b/fs/smb/server/vfs.h index cb76f4b5bafe8..06903024a2d88 100644 --- a/fs/smb/server/vfs.h +++ b/fs/smb/server/vfs.h @@ -43,6 +43,7 @@ struct ksmbd_dir_info { char *rptr; int name_len; int out_buf_len; + int num_scan; int num_entry; int data_count; int last_entry_offset; From b95629435b84b9ecc0c765995204a4d8a913ed52 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Thu, 5 Dec 2024 21:38:47 +0900 Subject: [PATCH 167/330] ksmbd: fix racy issue from session lookup and expire Increment the session reference count within the lock for lookup to avoid racy issue with session expire. Cc: stable@vger.kernel.org Reported-by: zdi-disclosures@trendmicro.com # ZDI-CAN-25737 Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/auth.c | 2 ++ fs/smb/server/mgmt/user_session.c | 6 +++++- fs/smb/server/server.c | 4 ++-- fs/smb/server/smb2pdu.c | 27 ++++++++++++++------------- 4 files changed, 23 insertions(+), 16 deletions(-) diff --git a/fs/smb/server/auth.c b/fs/smb/server/auth.c index 1d1ffd0acaca2..2a5b4a96bf993 100644 --- a/fs/smb/server/auth.c +++ b/fs/smb/server/auth.c @@ -1016,6 +1016,8 @@ static int ksmbd_get_encryption_key(struct ksmbd_work *work, __u64 ses_id, ses_enc_key = enc ? sess->smb3encryptionkey : sess->smb3decryptionkey; + if (enc) + ksmbd_user_session_get(sess); memcpy(key, ses_enc_key, SMB3_ENC_DEC_KEY_SIZE); return 0; diff --git a/fs/smb/server/mgmt/user_session.c b/fs/smb/server/mgmt/user_session.c index df92d746e89ca..71c6939dfbf13 100644 --- a/fs/smb/server/mgmt/user_session.c +++ b/fs/smb/server/mgmt/user_session.c @@ -263,8 +263,10 @@ struct ksmbd_session *ksmbd_session_lookup(struct ksmbd_conn *conn, down_read(&conn->session_lock); sess = xa_load(&conn->sessions, id); - if (sess) + if (sess) { sess->last_active = jiffies; + ksmbd_user_session_get(sess); + } up_read(&conn->session_lock); return sess; } @@ -275,6 +277,8 @@ struct ksmbd_session *ksmbd_session_lookup_slowpath(unsigned long long id) down_read(&sessions_table_lock); sess = __session_lookup(id); + if (sess) + ksmbd_user_session_get(sess); up_read(&sessions_table_lock); return sess; diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c index 930d7566b52ef..3ba95bd8edeb2 100644 --- a/fs/smb/server/server.c +++ b/fs/smb/server/server.c @@ -241,14 +241,14 @@ static void __handle_ksmbd_work(struct ksmbd_work *work, if (work->tcon) ksmbd_tree_connect_put(work->tcon); smb3_preauth_hash_rsp(work); - if (work->sess) - ksmbd_user_session_put(work->sess); if (work->sess && work->sess->enc && work->encrypted && conn->ops->encrypt_resp) { rc = conn->ops->encrypt_resp(work); if (rc < 0) conn->ops->set_rsp_status(work, STATUS_DATA_ERROR); } + if (work->sess) + ksmbd_user_session_put(work->sess); ksmbd_conn_write(work); } diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 558ebb2ba0fd2..5a70df87074cd 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -67,8 +67,10 @@ static inline bool check_session_id(struct ksmbd_conn *conn, u64 id) return false; sess = ksmbd_session_lookup_all(conn, id); - if (sess) + if (sess) { + ksmbd_user_session_put(sess); return true; + } pr_err("Invalid user session id: %llu\n", id); return false; } @@ -605,10 +607,8 @@ int smb2_check_user_session(struct ksmbd_work *work) /* Check for validity of user session */ work->sess = ksmbd_session_lookup_all(conn, sess_id); - if (work->sess) { - ksmbd_user_session_get(work->sess); + if (work->sess) return 1; - } ksmbd_debug(SMB, "Invalid user session, Uid %llu\n", sess_id); return -ENOENT; } @@ -1701,29 +1701,35 @@ int smb2_sess_setup(struct ksmbd_work *work) if (conn->dialect != sess->dialect) { rc = -EINVAL; + ksmbd_user_session_put(sess); goto out_err; } if (!(req->hdr.Flags & SMB2_FLAGS_SIGNED)) { rc = -EINVAL; + ksmbd_user_session_put(sess); goto out_err; } if (strncmp(conn->ClientGUID, sess->ClientGUID, SMB2_CLIENT_GUID_SIZE)) { rc = -ENOENT; + ksmbd_user_session_put(sess); goto out_err; } if (sess->state == SMB2_SESSION_IN_PROGRESS) { rc = -EACCES; + ksmbd_user_session_put(sess); goto out_err; } if (sess->state == SMB2_SESSION_EXPIRED) { rc = -EFAULT; + ksmbd_user_session_put(sess); goto out_err; } + ksmbd_user_session_put(sess); if (ksmbd_conn_need_reconnect(conn)) { rc = -EFAULT; @@ -1731,7 +1737,8 @@ int smb2_sess_setup(struct ksmbd_work *work) goto out_err; } - if (ksmbd_session_lookup(conn, sess_id)) { + sess = ksmbd_session_lookup(conn, sess_id); + if (!sess) { rc = -EACCES; goto out_err; } @@ -1742,7 +1749,6 @@ int smb2_sess_setup(struct ksmbd_work *work) } conn->binding = true; - ksmbd_user_session_get(sess); } else if ((conn->dialect < SMB30_PROT_ID || server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL) && (req->Flags & SMB2_SESSION_REQ_FLAG_BINDING)) { @@ -1769,7 +1775,6 @@ int smb2_sess_setup(struct ksmbd_work *work) } conn->binding = false; - ksmbd_user_session_get(sess); } work->sess = sess; @@ -2197,9 +2202,9 @@ int smb2_tree_disconnect(struct ksmbd_work *work) int smb2_session_logoff(struct ksmbd_work *work) { struct ksmbd_conn *conn = work->conn; + struct ksmbd_session *sess = work->sess; struct smb2_logoff_req *req; struct smb2_logoff_rsp *rsp; - struct ksmbd_session *sess; u64 sess_id; int err; @@ -2221,11 +2226,6 @@ int smb2_session_logoff(struct ksmbd_work *work) ksmbd_close_session_fds(work); ksmbd_conn_wait_idle(conn); - /* - * Re-lookup session to validate if session is deleted - * while waiting request complete - */ - sess = ksmbd_session_lookup_all(conn, sess_id); if (ksmbd_tree_conn_session_logoff(sess)) { ksmbd_debug(SMB, "Invalid tid %d\n", req->hdr.Id.SyncId.TreeId); rsp->hdr.Status = STATUS_NETWORK_NAME_DELETED; @@ -8992,6 +8992,7 @@ int smb3_decrypt_req(struct ksmbd_work *work) le64_to_cpu(tr_hdr->SessionId)); return -ECONNABORTED; } + ksmbd_user_session_put(sess); iov[0].iov_base = buf; iov[0].iov_len = sizeof(struct smb2_transform_hdr) + 4; From 21e46a79bbe6c4e1aa73b3ed998130f2ff07b128 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Fri, 6 Dec 2024 17:25:25 +0900 Subject: [PATCH 168/330] ksmbd: set ATTR_CTIME flags when setting mtime David reported that the new warning from setattr_copy_mgtime is coming like the following. [ 113.215316] ------------[ cut here ]------------ [ 113.215974] WARNING: CPU: 1 PID: 31 at fs/attr.c:300 setattr_copy+0x1ee/0x200 [ 113.219192] CPU: 1 UID: 0 PID: 31 Comm: kworker/1:1 Not tainted 6.13.0-rc1+ #234 [ 113.220127] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.2-3-gd478f380-rebuilt.opensuse.org 04/01/2014 [ 113.221530] Workqueue: ksmbd-io handle_ksmbd_work [ksmbd] [ 113.222220] RIP: 0010:setattr_copy+0x1ee/0x200 [ 113.222833] Code: 24 28 49 8b 44 24 30 48 89 53 58 89 43 6c 5b 41 5c 41 5d 41 5e 41 5f 5d c3 cc cc cc cc 48 89 df e8 77 d6 ff ff e9 cd fe ff ff <0f> 0b e9 be fe ff ff 66 0 [ 113.225110] RSP: 0018:ffffaf218010fb68 EFLAGS: 00010202 [ 113.225765] RAX: 0000000000000120 RBX: ffffa446815f8568 RCX: 0000000000000003 [ 113.226667] RDX: ffffaf218010fd38 RSI: ffffa446815f8568 RDI: ffffffff94eb03a0 [ 113.227531] RBP: ffffaf218010fb90 R08: 0000001a251e217d R09: 00000000675259fa [ 113.228426] R10: 0000000002ba8a6d R11: ffffa4468196c7a8 R12: ffffaf218010fd38 [ 113.229304] R13: 0000000000000120 R14: ffffffff94eb03a0 R15: 0000000000000000 [ 113.230210] FS: 0000000000000000(0000) GS:ffffa44739d00000(0000) knlGS:0000000000000000 [ 113.231215] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 113.232055] CR2: 00007efe0053d27e CR3: 000000000331a000 CR4: 00000000000006b0 [ 113.232926] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 113.233812] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 113.234797] Call Trace: [ 113.235116] [ 113.235393] ? __warn+0x73/0xd0 [ 113.235802] ? setattr_copy+0x1ee/0x200 [ 113.236299] ? report_bug+0xf3/0x1e0 [ 113.236757] ? handle_bug+0x4d/0x90 [ 113.237202] ? exc_invalid_op+0x13/0x60 [ 113.237689] ? asm_exc_invalid_op+0x16/0x20 [ 113.238185] ? setattr_copy+0x1ee/0x200 [ 113.238692] btrfs_setattr+0x80/0x820 [btrfs] [ 113.239285] ? get_stack_info_noinstr+0x12/0xf0 [ 113.239857] ? __module_address+0x22/0xa0 [ 113.240368] ? handle_ksmbd_work+0x6e/0x460 [ksmbd] [ 113.240993] ? __module_text_address+0x9/0x50 [ 113.241545] ? __module_address+0x22/0xa0 [ 113.242033] ? unwind_next_frame+0x10e/0x920 [ 113.242600] ? __pfx_stack_trace_consume_entry+0x10/0x10 [ 113.243268] notify_change+0x2c2/0x4e0 [ 113.243746] ? stack_depot_save_flags+0x27/0x730 [ 113.244339] ? set_file_basic_info+0x130/0x2b0 [ksmbd] [ 113.244993] set_file_basic_info+0x130/0x2b0 [ksmbd] [ 113.245613] ? process_scheduled_works+0xbe/0x310 [ 113.246181] ? worker_thread+0x100/0x240 [ 113.246696] ? kthread+0xc8/0x100 [ 113.247126] ? ret_from_fork+0x2b/0x40 [ 113.247606] ? ret_from_fork_asm+0x1a/0x30 [ 113.248132] smb2_set_info+0x63f/0xa70 [ksmbd] ksmbd is trying to set the atime and mtime via notify_change without also setting the ctime. so This patch add ATTR_CTIME flags when setting mtime to avoid a warning. Reported-by: David Disseldorp Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 5a70df87074cd..803b35b89513e 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -6026,15 +6026,13 @@ static int set_file_basic_info(struct ksmbd_file *fp, attrs.ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET); } - attrs.ia_valid |= ATTR_CTIME; if (file_info->ChangeTime) - attrs.ia_ctime = ksmbd_NTtimeToUnix(file_info->ChangeTime); - else - attrs.ia_ctime = inode_get_ctime(inode); + inode_set_ctime_to_ts(inode, + ksmbd_NTtimeToUnix(file_info->ChangeTime)); if (file_info->LastWriteTime) { attrs.ia_mtime = ksmbd_NTtimeToUnix(file_info->LastWriteTime); - attrs.ia_valid |= (ATTR_MTIME | ATTR_MTIME_SET); + attrs.ia_valid |= (ATTR_MTIME | ATTR_MTIME_SET | ATTR_CTIME); } if (file_info->Attributes) { @@ -6076,8 +6074,6 @@ static int set_file_basic_info(struct ksmbd_file *fp, return -EACCES; inode_lock(inode); - inode_set_ctime_to_ts(inode, attrs.ia_ctime); - attrs.ia_valid &= ~ATTR_CTIME; rc = notify_change(idmap, dentry, &attrs, NULL); inode_unlock(inode); } From bb57c81e97e0082abfb0406ed6f67c615c3d206c Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 6 Dec 2024 11:06:32 +0000 Subject: [PATCH 169/330] cifs: Fix rmdir failure due to ongoing I/O on deleted file The cifs_io_request struct (a wrapper around netfs_io_request) holds open the file on the server, even beyond the local Linux file being closed. This can cause problems with Windows-based filesystems as the file's name still exists after deletion until the file is closed, preventing the parent directory from being removed and causing spurious test failures in xfstests due to inability to remove a directory. The symptom looks something like this in the test output: rm: cannot remove '/mnt/scratch/test/p0/d3': Directory not empty rm: cannot remove '/mnt/scratch/test/p1/dc/dae': Directory not empty Fix this by waiting in unlink and rename for any outstanding I/O requests to be completed on the target file before removing that file. Note that this doesn't prevent Linux from trying to start new requests after deletion if it still has the file open locally - something that's perfectly acceptable on a UNIX system. Note also that whilst I've marked this as fixing the commit to make cifs use netfslib, I don't know that it won't occur before that. Fixes: 3ee1a1fc3981 ("cifs: Cut over to using netfslib") Signed-off-by: David Howells Acked-by: Paulo Alcantara (Red Hat) cc: Jeff Layton cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Steve French --- fs/smb/client/inode.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 668098d3b108e..f146e06c97eb6 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -1947,6 +1947,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) goto unlink_out; } + netfs_wait_for_outstanding_io(inode); cifs_close_deferred_file_under_dentry(tcon, full_path); #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY if (cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP & @@ -2464,8 +2465,10 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir, } cifs_close_deferred_file_under_dentry(tcon, from_name); - if (d_inode(target_dentry) != NULL) + if (d_inode(target_dentry) != NULL) { + netfs_wait_for_outstanding_io(d_inode(target_dentry)); cifs_close_deferred_file_under_dentry(tcon, to_name); + } rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry, to_name); From 8676c4dfae5bea2582f9a999fd7463e3c79a3412 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Mon, 9 Dec 2024 12:07:09 +0100 Subject: [PATCH 170/330] cifs: Use str_yes_no() helper in cifs_ses_add_channel() Remove hard-coded strings by using the str_yes_no() helper function. Signed-off-by: Thorsten Blum Signed-off-by: Steve French --- fs/smb/client/sess.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c index 0bb77f9ec6861..3306fb6551368 100644 --- a/fs/smb/client/sess.c +++ b/fs/smb/client/sess.c @@ -488,11 +488,11 @@ cifs_ses_add_channel(struct cifs_ses *ses, if (iface->sockaddr.ss_family == AF_INET) cifs_dbg(FYI, "adding channel to ses %p (speed:%zu bps rdma:%s ip:%pI4)\n", - ses, iface->speed, iface->rdma_capable ? "yes" : "no", + ses, iface->speed, str_yes_no(iface->rdma_capable), &ipv4->sin_addr); else cifs_dbg(FYI, "adding channel to ses %p (speed:%zu bps rdma:%s ip:%pI6)\n", - ses, iface->speed, iface->rdma_capable ? "yes" : "no", + ses, iface->speed, str_yes_no(iface->rdma_capable), &ipv6->sin6_addr); /* From 633609c48a358134d3f8ef8241dff24841577f58 Mon Sep 17 00:00:00 2001 From: Enzo Matsumiya Date: Tue, 10 Dec 2024 10:21:48 -0300 Subject: [PATCH 171/330] smb: client: destroy cfid_put_wq on module exit Fix potential problem in rmmod Signed-off-by: Enzo Matsumiya Signed-off-by: Steve French --- fs/smb/client/cifsfs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index c9f9b6e979648..9d96b833015c8 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -2043,6 +2043,7 @@ exit_cifs(void) destroy_workqueue(decrypt_wq); destroy_workqueue(fileinfo_put_wq); destroy_workqueue(serverclose_wq); + destroy_workqueue(cfid_put_wq); destroy_workqueue(cifsiod_wq); cifs_proc_clean(); } From 2aa13da97e2b92d20a8ad4ead10da89f880b64e7 Mon Sep 17 00:00:00 2001 From: Shenghao Ding Date: Wed, 11 Dec 2024 12:38:59 +0800 Subject: [PATCH 172/330] ASoC: tas2781: Fix calibration issue in stress test One specific test condition: the default registers of p[j].reg ~ p[j+3].reg are 0, TASDEVICE_REG(0x00, 0x14, 0x38)(PLT_FLAG_REG), TASDEVICE_REG(0x00, 0x14, 0x40)(SINEGAIN_REG), and TASDEVICE_REG(0x00, 0x14, 0x44)(SINEGAIN2_REG). After first calibration, they are freshed to TASDEVICE_REG(0x00, 0x1a, 0x20), TASDEVICE_REG(0x00, 0x16, 0x58)(PLT_FLAG_REG), TASDEVICE_REG(0x00, 0x14, 0x44)(SINEGAIN_REG), and TASDEVICE_REG(0x00, 0x16, 0x64)(SINEGAIN2_REG) via "Calibration Start" kcontrol. In second calibration, the p[j].reg ~ p[j+3].reg have already become tas2781_cali_start_reg. However, p[j+2].reg, TASDEVICE_REG(0x00, 0x14, 0x44)(SINEGAIN_REG), will be freshed to TASDEVICE_REG(0x00, 0x16, 0x64), which is the third register in the input params of the kcontrol. This is why only first calibration can work, the second-time, third-time or more-time calibration always failed without reboot. Of course, if no p[j].reg is in the list of tas2781_cali_start_reg, this stress test can work well. Fixes: 49e2e353fb0d ("ASoC: tas2781: Add Calibration Kcontrols for Chromebook") Signed-off-by: Shenghao Ding Link: https://patch.msgid.link/20241211043859.1328-1-shenghao-ding@ti.com Signed-off-by: Mark Brown --- sound/soc/codecs/tas2781-i2c.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/codecs/tas2781-i2c.c b/sound/soc/codecs/tas2781-i2c.c index be2ca5eb6c938..fb8cd2284fe85 100644 --- a/sound/soc/codecs/tas2781-i2c.c +++ b/sound/soc/codecs/tas2781-i2c.c @@ -370,7 +370,7 @@ static void sngl_calib_start(struct tasdevice_priv *tas_priv, int i, tasdevice_dev_read(tas_priv, i, p[j].reg, (int *)&p[j].val[0]); } else { - switch (p[j].reg) { + switch (tas2781_cali_start_reg[j].reg) { case 0: { if (!reg[0]) continue; From 220326c4650a0ef7db3bfcae903f758555ecb973 Mon Sep 17 00:00:00 2001 From: Huy Minh Date: Tue, 10 Dec 2024 22:45:00 +0700 Subject: [PATCH 173/330] platform/x86: touchscreen_dmi: Add info for SARY Tab 3 tablet MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There's no info about the OEM behind the tablet, only online stores listing. This tablet uses an Intel Atom x5-Z8300, 4GB of RAM & 64GB of storage. Signed-off-by: Huy Minh Link: https://lore.kernel.org/r/20241210154500.32124-1-buingoc67@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/touchscreen_dmi.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/drivers/platform/x86/touchscreen_dmi.c b/drivers/platform/x86/touchscreen_dmi.c index 0a39f68c641d1..bdc19cd8d3edf 100644 --- a/drivers/platform/x86/touchscreen_dmi.c +++ b/drivers/platform/x86/touchscreen_dmi.c @@ -855,6 +855,23 @@ static const struct ts_dmi_data rwc_nanote_next_data = { .properties = rwc_nanote_next_props, }; +static const struct property_entry sary_tab_3_props[] = { + PROPERTY_ENTRY_U32("touchscreen-size-x", 1730), + PROPERTY_ENTRY_U32("touchscreen-size-y", 1151), + PROPERTY_ENTRY_BOOL("touchscreen-inverted-x"), + PROPERTY_ENTRY_BOOL("touchscreen-inverted-y"), + PROPERTY_ENTRY_BOOL("touchscreen-swapped-x-y"), + PROPERTY_ENTRY_STRING("firmware-name", "gsl1680-sary-tab-3.fw"), + PROPERTY_ENTRY_U32("silead,max-fingers", 10), + PROPERTY_ENTRY_BOOL("silead,home-button"), + { } +}; + +static const struct ts_dmi_data sary_tab_3_data = { + .acpi_name = "MSSL1680:00", + .properties = sary_tab_3_props, +}; + static const struct property_entry schneider_sct101ctm_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-x", 1715), PROPERTY_ENTRY_U32("touchscreen-size-y", 1140), @@ -1615,6 +1632,15 @@ const struct dmi_system_id touchscreen_dmi_table[] = { DMI_MATCH(DMI_BIOS_VERSION, "S8A70R100-V005"), }, }, + { + /* SARY Tab 3 */ + .driver_data = (void *)&sary_tab_3_data, + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "SARY"), + DMI_MATCH(DMI_PRODUCT_NAME, "C210C"), + DMI_MATCH(DMI_PRODUCT_SKU, "TAB3"), + }, + }, { /* Schneider SCT101CTM */ .driver_data = (void *)&schneider_sct101ctm_data, From 6c0a473fc5f89dabbed0af605a09370b533aa856 Mon Sep 17 00:00:00 2001 From: Jithu Joseph Date: Tue, 10 Dec 2024 12:31:52 -0800 Subject: [PATCH 174/330] platform/x86/intel/ifs: Add Clearwater Forest to CPU support list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add Clearwater Forest (INTEL_ATOM_DARKMONT_X) to the x86 match table of Intel In Field Scan (IFS) driver, enabling IFS functionality on this processor. Signed-off-by: Jithu Joseph Link: https://lore.kernel.org/r/20241210203152.1136463-1-jithu.joseph@intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/ifs/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/platform/x86/intel/ifs/core.c b/drivers/platform/x86/intel/ifs/core.c index bc252b883210a..1ae50702bdb76 100644 --- a/drivers/platform/x86/intel/ifs/core.c +++ b/drivers/platform/x86/intel/ifs/core.c @@ -20,6 +20,7 @@ static const struct x86_cpu_id ifs_cpu_ids[] __initconst = { X86_MATCH(INTEL_GRANITERAPIDS_X, ARRAY_GEN0), X86_MATCH(INTEL_GRANITERAPIDS_D, ARRAY_GEN0), X86_MATCH(INTEL_ATOM_CRESTMONT_X, ARRAY_GEN1), + X86_MATCH(INTEL_ATOM_DARKMONT_X, ARRAY_GEN1), {} }; MODULE_DEVICE_TABLE(x86cpu, ifs_cpu_ids); From 83848e37f6ee80f60b04139fefdfa1bde4aaa826 Mon Sep 17 00:00:00 2001 From: Xi Pardee Date: Tue, 10 Dec 2024 13:26:41 -0800 Subject: [PATCH 175/330] platform/x86/intel/vsec: Add support for Panther Lake MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add Panther Lake PMT telemetry support. Signed-off-by: Xi Pardee Link: https://lore.kernel.org/r/20241210212646.239211-1-xi.pardee@linux.intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/vsec.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/platform/x86/intel/vsec.c b/drivers/platform/x86/intel/vsec.c index 9e0f8e38178c2..e54b6a2a16818 100644 --- a/drivers/platform/x86/intel/vsec.c +++ b/drivers/platform/x86/intel/vsec.c @@ -423,6 +423,7 @@ static const struct intel_vsec_platform_info lnl_info = { #define PCI_DEVICE_ID_INTEL_VSEC_RPL 0xa77d #define PCI_DEVICE_ID_INTEL_VSEC_TGL 0x9a0d #define PCI_DEVICE_ID_INTEL_VSEC_LNL_M 0x647d +#define PCI_DEVICE_ID_INTEL_VSEC_PTL 0xb07d static const struct pci_device_id intel_vsec_pci_ids[] = { { PCI_DEVICE_DATA(INTEL, VSEC_ADL, &tgl_info) }, { PCI_DEVICE_DATA(INTEL, VSEC_DG1, &dg1_info) }, @@ -432,6 +433,7 @@ static const struct pci_device_id intel_vsec_pci_ids[] = { { PCI_DEVICE_DATA(INTEL, VSEC_RPL, &tgl_info) }, { PCI_DEVICE_DATA(INTEL, VSEC_TGL, &tgl_info) }, { PCI_DEVICE_DATA(INTEL, VSEC_LNL_M, &lnl_info) }, + { PCI_DEVICE_DATA(INTEL, VSEC_PTL, &mtl_info) }, { } }; MODULE_DEVICE_TABLE(pci, intel_vsec_pci_ids); From d9339496729f1471b8dc326face17c4cf108b027 Mon Sep 17 00:00:00 2001 From: Akira Yokosawa Date: Tue, 10 Dec 2024 20:04:15 +0900 Subject: [PATCH 176/330] scripts/kernel-doc: Get -export option working again Since commit cdd30ebb1b9f ("module: Convert symbol namespace to string literal"), exported symbols marked by EXPORT_SYMBOL_NS(_GPL) are ignored by "kernel-doc -export" in fresh build of "make htmldocs". This is because regex in the perl script for those markers fails to match the new signatures: - EXPORT_SYMBOL_NS(symbol, "ns"); - EXPORT_SYMBOL_NS_GPL(symbol, "ns"); Update the regex so that it matches quoted string. Note: Escape sequence of \w is good for C identifiers, but can be too strict for quoted strings. Instead, use \S, which matches any non-whitespace character, for compatibility with possible extension of namespace convention in the future [1]. Fixes: cdd30ebb1b9f ("module: Convert symbol namespace to string literal") Link: https://lore.kernel.org/CAK7LNATMufXP0EA6QUE9hBkZMa6vJO6ZiaYuak2AhOrd2nSVKQ@mail.gmail.com/ [1] Signed-off-by: Akira Yokosawa Cc: Masahiro Yamada Tested-by: Randy Dunlap Acked-by: Randy Dunlap Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/e5c43f36-45cd-49f4-b7b8-ff342df3c7a4@gmail.com --- scripts/kernel-doc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/kernel-doc b/scripts/kernel-doc index f66070176ba31..4ee843d3600e2 100755 --- a/scripts/kernel-doc +++ b/scripts/kernel-doc @@ -267,7 +267,7 @@ my $doc_inline_sect = '\s*\*\s*(@\s*[\w][\w\.]*\s*):(.*)'; my $doc_inline_end = '^\s*\*/\s*$'; my $doc_inline_oneline = '^\s*/\*\*\s*(@[\w\s]+):\s*(.*)\s*\*/\s*$'; my $export_symbol = '^\s*EXPORT_SYMBOL(_GPL)?\s*\(\s*(\w+)\s*\)\s*;'; -my $export_symbol_ns = '^\s*EXPORT_SYMBOL_NS(_GPL)?\s*\(\s*(\w+)\s*,\s*\w+\)\s*;'; +my $export_symbol_ns = '^\s*EXPORT_SYMBOL_NS(_GPL)?\s*\(\s*(\w+)\s*,\s*"\S+"\)\s*;'; my $function_pointer = qr{([^\(]*\(\*)\s*\)\s*\(([^\)]*)\)}; my $attribute = qr{__attribute__\s*\(\([a-z0-9,_\*\s\(\)]*\)\)}i; From 5fa49dd8e521a42379e5e41fcf2c92edaaec0a8b Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Mon, 9 Dec 2024 17:43:48 +0100 Subject: [PATCH 177/330] s390/ipl: Fix never less than zero warning DEFINE_IPL_ATTR_STR_RW() macro produces "unsigned 'len' is never less than zero." warning when sys_vmcmd_on_*_store() callbacks are defined. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202412081614.5uel8F6W-lkp@intel.com/ Fixes: 247576bf624a ("s390/ipl: Do not accept z/VM CP diag X'008' cmds longer than max length") Reviewed-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/kernel/ipl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index edbb52ce3f1ec..7d12a1305fc99 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -270,7 +270,7 @@ static ssize_t sys_##_prefix##_##_name##_store(struct kobject *kobj, \ if (len >= sizeof(_value)) \ return -E2BIG; \ len = strscpy(_value, buf, sizeof(_value)); \ - if (len < 0) \ + if ((ssize_t)len < 0) \ return len; \ strim(_value); \ return len; \ From a93a620c38f3763ec74cb0def9625756966f12d9 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 4 Dec 2024 18:23:05 -0800 Subject: [PATCH 178/330] perf test expr: Fix system_tsc_freq for only x86 The refactoring of tool PMU events to have a PMU then adding the expr literals to the tool PMU made it so that the literal system_tsc_freq was only supported on x86. Update the test expectations to match - namely the parsing is x86 specific and only yields a non-zero value on Intel. Fixes: 609aa2667f67 ("perf tool_pmu: Switch to standard pmu functions and json descriptions") Reported-by: Athira Rajeev Closes: https://lore.kernel.org/linux-perf-users/20241022140156.98854-1-atrajeev@linux.vnet.ibm.com/ Co-developed-by: Athira Rajeev Tested-by: Namhyung Kim Signed-off-by: Ian Rogers Cc: James Clark Cc: akanksha@linux.ibm.com Cc: hbathini@linux.ibm.com Cc: kjain@linux.ibm.com Cc: maddy@linux.ibm.com Cc: disgoel@linux.vnet.ibm.com Cc: linuxppc-dev@lists.ozlabs.org Link: https://lore.kernel.org/r/20241205022305.158202-1-irogers@google.com Signed-off-by: Namhyung Kim --- tools/perf/tests/expr.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/tools/perf/tests/expr.c b/tools/perf/tests/expr.c index 41ff1affdfcdf..726cf8d4da28d 100644 --- a/tools/perf/tests/expr.c +++ b/tools/perf/tests/expr.c @@ -75,14 +75,12 @@ static int test__expr(struct test_suite *t __maybe_unused, int subtest __maybe_u double val, num_cpus_online, num_cpus, num_cores, num_dies, num_packages; int ret; struct expr_parse_ctx *ctx; - bool is_intel = false; char strcmp_cpuid_buf[256]; struct perf_cpu cpu = {-1}; char *cpuid = get_cpuid_allow_env_override(cpu); char *escaped_cpuid1, *escaped_cpuid2; TEST_ASSERT_VAL("get_cpuid", cpuid); - is_intel = strstr(cpuid, "Intel") != NULL; TEST_ASSERT_EQUAL("ids_union", test_ids_union(), 0); @@ -245,12 +243,19 @@ static int test__expr(struct test_suite *t __maybe_unused, int subtest __maybe_u if (num_dies) // Some platforms do not have CPU die support, for example s390 TEST_ASSERT_VAL("#num_dies >= #num_packages", num_dies >= num_packages); - TEST_ASSERT_VAL("#system_tsc_freq", expr__parse(&val, ctx, "#system_tsc_freq") == 0); - if (is_intel) - TEST_ASSERT_VAL("#system_tsc_freq > 0", val > 0); - else - TEST_ASSERT_VAL("#system_tsc_freq == 0", fpclassify(val) == FP_ZERO); + if (expr__parse(&val, ctx, "#system_tsc_freq") == 0) { + bool is_intel = strstr(cpuid, "Intel") != NULL; + + if (is_intel) + TEST_ASSERT_VAL("#system_tsc_freq > 0", val > 0); + else + TEST_ASSERT_VAL("#system_tsc_freq == 0", fpclassify(val) == FP_ZERO); + } else { +#if defined(__i386__) || defined(__x86_64__) + TEST_ASSERT_VAL("#system_tsc_freq unsupported", 0); +#endif + } /* * Source count returns the number of events aggregating in a leader * event including the leader. Check parsing yields an id. From f7e36d02d771ee14acae1482091718460cffb321 Mon Sep 17 00:00:00 2001 From: James Clark Date: Thu, 14 Nov 2024 16:04:48 +0000 Subject: [PATCH 179/330] libperf: evlist: Fix --cpu argument on hybrid platform Since the linked fixes: commit, specifying a CPU on hybrid platforms results in an error because Perf tries to open an extended type event on "any" CPU which isn't valid. Extended type events can only be opened on CPUs that match the type. Before (working): $ perf record --cpu 1 -- true [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 2.385 MB perf.data (7 samples) ] After (not working): $ perf record -C 1 -- true WARNING: A requested CPU in '1' is not supported by PMU 'cpu_atom' (CPUs 16-27) for event 'cycles:P' Error: The sys_perf_event_open() syscall returned with 22 (Invalid argument) for event (cpu_atom/cycles:P/). /bin/dmesg | grep -i perf may provide additional information. (Ignore the warning message, that's expected and not particularly relevant to this issue). This is because perf_cpu_map__intersect() of the user specified CPU (1) and one of the PMU's CPUs (16-27) correctly results in an empty (NULL) CPU map. However for the purposes of opening an event, libperf converts empty CPU maps into an any CPU (-1) which the kernel rejects. Fix it by deleting evsels with empty CPU maps in the specific case where user requested CPU maps are evaluated. Fixes: 251aa040244a ("perf parse-events: Wildcard most "numeric" events") Reviewed-by: Ian Rogers Tested-by: Thomas Falcon Signed-off-by: James Clark Tested-by: Arnaldo Carvalho de Melo Link: https://lore.kernel.org/r/20241114160450.295844-2-james.clark@linaro.org Signed-off-by: Namhyung Kim --- tools/lib/perf/evlist.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/tools/lib/perf/evlist.c b/tools/lib/perf/evlist.c index c6d67fc9e57ef..83c43dc13313c 100644 --- a/tools/lib/perf/evlist.c +++ b/tools/lib/perf/evlist.c @@ -47,6 +47,20 @@ static void __perf_evlist__propagate_maps(struct perf_evlist *evlist, */ perf_cpu_map__put(evsel->cpus); evsel->cpus = perf_cpu_map__intersect(evlist->user_requested_cpus, evsel->own_cpus); + + /* + * Empty cpu lists would eventually get opened as "any" so remove + * genuinely empty ones before they're opened in the wrong place. + */ + if (perf_cpu_map__is_empty(evsel->cpus)) { + struct perf_evsel *next = perf_evlist__next(evlist, evsel); + + perf_evlist__remove(evlist, evsel); + /* Keep idx contiguous */ + if (next) + list_for_each_entry_from(next, &evlist->entries, node) + next->idx--; + } } else if (!evsel->own_cpus || evlist->has_user_cpus || (!evsel->requires_cpu && perf_cpu_map__has_any_cpu(evlist->user_requested_cpus))) { /* @@ -80,11 +94,11 @@ static void __perf_evlist__propagate_maps(struct perf_evlist *evlist, static void perf_evlist__propagate_maps(struct perf_evlist *evlist) { - struct perf_evsel *evsel; + struct perf_evsel *evsel, *n; evlist->needs_map_propagation = true; - perf_evlist__for_each_evsel(evlist, evsel) + list_for_each_entry_safe(evsel, n, &evlist->entries, node) __perf_evlist__propagate_maps(evlist, evsel); } From 7c17f7780a48b5ed36b6d13a06004fac993e75af Mon Sep 17 00:00:00 2001 From: Shengjiu Wang Date: Tue, 26 Nov 2024 13:32:53 +0800 Subject: [PATCH 180/330] ASoC: fsl_xcvr: change IFACE_PCM to IFACE_MIXER As the snd_soc_card_get_kcontrol() is updated to use snd_ctl_find_id_mixer() in commit 897cc72b0837 ("ASoC: soc-card: Use snd_ctl_find_id_mixer() instead of open-coding") which make the iface fix to be IFACE_MIXER. Fixes: 897cc72b0837 ("ASoC: soc-card: Use snd_ctl_find_id_mixer() instead of open-coding") Signed-off-by: Shengjiu Wang Link: https://patch.msgid.link/20241126053254.3657344-2-shengjiu.wang@nxp.com Signed-off-by: Mark Brown --- sound/soc/fsl/fsl_xcvr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/fsl/fsl_xcvr.c b/sound/soc/fsl/fsl_xcvr.c index 1e0bfd59d5118..9c184ab734688 100644 --- a/sound/soc/fsl/fsl_xcvr.c +++ b/sound/soc/fsl/fsl_xcvr.c @@ -171,7 +171,7 @@ static int fsl_xcvr_capds_put(struct snd_kcontrol *kcontrol, } static struct snd_kcontrol_new fsl_xcvr_earc_capds_kctl = { - .iface = SNDRV_CTL_ELEM_IFACE_PCM, + .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = "Capabilities Data Structure", .access = SNDRV_CTL_ELEM_ACCESS_READWRITE, .info = fsl_xcvr_type_capds_bytes_info, From bb76e82bfe57fdd1fe595cb0ccd33159df49ed09 Mon Sep 17 00:00:00 2001 From: Shengjiu Wang Date: Tue, 26 Nov 2024 13:32:54 +0800 Subject: [PATCH 181/330] ASoC: fsl_spdif: change IFACE_PCM to IFACE_MIXER As the snd_soc_card_get_kcontrol() is updated to use snd_ctl_find_id_mixer() in commit 897cc72b0837 ("ASoC: soc-card: Use snd_ctl_find_id_mixer() instead of open-coding") which make the iface fix to be IFACE_MIXER. Fixes: 897cc72b0837 ("ASoC: soc-card: Use snd_ctl_find_id_mixer() instead of open-coding") Signed-off-by: Shengjiu Wang Link: https://patch.msgid.link/20241126053254.3657344-3-shengjiu.wang@nxp.com Signed-off-by: Mark Brown --- sound/soc/fsl/fsl_spdif.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/fsl/fsl_spdif.c b/sound/soc/fsl/fsl_spdif.c index b6ff04f7138a2..ee946e0d3f496 100644 --- a/sound/soc/fsl/fsl_spdif.c +++ b/sound/soc/fsl/fsl_spdif.c @@ -1204,7 +1204,7 @@ static struct snd_kcontrol_new fsl_spdif_ctrls[] = { }, /* DPLL lock info get controller */ { - .iface = SNDRV_CTL_ELEM_IFACE_PCM, + .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = RX_SAMPLE_RATE_KCONTROL, .access = SNDRV_CTL_ELEM_ACCESS_READ | SNDRV_CTL_ELEM_ACCESS_VOLATILE, From b3134b8c1a1cf5779a74df902c75fa185083006e Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Sat, 30 Nov 2024 10:33:10 -0500 Subject: [PATCH 182/330] riscv: Fixup boot failure when CONFIG_DEBUG_RT_MUTEXES=y When CONFIG_DEBUG_RT_MUTEXES=y, mutex_lock->rt_mutex_try_acquire would change from rt_mutex_cmpxchg_acquire to rt_mutex_slowtrylock(): raw_spin_lock_irqsave(&lock->wait_lock, flags); ret = __rt_mutex_slowtrylock(lock); raw_spin_unlock_irqrestore(&lock->wait_lock, flags); Because queued_spin_#ops to ticket_#ops is changed one by one by jump_label, raw_spin_lock/unlock would cause a deadlock during the changing. That means in arch/riscv/kernel/jump_label.c: 1. arch_jump_label_transform_queue() -> mutex_lock(&text_mutex); +-> raw_spin_lock -> queued_spin_lock |-> raw_spin_unlock -> queued_spin_unlock patch_insn_write -> change the raw_spin_lock to ticket_lock mutex_unlock(&text_mutex); ... 2. /* Dirty the lock value */ arch_jump_label_transform_queue() -> mutex_lock(&text_mutex); +-> raw_spin_lock -> *ticket_lock* |-> raw_spin_unlock -> *queued_spin_unlock* /* BUG: ticket_lock with queued_spin_unlock */ patch_insn_write -> change the raw_spin_unlock to ticket_unlock mutex_unlock(&text_mutex); ... 3. /* Dead lock */ arch_jump_label_transform_queue() -> mutex_lock(&text_mutex); +-> raw_spin_lock -> ticket_lock /* deadlock! */ |-> raw_spin_unlock -> ticket_unlock patch_insn_write -> change other raw_spin_#op -> ticket_#op mutex_unlock(&text_mutex); So, the solution is to disable mutex usage of arch_jump_label_transform_queue() during early_boot_irqs_disabled, just like we have done for stop_machine. Reported-by: Conor Dooley Signed-off-by: Guo Ren Signed-off-by: Guo Ren Fixes: ab83647fadae ("riscv: Add qspinlock support") Link: https://lore.kernel.org/linux-riscv/CAJF2gTQwYTGinBmCSgVUoPv0_q4EPt_+WiyfUA1HViAKgUzxAg@mail.gmail.com/T/#mf488e6347817fca03bb93a7d34df33d8615b3775 Cc: Palmer Dabbelt Cc: Alexandre Ghiti Tested-by: Conor Dooley Reviewed-by: Alexandre Ghiti Tested-by: Alexandre Ghiti Tested-by: Nam Cao Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20241130153310.3349484-1-guoren@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/jump_label.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/riscv/kernel/jump_label.c b/arch/riscv/kernel/jump_label.c index 6eee6f736f687..654ed159c830b 100644 --- a/arch/riscv/kernel/jump_label.c +++ b/arch/riscv/kernel/jump_label.c @@ -36,9 +36,15 @@ bool arch_jump_label_transform_queue(struct jump_entry *entry, insn = RISCV_INSN_NOP; } - mutex_lock(&text_mutex); - patch_insn_write(addr, &insn, sizeof(insn)); - mutex_unlock(&text_mutex); + if (early_boot_irqs_disabled) { + riscv_patch_in_stop_machine = 1; + patch_insn_write(addr, &insn, sizeof(insn)); + riscv_patch_in_stop_machine = 0; + } else { + mutex_lock(&text_mutex); + patch_insn_write(addr, &insn, sizeof(insn)); + mutex_unlock(&text_mutex); + } return true; } From c796e187201242992d6d292bfeff41aadfdf3f29 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 9 Dec 2024 08:45:08 +0100 Subject: [PATCH 183/330] riscv: Fix wrong usage of __pa() on a fixmap address riscv uses fixmap addresses to map the dtb so we can't use __pa() which is reserved for linear mapping addresses. Fixes: b2473a359763 ("of/fdt: add dt_phys arg to early_init_dt_scan and early_init_dt_verify") Signed-off-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20241209074508.53037-1-alexghiti@rivosinc.com Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index 016b48fcd6f27..45010e71df86c 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -227,7 +227,7 @@ static void __init init_resources(void) static void __init parse_dtb(void) { /* Early scan of device tree from init memory */ - if (early_init_dt_scan(dtb_early_va, __pa(dtb_early_va))) { + if (early_init_dt_scan(dtb_early_va, dtb_early_pa)) { const char *name = of_flat_dt_get_machine_name(); if (name) { From b3431a8bb336cece8adc452437befa7d4534b2fd Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 9 Dec 2024 08:41:25 +0100 Subject: [PATCH 184/330] riscv: Fix IPIs usage in kfence_protect_page() flush_tlb_kernel_range() may use IPIs to flush the TLBs of all the cores, which triggers the following warning when the irqs are disabled: [ 3.455330] WARNING: CPU: 1 PID: 0 at kernel/smp.c:815 smp_call_function_many_cond+0x452/0x520 [ 3.456647] Modules linked in: [ 3.457218] CPU: 1 UID: 0 PID: 0 Comm: swapper/1 Not tainted 6.12.0-rc7-00010-g91d3de7240b8 #1 [ 3.457416] Hardware name: QEMU QEMU Virtual Machine, BIOS [ 3.457633] epc : smp_call_function_many_cond+0x452/0x520 [ 3.457736] ra : on_each_cpu_cond_mask+0x1e/0x30 [ 3.457786] epc : ffffffff800b669a ra : ffffffff800b67c2 sp : ff2000000000bb50 [ 3.457824] gp : ffffffff815212b8 tp : ff6000008014f080 t0 : 000000000000003f [ 3.457859] t1 : ffffffff815221e0 t2 : 000000000000000f s0 : ff2000000000bc10 [ 3.457920] s1 : 0000000000000040 a0 : ffffffff815221e0 a1 : 0000000000000001 [ 3.457953] a2 : 0000000000010000 a3 : 0000000000000003 a4 : 0000000000000000 [ 3.458006] a5 : 0000000000000000 a6 : ffffffffffffffff a7 : 0000000000000000 [ 3.458042] s2 : ffffffff815223be s3 : 00fffffffffff000 s4 : ff600001ffe38fc0 [ 3.458076] s5 : ff600001ff950d00 s6 : 0000000200000120 s7 : 0000000000000001 [ 3.458109] s8 : 0000000000000001 s9 : ff60000080841ef0 s10: 0000000000000001 [ 3.458141] s11: ffffffff81524812 t3 : 0000000000000001 t4 : ff60000080092bc0 [ 3.458172] t5 : 0000000000000000 t6 : ff200000000236d0 [ 3.458203] status: 0000000200000100 badaddr: ffffffff800b669a cause: 0000000000000003 [ 3.458373] [] smp_call_function_many_cond+0x452/0x520 [ 3.458593] [] on_each_cpu_cond_mask+0x1e/0x30 [ 3.458625] [] __flush_tlb_range+0x118/0x1ca [ 3.458656] [] flush_tlb_kernel_range+0x1e/0x26 [ 3.458683] [] kfence_protect+0xc0/0xce [ 3.458717] [] kfence_guarded_free+0xc6/0x1c0 [ 3.458742] [] __kfence_free+0x62/0xc6 [ 3.458764] [] kfree+0x106/0x32c [ 3.458786] [] detach_buf_split+0x188/0x1a8 [ 3.458816] [] virtqueue_get_buf_ctx+0xb6/0x1f6 [ 3.458839] [] virtqueue_get_buf+0xe/0x16 [ 3.458880] [] virtblk_done+0x5c/0xe2 [ 3.458908] [] vring_interrupt+0x6a/0x74 [ 3.458930] [] __handle_irq_event_percpu+0x7c/0xe2 [ 3.458956] [] handle_irq_event+0x3c/0x86 [ 3.458978] [] handle_simple_irq+0x9e/0xbe [ 3.459004] [] generic_handle_domain_irq+0x1c/0x2a [ 3.459027] [] imsic_handle_irq+0xba/0x120 [ 3.459056] [] generic_handle_domain_irq+0x1c/0x2a [ 3.459080] [] riscv_intc_aia_irq+0x24/0x34 [ 3.459103] [] handle_riscv_irq+0x2e/0x4c [ 3.459133] [] call_on_irq_stack+0x32/0x40 So only flush the local TLB and let the lazy kfence page fault handling deal with the faults which could happen when a core has an old protected pte version cached in its TLB. That leads to potential inaccuracies which can be tolerated when using kfence. Fixes: 47513f243b45 ("riscv: Enable KFENCE for riscv64") Signed-off-by: Alexandre Ghiti Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20241209074125.52322-1-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/kfence.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/riscv/include/asm/kfence.h b/arch/riscv/include/asm/kfence.h index 7388edd88986f..d08bf7fb3aee6 100644 --- a/arch/riscv/include/asm/kfence.h +++ b/arch/riscv/include/asm/kfence.h @@ -22,7 +22,9 @@ static inline bool kfence_protect_page(unsigned long addr, bool protect) else set_pte(pte, __pte(pte_val(ptep_get(pte)) | _PAGE_PRESENT)); - flush_tlb_kernel_range(addr, addr + PAGE_SIZE); + preempt_disable(); + local_flush_tlb_kernel_range(addr, addr + PAGE_SIZE); + preempt_enable(); return true; } From 21f1b85c8912262adf51707e63614a114425eb10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Wed, 20 Nov 2024 14:12:02 +0100 Subject: [PATCH 185/330] riscv: mm: Do not call pmd dtor on vmemmap page table teardown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The vmemmap's, which is used for RV64 with SPARSEMEM_VMEMMAP, page tables are populated using pmd (page middle directory) hugetables. However, the pmd allocation is not using the generic mechanism used by the VMA code (e.g. pmd_alloc()), or the RISC-V specific create_pgd_mapping()/alloc_pmd_late(). Instead, the vmemmap page table code allocates a page, and calls vmemmap_set_pmd(). This results in that the pmd ctor is *not* called, nor would it make sense to do so. Now, when tearing down a vmemmap page table pmd, the cleanup code would unconditionally, and incorrectly call the pmd dtor, which results in a crash (best case). This issue was found when running the HMM selftests: | tools/testing/selftests/mm# ./test_hmm.sh smoke | ... # when unloading the test_hmm.ko module | page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x10915b | flags: 0x1000000000000000(node=0|zone=1) | raw: 1000000000000000 0000000000000000 dead000000000122 0000000000000000 | raw: 0000000000000000 0000000000000000 00000001ffffffff 0000000000000000 | page dumped because: VM_BUG_ON_PAGE(ptdesc->pmd_huge_pte) | ------------[ cut here ]------------ | kernel BUG at include/linux/mm.h:3080! | Kernel BUG [#1] | Modules linked in: test_hmm(-) sch_fq_codel fuse drm drm_panel_orientation_quirks backlight dm_mod | CPU: 1 UID: 0 PID: 514 Comm: modprobe Tainted: G W 6.12.0-00982-gf2a4f1682d07 #2 | Tainted: [W]=WARN | Hardware name: riscv-virtio qemu/qemu, BIOS 2024.10 10/01/2024 | epc : remove_pgd_mapping+0xbec/0x1070 | ra : remove_pgd_mapping+0xbec/0x1070 | epc : ffffffff80010a68 ra : ffffffff80010a68 sp : ff20000000a73940 | gp : ffffffff827b2d88 tp : ff6000008785da40 t0 : ffffffff80fbce04 | t1 : 0720072007200720 t2 : 706d756420656761 s0 : ff20000000a73a50 | s1 : ff6000008915cff8 a0 : 0000000000000039 a1 : 0000000000000008 | a2 : ff600003fff0de20 a3 : 0000000000000000 a4 : 0000000000000000 | a5 : 0000000000000000 a6 : c0000000ffffefff a7 : ffffffff824469b8 | s2 : ff1c0000022456c0 s3 : ff1ffffffdbfffff s4 : ff6000008915c000 | s5 : ff6000008915c000 s6 : ff6000008915c000 s7 : ff1ffffffdc00000 | s8 : 0000000000000001 s9 : ff1ffffffdc00000 s10: ffffffff819a31f0 | s11: ffffffffffffffff t3 : ffffffff8000c950 t4 : ff60000080244f00 | t5 : ff60000080244000 t6 : ff20000000a73708 | status: 0000000200000120 badaddr: ffffffff80010a68 cause: 0000000000000003 | [] remove_pgd_mapping+0xbec/0x1070 | [] vmemmap_free+0x14/0x1e | [] section_deactivate+0x220/0x452 | [] sparse_remove_section+0x4a/0x58 | [] __remove_pages+0x7e/0xba | [] memunmap_pages+0x2bc/0x3fe | [] dmirror_device_remove_chunks+0x2ea/0x518 [test_hmm] | [] hmm_dmirror_exit+0x3e/0x1018 [test_hmm] | [] __riscv_sys_delete_module+0x15a/0x2a6 | [] do_trap_ecall_u+0x1f2/0x266 | [] _new_vmalloc_restore_context_a0+0xc6/0xd2 | Code: bf51 7597 0184 8593 76a5 854a 4097 0029 80e7 2c00 (9002) 7597 | ---[ end trace 0000000000000000 ]--- | Kernel panic - not syncing: Fatal exception in interrupt Add a check to avoid calling the pmd dtor, if the calling context is vmemmap_free(). Fixes: c75a74f4ba19 ("riscv: mm: Add memory hotplugging support") Signed-off-by: Björn Töpel Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20241120131203.1859787-1-bjorn@kernel.org Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 0e8c20adcd98d..fc53ce748c804 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -1566,7 +1566,7 @@ static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd) pmd_clear(pmd); } -static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud) +static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud, bool is_vmemmap) { struct page *page = pud_page(*pud); struct ptdesc *ptdesc = page_ptdesc(page); @@ -1579,7 +1579,8 @@ static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud) return; } - pagetable_pmd_dtor(ptdesc); + if (!is_vmemmap) + pagetable_pmd_dtor(ptdesc); if (PageReserved(page)) free_reserved_page(page); else @@ -1703,7 +1704,7 @@ static void __meminit remove_pud_mapping(pud_t *pud_base, unsigned long addr, un remove_pmd_mapping(pmd_base, addr, next, is_vmemmap, altmap); if (pgtable_l4_enabled) - free_pmd_table(pmd_base, pudp); + free_pmd_table(pmd_base, pudp, is_vmemmap); } } From 747367340ca6b5070728b86ae36ad6747f66b2fb Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Wed, 11 Dec 2024 12:07:42 +0100 Subject: [PATCH 186/330] EDAC/amd64: Simplify ECC check on unified memory controllers The intent of the check is to see whether at least one UMC has ECC enabled. So do that instead of tracking which ones are enabled in masks which are too small in size anyway and lead to not loading the driver on Zen4 machines with UMCs enabled over UMC8. Fixes: e2be5955a886 ("EDAC/amd64: Add support for AMD Family 19h Models 10h-1Fh and A0h-AFh") Reported-by: Avadhut Naik Signed-off-by: Borislav Petkov (AMD) Tested-by: Avadhut Naik Reviewed-by: Avadhut Naik Cc: Link: https://lore.kernel.org/r/20241210212054.3895697-1-avadhut.naik@amd.com --- drivers/edac/amd64_edac.c | 32 ++++++++++---------------------- 1 file changed, 10 insertions(+), 22 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index ddfbdb66b794d..5d356b7c45897 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -3362,36 +3362,24 @@ static bool dct_ecc_enabled(struct amd64_pvt *pvt) static bool umc_ecc_enabled(struct amd64_pvt *pvt) { - u8 umc_en_mask = 0, ecc_en_mask = 0; - u16 nid = pvt->mc_node_id; struct amd64_umc *umc; - u8 ecc_en = 0, i; + bool ecc_en = false; + int i; + /* Check whether at least one UMC is enabled: */ for_each_umc(i) { umc = &pvt->umc[i]; - /* Only check enabled UMCs. */ - if (!(umc->sdp_ctrl & UMC_SDP_INIT)) - continue; - - umc_en_mask |= BIT(i); - - if (umc->umc_cap_hi & UMC_ECC_ENABLED) - ecc_en_mask |= BIT(i); + if (umc->sdp_ctrl & UMC_SDP_INIT && + umc->umc_cap_hi & UMC_ECC_ENABLED) { + ecc_en = true; + break; + } } - /* Check whether at least one UMC is enabled: */ - if (umc_en_mask) - ecc_en = umc_en_mask == ecc_en_mask; - else - edac_dbg(0, "Node %d: No enabled UMCs.\n", nid); - - edac_dbg(3, "Node %d: DRAM ECC %s.\n", nid, (ecc_en ? "enabled" : "disabled")); + edac_dbg(3, "Node %d: DRAM ECC %s.\n", pvt->mc_node_id, (ecc_en ? "enabled" : "disabled")); - if (!ecc_en) - return false; - else - return true; + return ecc_en; } static inline void From 434fffa926b10706f2bde2db22979d68463302fc Mon Sep 17 00:00:00 2001 From: James Clark Date: Wed, 11 Dec 2024 08:55:23 +0000 Subject: [PATCH 187/330] perf probe: Fix uninitialized variable Since the linked fixes: commit, err is returned uninitialized due to the removal of "return 0". Initialize err to fix it. This fixes the following intermittent test failure on release builds: $ perf test "testsuite_probe" ... -- [ FAIL ] -- perf_probe :: test_invalid_options :: mutually exclusive options :: -L foo -V bar (output regexp parsing) Regexp not found: \"Error: switch .+ cannot be used with switch .+\" ... Fixes: 080e47b2a237 ("perf probe: Introduce quotation marks support") Tested-by: Namhyung Kim Reviewed-by: Arnaldo Carvalho de Melo Acked-by: Masami Hiramatsu (Google) Signed-off-by: James Clark Link: https://lore.kernel.org/r/20241211085525.519458-2-james.clark@linaro.org Signed-off-by: Namhyung Kim --- tools/perf/util/probe-event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 6d51a4c98ad7c..eaa0318e9b87e 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -1370,7 +1370,7 @@ int parse_line_range_desc(const char *arg, struct line_range *lr) { char *buf = strdup(arg); char *p; - int err; + int err = 0; if (!buf) return -ENOMEM; From 255cc582e6e16191a20d54bcdbca6c91d3e90c5e Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Thu, 12 Dec 2024 10:57:42 +0000 Subject: [PATCH 188/330] ASoC: Intel: sof_sdw: Add space for a terminator into DAIs array MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The code uses the initialised member of the asoc_sdw_dailink struct to determine if a member of the array is in use. However in the case the array is completely full this will lead to an access 1 past the end of the array, expand the array by one entry to include a space for a terminator. Fixes: 27fd36aefa00 ("ASoC: Intel: sof-sdw: Add new code for parsing the snd_soc_acpi structs") Reviewed-by: Bard Liao Reviewed-by: Péter Ujfalusi Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20241212105742.1508574-1-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/intel/boards/sof_sdw.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sound/soc/intel/boards/sof_sdw.c b/sound/soc/intel/boards/sof_sdw.c index 810be7c949a5c..f3369e569abc4 100644 --- a/sound/soc/intel/boards/sof_sdw.c +++ b/sound/soc/intel/boards/sof_sdw.c @@ -1067,8 +1067,12 @@ static int sof_card_dai_links_create(struct snd_soc_card *card) return ret; } - /* One per DAI link, worst case is a DAI link for every endpoint */ - sof_dais = kcalloc(num_ends, sizeof(*sof_dais), GFP_KERNEL); + /* + * One per DAI link, worst case is a DAI link for every endpoint, also + * add one additional to act as a terminator such that code can iterate + * until it hits an uninitialised DAI. + */ + sof_dais = kcalloc(num_ends + 1, sizeof(*sof_dais), GFP_KERNEL); if (!sof_dais) return -ENOMEM; From de6b43798d9043a7c749a0428dbb02d5fff156e5 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 22 Nov 2024 15:14:35 +0100 Subject: [PATCH 189/330] i2c: riic: Always round-up when calculating bus period Currently, the RIIC driver may run the I2C bus faster than requested, which may cause subtle failures. E.g. Biju reported a measured bus speed of 450 kHz instead of the expected maximum of 400 kHz on RZ/G2L. The initial calculation of the bus period uses DIV_ROUND_UP(), to make sure the actual bus speed never becomes faster than the requested bus speed. However, the subsequent division-by-two steps do not use round-up, which may lead to a too-small period, hence a too-fast and possible out-of-spec bus speed. E.g. on RZ/Five, requesting a bus speed of 100 resp. 400 kHz will yield too-fast target bus speeds of 100806 resp. 403226 Hz instead of 97656 resp. 390625 Hz. Fix this by using DIV_ROUND_UP() in the subsequent divisions, too. Tested on RZ/A1H, RZ/A2M, and RZ/Five. Fixes: d982d66514192cdb ("i2c: riic: remove clock and frequency restrictions") Reported-by: Biju Das Signed-off-by: Geert Uytterhoeven Cc: # v4.15+ Link: https://lore.kernel.org/r/c59aea77998dfea1b4456c4b33b55ab216fcbf5e.1732284746.git.geert+renesas@glider.be Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-riic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-riic.c b/drivers/i2c/busses/i2c-riic.c index c218f73c36509..9264adc97ca9d 100644 --- a/drivers/i2c/busses/i2c-riic.c +++ b/drivers/i2c/busses/i2c-riic.c @@ -352,7 +352,7 @@ static int riic_init_hw(struct riic_dev *riic) if (brl <= (0x1F + 3)) break; - total_ticks /= 2; + total_ticks = DIV_ROUND_UP(total_ticks, 2); rate /= 2; } From c53d96a4481f42a1635b96d2c1acbb0a126bfd54 Mon Sep 17 00:00:00 2001 From: Daniil Tatianin Date: Fri, 22 Nov 2024 11:29:54 +0300 Subject: [PATCH 190/330] ACPICA: events/evxfregn: don't release the ContextMutex that was never acquired This bug was first introduced in c27f3d011b08, where the author of the patch probably meant to do DeleteMutex instead of ReleaseMutex. The mutex leak was noticed later on and fixed in e4dfe108371, but the bogus MutexRelease line was never removed, so do it now. Link: https://github.com/acpica/acpica/pull/982 Fixes: c27f3d011b08 ("ACPICA: Fix race in generic_serial_bus (I2C) and GPIO op_region parameter handling") Signed-off-by: Daniil Tatianin Link: https://patch.msgid.link/20241122082954.658356-1-d-tatianin@yandex-team.ru Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpica/evxfregn.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/acpi/acpica/evxfregn.c b/drivers/acpi/acpica/evxfregn.c index 95f78383bbdba..bff2d099f4691 100644 --- a/drivers/acpi/acpica/evxfregn.c +++ b/drivers/acpi/acpica/evxfregn.c @@ -232,8 +232,6 @@ acpi_remove_address_space_handler(acpi_handle device, /* Now we can delete the handler object */ - acpi_os_release_mutex(handler_obj->address_space. - context_mutex); acpi_ut_remove_reference(handler_obj); goto unlock_and_exit; } From 99d6af6e8a22b792e1845b186f943cd10bb4b7b0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 12 Dec 2024 08:01:52 -0700 Subject: [PATCH 191/330] io_uring/rsrc: don't put/free empty buffers If cloning of buffers fail and we have to put the ones already grabbed, check for NULL buffers and skip those. They used to be dummy ubufs, but now they are just NULL and that should be checked before reaping them. Reported-by: chase xd Link: https://lore.kernel.org/io-uring/CADZouDQ7TcKn8gz8_efnyAEp1JvU1ktRk8PWz-tO0FXUoh8VGQ@mail.gmail.com/ Fixes: d50f94d761a5 ("io_uring/rsrc: get rid of the empty node and dummy_ubuf") Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index adaae86309322..077f84684c18a 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1036,8 +1036,10 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx out_put_free: i = data.nr; while (i--) { - io_buffer_unmap(src_ctx, data.nodes[i]); - kfree(data.nodes[i]); + if (data.nodes[i]) { + io_buffer_unmap(src_ctx, data.nodes[i]); + kfree(data.nodes[i]); + } } out_unlock: io_rsrc_data_free(ctx, &data); From 2f4873f9b5f8a49113045ad91c021347486de323 Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 2 Dec 2024 11:57:27 +0000 Subject: [PATCH 192/330] block: Make bio_iov_bvec_set() accept pointer to const iov_iter Make bio_iov_bvec_set() accept a pointer to const iov_iter, which means that we can drop the undesirable casting to struct iov_iter pointer in blk_rq_map_user_bvec(). Signed-off-by: John Garry Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241202115727.2320401-1-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- block/bio.c | 2 +- block/blk-map.c | 2 +- include/linux/bio.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/block/bio.c b/block/bio.c index 699a78c85c756..d5bdc31d88d32 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1171,7 +1171,7 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty) } EXPORT_SYMBOL_GPL(__bio_release_pages); -void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) +void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter) { WARN_ON_ONCE(bio->bi_max_vecs); diff --git a/block/blk-map.c b/block/blk-map.c index b5fd1d8574615..894009b2d881b 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -574,7 +574,7 @@ static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter) bio = blk_rq_map_bio_alloc(rq, 0, GFP_KERNEL); if (!bio) return -ENOMEM; - bio_iov_bvec_set(bio, (struct iov_iter *)iter); + bio_iov_bvec_set(bio, iter); /* check that the data layout matches the hardware restrictions */ ret = bio_split_rw_at(bio, lim, &nsegs, max_bytes); diff --git a/include/linux/bio.h b/include/linux/bio.h index 60830a6a59394..7a1b3b1a8fed0 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -423,7 +423,7 @@ void __bio_add_page(struct bio *bio, struct page *page, void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len, size_t off); int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter); -void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter); +void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter); void __bio_release_pages(struct bio *bio, bool mark_dirty); extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); From ed69b28b3a5e39871ba5599992f80562d6ee59db Mon Sep 17 00:00:00 2001 From: Mirsad Todorovac Date: Thu, 21 Nov 2024 22:20:58 +0100 Subject: [PATCH 193/330] drm/xe: fix the ERR_PTR() returned on failure to allocate tiny pt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Running coccinelle spatch gave the following warning: ./drivers/gpu/drm/xe/tests/xe_migrate.c:226:5-11: inconsistent IS_ERR and PTR_ERR on line 228. The code reports PTR_ERR(pt) when IS_ERR(tiny) is checked: → 211 pt = xe_bo_create_pin_map(xe, tile, m->q->vm, XE_PAGE_SIZE, 212 ttm_bo_type_kernel, 213 XE_BO_FLAG_VRAM_IF_DGFX(tile) | 214 XE_BO_FLAG_PINNED); 215 if (IS_ERR(pt)) { 216 KUNIT_FAIL(test, "Failed to allocate fake pt: %li\n", 217 PTR_ERR(pt)); 218 goto free_big; 219 } 220 221 tiny = xe_bo_create_pin_map(xe, tile, m->q->vm, → 222 2 * SZ_4K, 223 ttm_bo_type_kernel, 224 XE_BO_FLAG_VRAM_IF_DGFX(tile) | 225 XE_BO_FLAG_PINNED); → 226 if (IS_ERR(tiny)) { → 227 KUNIT_FAIL(test, "Failed to allocate fake pt: %li\n", → 228 PTR_ERR(pt)); 229 goto free_pt; 230 } Now, the IS_ERR(tiny) and the corresponding PTR_ERR(pt) do not match. Returning PTR_ERR(tiny), as the last failed function call, seems logical. Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Signed-off-by: Mirsad Todorovac Link: https://patchwork.freedesktop.org/patch/msgid/20241121212057.1526634-2-mtodorovac69@gmail.com Signed-off-by: Rodrigo Vivi (cherry picked from commit cb57c75098c1c449a007ba301f9073f96febaaa9) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/tests/xe_migrate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/tests/xe_migrate.c b/drivers/gpu/drm/xe/tests/xe_migrate.c index 1a192a2a941b6..3bbdb362d6f0d 100644 --- a/drivers/gpu/drm/xe/tests/xe_migrate.c +++ b/drivers/gpu/drm/xe/tests/xe_migrate.c @@ -224,8 +224,8 @@ static void xe_migrate_sanity_test(struct xe_migrate *m, struct kunit *test) XE_BO_FLAG_VRAM_IF_DGFX(tile) | XE_BO_FLAG_PINNED); if (IS_ERR(tiny)) { - KUNIT_FAIL(test, "Failed to allocate fake pt: %li\n", - PTR_ERR(pt)); + KUNIT_FAIL(test, "Failed to allocate tiny fake pt: %li\n", + PTR_ERR(tiny)); goto free_pt; } From cefade70f346160f47cc24776160329e2ee63653 Mon Sep 17 00:00:00 2001 From: Daniele Ceraolo Spurio Date: Thu, 5 Dec 2024 17:50:22 -0800 Subject: [PATCH 194/330] drm/xe: Call invalidation_fence_fini for PT inval fences in error state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Invalidation_fence_init takes a PM reference, which is released in its _fini counterpart, so we need to make sure that the latter is called, even if the fence is in an error state. Since we already have a function that calls _fini() and signals the fence in the tlb inval code, we can expose that and call it from the PT code. Fixes: f002702290fc ("drm/xe: Hold a PM ref when GT TLB invalidations are inflight") Signed-off-by: Daniele Ceraolo Spurio Cc: # v6.11+ Cc: Matthew Brost Cc: Nirmoy Das Cc: Rodrigo Vivi Reviewed-by: Nirmoy Das Reviewed-by: Matthew Brost Link: https://patchwork.freedesktop.org/patch/msgid/20241206015022.1567113-1-daniele.ceraolospurio@intel.com (cherry picked from commit 65338639b79ce88aef5263cd518cde570a3c7c8e) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c | 8 ++++++++ drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h | 1 + drivers/gpu/drm/xe/xe_pt.c | 3 +-- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c index 3cb228c773cd4..6146d1776bdac 100644 --- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c +++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c @@ -65,6 +65,14 @@ invalidation_fence_signal(struct xe_device *xe, struct xe_gt_tlb_invalidation_fe __invalidation_fence_signal(xe, fence); } +void xe_gt_tlb_invalidation_fence_signal(struct xe_gt_tlb_invalidation_fence *fence) +{ + if (WARN_ON_ONCE(!fence->gt)) + return; + + __invalidation_fence_signal(gt_to_xe(fence->gt), fence); +} + static void xe_gt_tlb_fence_timeout(struct work_struct *work) { struct xe_gt *gt = container_of(work, struct xe_gt, diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h index f430d5797af70..00b1c6c01e8d9 100644 --- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h +++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h @@ -28,6 +28,7 @@ int xe_guc_tlb_invalidation_done_handler(struct xe_guc *guc, u32 *msg, u32 len); void xe_gt_tlb_invalidation_fence_init(struct xe_gt *gt, struct xe_gt_tlb_invalidation_fence *fence, bool stack); +void xe_gt_tlb_invalidation_fence_signal(struct xe_gt_tlb_invalidation_fence *fence); static inline void xe_gt_tlb_invalidation_fence_wait(struct xe_gt_tlb_invalidation_fence *fence) diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c index f27f579f4d85a..797576690356f 100644 --- a/drivers/gpu/drm/xe/xe_pt.c +++ b/drivers/gpu/drm/xe/xe_pt.c @@ -1333,8 +1333,7 @@ static void invalidation_fence_cb(struct dma_fence *fence, queue_work(system_wq, &ifence->work); } else { ifence->base.base.error = ifence->fence->error; - dma_fence_signal(&ifence->base.base); - dma_fence_put(&ifence->base.base); + xe_gt_tlb_invalidation_fence_signal(&ifence->base); } dma_fence_put(ifence->fence); } From d7b028656c29b22fcde1c6ee1df5b28fbba987b5 Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Mon, 9 Dec 2024 15:27:35 -0800 Subject: [PATCH 195/330] drm/xe/reg_sr: Remove register pool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit That pool implementation doesn't really work: if the krealloc happens to move the memory and return another address, the entries in the xarray become invalid, leading to use-after-free later: BUG: KASAN: slab-use-after-free in xe_reg_sr_apply_mmio+0x570/0x760 [xe] Read of size 4 at addr ffff8881244b2590 by task modprobe/2753 Allocated by task 2753: kasan_save_stack+0x39/0x70 kasan_save_track+0x14/0x40 kasan_save_alloc_info+0x37/0x60 __kasan_kmalloc+0xc3/0xd0 __kmalloc_node_track_caller_noprof+0x200/0x6d0 krealloc_noprof+0x229/0x380 Simplify the code to fix the bug. A better pooling strategy may be added back later if needed. Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Reviewed-by: Matt Roper Link: https://patchwork.freedesktop.org/patch/msgid/20241209232739.147417-2-lucas.demarchi@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit e5283bd4dfecbd3335f43b62a68e24dae23f59e4) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_reg_sr.c | 31 ++++++---------------------- drivers/gpu/drm/xe/xe_reg_sr_types.h | 6 ------ 2 files changed, 6 insertions(+), 31 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_reg_sr.c b/drivers/gpu/drm/xe/xe_reg_sr.c index e1a0e27cda14c..c13123008e903 100644 --- a/drivers/gpu/drm/xe/xe_reg_sr.c +++ b/drivers/gpu/drm/xe/xe_reg_sr.c @@ -27,46 +27,27 @@ #include "xe_reg_whitelist.h" #include "xe_rtp_types.h" -#define XE_REG_SR_GROW_STEP_DEFAULT 16 - static void reg_sr_fini(struct drm_device *drm, void *arg) { struct xe_reg_sr *sr = arg; + struct xe_reg_sr_entry *entry; + unsigned long reg; + + xa_for_each(&sr->xa, reg, entry) + kfree(entry); xa_destroy(&sr->xa); - kfree(sr->pool.arr); - memset(&sr->pool, 0, sizeof(sr->pool)); } int xe_reg_sr_init(struct xe_reg_sr *sr, const char *name, struct xe_device *xe) { xa_init(&sr->xa); - memset(&sr->pool, 0, sizeof(sr->pool)); - sr->pool.grow_step = XE_REG_SR_GROW_STEP_DEFAULT; sr->name = name; return drmm_add_action_or_reset(&xe->drm, reg_sr_fini, sr); } EXPORT_SYMBOL_IF_KUNIT(xe_reg_sr_init); -static struct xe_reg_sr_entry *alloc_entry(struct xe_reg_sr *sr) -{ - if (sr->pool.used == sr->pool.allocated) { - struct xe_reg_sr_entry *arr; - - arr = krealloc_array(sr->pool.arr, - ALIGN(sr->pool.allocated + 1, sr->pool.grow_step), - sizeof(*arr), GFP_KERNEL); - if (!arr) - return NULL; - - sr->pool.arr = arr; - sr->pool.allocated += sr->pool.grow_step; - } - - return &sr->pool.arr[sr->pool.used++]; -} - static bool compatible_entries(const struct xe_reg_sr_entry *e1, const struct xe_reg_sr_entry *e2) { @@ -112,7 +93,7 @@ int xe_reg_sr_add(struct xe_reg_sr *sr, return 0; } - pentry = alloc_entry(sr); + pentry = kmalloc(sizeof(*pentry), GFP_KERNEL); if (!pentry) { ret = -ENOMEM; goto fail; diff --git a/drivers/gpu/drm/xe/xe_reg_sr_types.h b/drivers/gpu/drm/xe/xe_reg_sr_types.h index ad48a52b824a1..ebe11f237fa26 100644 --- a/drivers/gpu/drm/xe/xe_reg_sr_types.h +++ b/drivers/gpu/drm/xe/xe_reg_sr_types.h @@ -20,12 +20,6 @@ struct xe_reg_sr_entry { }; struct xe_reg_sr { - struct { - struct xe_reg_sr_entry *arr; - unsigned int used; - unsigned int allocated; - unsigned int grow_step; - } pool; struct xarray xa; const char *name; From 32ed1205682ec42ad51e55b239a190d55476aeb8 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 11 Dec 2024 14:07:03 +0000 Subject: [PATCH 196/330] arm64: stacktrace: Skip reporting LR at exception boundaries Aishwarya reports that warnings are sometimes seen when running the ftrace kselftests, e.g. | WARNING: CPU: 5 PID: 2066 at arch/arm64/kernel/stacktrace.c:141 arch_stack_walk+0x4a0/0x4c0 | Modules linked in: | CPU: 5 UID: 0 PID: 2066 Comm: ftracetest Not tainted 6.13.0-rc2 #2 | Hardware name: linux,dummy-virt (DT) | pstate: 604000c5 (nZCv daIF +PAN -UAO -TCO -DIT -SSBS BTYPE=--) | pc : arch_stack_walk+0x4a0/0x4c0 | lr : arch_stack_walk+0x248/0x4c0 | sp : ffff800083643d20 | x29: ffff800083643dd0 x28: ffff00007b891400 x27: ffff00007b891928 | x26: 0000000000000001 x25: 00000000000000c0 x24: ffff800082f39d80 | x23: ffff80008003ee8c x22: ffff80008004baa8 x21: ffff8000800533e0 | x20: ffff800083643e10 x19: ffff80008003eec8 x18: 0000000000000000 | x17: 0000000000000000 x16: ffff800083640000 x15: 0000000000000000 | x14: 02a37a802bbb8a92 x13: 00000000000001a9 x12: 0000000000000001 | x11: ffff800082ffad60 x10: ffff800083643d20 x9 : ffff80008003eed0 | x8 : ffff80008004baa8 x7 : ffff800086f2be80 x6 : ffff0000057cf000 | x5 : 0000000000000000 x4 : 0000000000000000 x3 : ffff800086f2b690 | x2 : ffff80008004baa8 x1 : ffff80008004baa8 x0 : ffff80008004baa8 | Call trace: | arch_stack_walk+0x4a0/0x4c0 (P) | arch_stack_walk+0x248/0x4c0 (L) | profile_pc+0x44/0x80 | profile_tick+0x50/0x80 (F) | tick_nohz_handler+0xcc/0x160 (F) | __hrtimer_run_queues+0x2ac/0x340 (F) | hrtimer_interrupt+0xf4/0x268 (F) | arch_timer_handler_virt+0x34/0x60 (F) | handle_percpu_devid_irq+0x88/0x220 (F) | generic_handle_domain_irq+0x34/0x60 (F) | gic_handle_irq+0x54/0x140 (F) | call_on_irq_stack+0x24/0x58 (F) | do_interrupt_handler+0x88/0x98 | el1_interrupt+0x34/0x68 (F) | el1h_64_irq_handler+0x18/0x28 | el1h_64_irq+0x6c/0x70 | queued_spin_lock_slowpath+0x78/0x460 (P) The warning in question is: WARN_ON_ONCE(state->common.pc == orig_pc)) ... in kunwind_recover_return_address(), which is triggered when return_to_handler() is encountered in the trace, but ftrace_graph_ret_addr() cannot find a corresponding original return address on the fgraph return stack. This happens because the stacktrace code encounters an exception boundary where the LR was not live at the time of the exception, but the LR happens to contain return_to_handler(); either because the task recently returned there, or due to unfortunate usage of the LR at a scratch register. In such cases attempts to recover the return address via ftrace_graph_ret_addr() may fail, triggering the WARN_ON_ONCE() above and aborting the unwind (hence the stacktrace terminating after reporting the PC at the time of the exception). Handling unreliable LR values in these cases is likely to require some larger rework, so for the moment avoid this problem by restoring the old behaviour of skipping the LR at exception boundaries, which the stacktrace code did prior to commit: c2c6b27b5aa14fa2 ("arm64: stacktrace: unwind exception boundaries") This commit is effectively a partial revert, keeping the structures and logic to explicitly identify exception boundaries while still skipping reporting of the LR. The logic to explicitly identify exception boundaries is still useful for general robustness and as a building block for future support for RELIABLE_STACKTRACE. Fixes: c2c6b27b5aa1 ("arm64: stacktrace: unwind exception boundaries") Signed-off-by: Mark Rutland Reported-by: Aishwarya TCV Cc: Will Deacon Link: https://lore.kernel.org/r/20241211140704.2498712-2-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/stacktrace.c | 24 ++---------------------- 1 file changed, 2 insertions(+), 22 deletions(-) diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index caef85462acb6..4a08ad8158380 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -26,7 +26,6 @@ enum kunwind_source { KUNWIND_SOURCE_CALLER, KUNWIND_SOURCE_TASK, KUNWIND_SOURCE_REGS_PC, - KUNWIND_SOURCE_REGS_LR, }; union unwind_flags { @@ -178,23 +177,8 @@ int kunwind_next_regs_pc(struct kunwind_state *state) state->regs = regs; state->common.pc = regs->pc; state->common.fp = regs->regs[29]; - state->source = KUNWIND_SOURCE_REGS_PC; - return 0; -} - -static __always_inline int -kunwind_next_regs_lr(struct kunwind_state *state) -{ - /* - * The stack for the regs was consumed by kunwind_next_regs_pc(), so we - * cannot consume that again here, but we know the regs are safe to - * access. - */ - state->common.pc = state->regs->regs[30]; - state->common.fp = state->regs->regs[29]; state->regs = NULL; - state->source = KUNWIND_SOURCE_REGS_LR; - + state->source = KUNWIND_SOURCE_REGS_PC; return 0; } @@ -274,11 +258,8 @@ kunwind_next(struct kunwind_state *state) case KUNWIND_SOURCE_FRAME: case KUNWIND_SOURCE_CALLER: case KUNWIND_SOURCE_TASK: - case KUNWIND_SOURCE_REGS_LR: - err = kunwind_next_frame_record(state); - break; case KUNWIND_SOURCE_REGS_PC: - err = kunwind_next_regs_lr(state); + err = kunwind_next_frame_record(state); break; default: err = -EINVAL; @@ -436,7 +417,6 @@ static const char *state_source_string(const struct kunwind_state *state) case KUNWIND_SOURCE_CALLER: return "C"; case KUNWIND_SOURCE_TASK: return "T"; case KUNWIND_SOURCE_REGS_PC: return "P"; - case KUNWIND_SOURCE_REGS_LR: return "L"; default: return "U"; } } From 65ac33bed8b9255213b08ed153dbe9c0ca3535e6 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 11 Dec 2024 14:07:04 +0000 Subject: [PATCH 197/330] arm64: stacktrace: Don't WARN when unwinding other tasks The arm64 stacktrace code has a few error conditions where a WARN_ON_ONCE() is triggered before the stacktrace is terminated and an error is returned to the caller. The conditions shouldn't be triggered when unwinding the current task, but it is possible to trigger these when unwinding another task which is not blocked, as the stack of that task is concurrently modified. Kent reports that these warnings can be triggered while running filesystem tests on bcachefs, which calls the stacktrace code directly. To produce a meaningful stacktrace of another task, the task in question should be blocked, but the stacktrace code is expected to be robust to cases where it is not blocked. Note that this is purely about not unuduly scaring the user and/or crashing the kernel; stacktraces in such cases are meaningless and may leak kernel secrets from the stack of the task being unwound. Ideally we'd pin the task in a blocked state during the unwind, as we do for /proc/${PID}/wchan since commit: 42a20f86dc19f928 ("sched: Add wrapper for get_wchan() to keep task blocked") ... but a bunch of places don't do that, notably /proc/${PID}/stack, where we don't pin the task in a blocked state, but do restrict the output to privileged users since commit: f8a00cef17206ecd ("proc: restrict kernel stack dumps to root") ... and so it's possible to trigger these warnings accidentally, e.g. by reading /proc/*/stack (as root): | for n in $(seq 1 10); do | while true; do cat /proc/*/stack > /dev/null 2>&1; done & | done | ------------[ cut here ]------------ | WARNING: CPU: 3 PID: 166 at arch/arm64/kernel/stacktrace.c:207 arch_stack_walk+0x1c8/0x370 | Modules linked in: | CPU: 3 UID: 0 PID: 166 Comm: cat Not tainted 6.13.0-rc2-00003-g3dafa7a7925d #2 | Hardware name: linux,dummy-virt (DT) | pstate: 81400005 (Nzcv daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--) | pc : arch_stack_walk+0x1c8/0x370 | lr : arch_stack_walk+0x1b0/0x370 | sp : ffff800080773890 | x29: ffff800080773930 x28: fff0000005c44500 x27: fff00000058fa038 | x26: 000000007ffff000 x25: 0000000000000000 x24: 0000000000000000 | x23: ffffa35a8d9600ec x22: 0000000000000000 x21: fff00000043a33c0 | x20: ffff800080773970 x19: ffffa35a8d960168 x18: 0000000000000000 | x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000 | x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000 | x11: 0000000000000000 x10: 0000000000000000 x9 : 0000000000000000 | x8 : ffff8000807738e0 x7 : ffff8000806e3800 x6 : ffff8000806e3818 | x5 : ffff800080773920 x4 : ffff8000806e4000 x3 : ffff8000807738e0 | x2 : 0000000000000018 x1 : ffff8000806e3800 x0 : 0000000000000000 | Call trace: | arch_stack_walk+0x1c8/0x370 (P) | stack_trace_save_tsk+0x8c/0x108 | proc_pid_stack+0xb0/0x134 | proc_single_show+0x60/0x120 | seq_read_iter+0x104/0x438 | seq_read+0xf8/0x140 | vfs_read+0xc4/0x31c | ksys_read+0x70/0x108 | __arm64_sys_read+0x1c/0x28 | invoke_syscall+0x48/0x104 | el0_svc_common.constprop.0+0x40/0xe0 | do_el0_svc+0x1c/0x28 | el0_svc+0x30/0xcc | el0t_64_sync_handler+0x10c/0x138 | el0t_64_sync+0x198/0x19c | ---[ end trace 0000000000000000 ]--- Fix this by only warning when unwinding the current task. When unwinding another task the error conditions will be handled by returning an error without producing a warning. The two warnings in kunwind_next_frame_record_meta() were added recently as part of commit: c2c6b27b5aa14fa2 ("arm64: stacktrace: unwind exception boundaries") The warning when recovering the fgraph return address has changed form many times, but was originally introduced back in commit: 9f416319f40cd857 ("arm64: fix unwind_frame() for filtered out fn for function graph tracing") Fixes: c2c6b27b5aa1 ("arm64: stacktrace: unwind exception boundaries") Fixes: 9f416319f40c ("arm64: fix unwind_frame() for filtered out fn for function graph tracing") Signed-off-by: Mark Rutland Reported-by: Kent Overstreet Cc: Kees Cook Cc: Peter Zijlstra Cc: Will Deacon Link: https://lore.kernel.org/r/20241211140704.2498712-3-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/stacktrace.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 4a08ad8158380..1d9d51d7627fd 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -137,8 +137,10 @@ kunwind_recover_return_address(struct kunwind_state *state) orig_pc = ftrace_graph_ret_addr(state->task, &state->graph_idx, state->common.pc, (void *)state->common.fp); - if (WARN_ON_ONCE(state->common.pc == orig_pc)) + if (state->common.pc == orig_pc) { + WARN_ON_ONCE(state->task == current); return -EINVAL; + } state->common.pc = orig_pc; state->flags.fgraph = 1; } @@ -199,12 +201,12 @@ kunwind_next_frame_record_meta(struct kunwind_state *state) case FRAME_META_TYPE_FINAL: if (meta == &task_pt_regs(tsk)->stackframe) return -ENOENT; - WARN_ON_ONCE(1); + WARN_ON_ONCE(tsk == current); return -EINVAL; case FRAME_META_TYPE_PT_REGS: return kunwind_next_regs_pc(state); default: - WARN_ON_ONCE(1); + WARN_ON_ONCE(tsk == current); return -EINVAL; } } From b10a1e5643e505c367c7e16aa6d8a9a0dc07354b Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 3 Dec 2024 15:28:21 +0800 Subject: [PATCH 198/330] erofs: fix rare pcluster memory leak after unmounting There may still exist some pcluster with valid reference counts during unmounting. Instead of introducing another synchronization primitive, just try again as unmounting is relatively rare. This approach is similar to z_erofs_cache_invalidate_folio(). It was also reported by syzbot as a UAF due to commit f5ad9f9a603f ("erofs: free pclusters if no cached folio is attached"): BUG: KASAN: slab-use-after-free in do_raw_spin_trylock+0x72/0x1f0 kernel/locking/spinlock_debug.c:123 .. queued_spin_trylock include/asm-generic/qspinlock.h:92 [inline] do_raw_spin_trylock+0x72/0x1f0 kernel/locking/spinlock_debug.c:123 __raw_spin_trylock include/linux/spinlock_api_smp.h:89 [inline] _raw_spin_trylock+0x20/0x80 kernel/locking/spinlock.c:138 spin_trylock include/linux/spinlock.h:361 [inline] z_erofs_put_pcluster fs/erofs/zdata.c:959 [inline] z_erofs_decompress_pcluster fs/erofs/zdata.c:1403 [inline] z_erofs_decompress_queue+0x3798/0x3ef0 fs/erofs/zdata.c:1425 z_erofs_decompressqueue_work+0x99/0xe0 fs/erofs/zdata.c:1437 process_one_work kernel/workqueue.c:3229 [inline] process_scheduled_works+0xa68/0x1840 kernel/workqueue.c:3310 worker_thread+0x870/0xd30 kernel/workqueue.c:3391 kthread+0x2f2/0x390 kernel/kthread.c:389 ret_from_fork+0x4d/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 However, it seems a long outstanding memory leak. Fix it now. Fixes: f5ad9f9a603f ("erofs: free pclusters if no cached folio is attached") Reported-by: syzbot+7ff87b095e7ca0c5ac39@syzkaller.appspotmail.com Closes: https://lore.kernel.org/r/674c1235.050a0220.ad585.0032.GAE@google.com Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20241203072821.1885740-1-hsiangkao@linux.alibaba.com --- fs/erofs/zutil.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/erofs/zutil.c b/fs/erofs/zutil.c index 75704f58ecfa9..0dd65cefce33e 100644 --- a/fs/erofs/zutil.c +++ b/fs/erofs/zutil.c @@ -230,9 +230,10 @@ void erofs_shrinker_unregister(struct super_block *sb) struct erofs_sb_info *const sbi = EROFS_SB(sb); mutex_lock(&sbi->umount_mutex); - /* clean up all remaining pclusters in memory */ - z_erofs_shrink_scan(sbi, ~0UL); - + while (!xa_empty(&sbi->managed_pslots)) { + z_erofs_shrink_scan(sbi, ~0UL); + cond_resched(); + } spin_lock(&erofs_sb_list_lock); list_del(&sbi->list); spin_unlock(&erofs_sb_list_lock); From 1a2180f6859c73c674809f9f82e36c94084682ba Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 27 Nov 2024 16:52:36 +0800 Subject: [PATCH 199/330] erofs: fix PSI memstall accounting Max Kellermann recently reported psi_group_cpu.tasks[NR_MEMSTALL] is incorrect in the 6.11.9 kernel. The root cause appears to be that, since the problematic commit, bio can be NULL, causing psi_memstall_leave() to be skipped in z_erofs_submit_queue(). Reported-by: Max Kellermann Closes: https://lore.kernel.org/r/CAKPOu+8tvSowiJADW2RuKyofL_CSkm_SuyZA7ME5vMLWmL6pqw@mail.gmail.com Fixes: 9e2f9d34dd12 ("erofs: handle overlapped pclusters out of crafted images properly") Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20241127085236.3538334-1-hsiangkao@linux.alibaba.com --- fs/erofs/zdata.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 01f1475054874..19ef4ff2a1345 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1792,9 +1792,9 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, erofs_fscache_submit_bio(bio); else submit_bio(bio); - if (memstall) - psi_memstall_leave(&pflags); } + if (memstall) + psi_memstall_leave(&pflags); /* * although background is preferred, no one is pending for submission. From 6d1917045ef4f584593ad30b3dbb887d95fc331f Mon Sep 17 00:00:00 2001 From: Yue Hu Date: Wed, 11 Dec 2024 16:09:18 +0800 Subject: [PATCH 200/330] MAINTAINERS: erofs: update Yue Hu's email address The current email address is no longer valid, use my gmail instead. Signed-off-by: Yue Hu Acked-by: Gao Xiang Acked-by: Chao Yu Link: https://lore.kernel.org/r/20241211080918.8512-1-zbestahu@163.com Signed-off-by: Gao Xiang --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 1e930c7a58b13..ec84534ff87ad 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8451,7 +8451,7 @@ F: include/video/s1d13xxxfb.h EROFS FILE SYSTEM M: Gao Xiang M: Chao Yu -R: Yue Hu +R: Yue Hu R: Jeffle Xu R: Sandeep Dhavale L: linux-erofs@lists.ozlabs.org From e2de3c1bf6a0c99b089bd706a62da8f988918858 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 12 Dec 2024 21:35:01 +0800 Subject: [PATCH 201/330] erofs: add erofs_sb_free() helper Unify the common parts of erofs_fc_free() and erofs_kill_sb() as erofs_sb_free(). Thus, fput() in erofs_fc_get_tree() is no longer needed, too. Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20241212133504.2047178-1-hsiangkao@linux.alibaba.com --- fs/erofs/super.c | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/fs/erofs/super.c b/fs/erofs/super.c index c235a8e4315ea..de8e3ecc6381d 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -703,16 +703,19 @@ static int erofs_fc_get_tree(struct fs_context *fc) GET_TREE_BDEV_QUIET_LOOKUP : 0); #ifdef CONFIG_EROFS_FS_BACKED_BY_FILE if (ret == -ENOTBLK) { + struct file *file; + if (!fc->source) return invalf(fc, "No source specified"); - sbi->fdev = filp_open(fc->source, O_RDONLY | O_LARGEFILE, 0); - if (IS_ERR(sbi->fdev)) - return PTR_ERR(sbi->fdev); + + file = filp_open(fc->source, O_RDONLY | O_LARGEFILE, 0); + if (IS_ERR(file)) + return PTR_ERR(file); + sbi->fdev = file; if (S_ISREG(file_inode(sbi->fdev)->i_mode) && sbi->fdev->f_mapping->a_ops->read_folio) return get_tree_nodev(fc, erofs_fc_fill_super); - fput(sbi->fdev); } #endif return ret; @@ -763,19 +766,24 @@ static void erofs_free_dev_context(struct erofs_dev_context *devs) kfree(devs); } -static void erofs_fc_free(struct fs_context *fc) +static void erofs_sb_free(struct erofs_sb_info *sbi) { - struct erofs_sb_info *sbi = fc->s_fs_info; - - if (!sbi) - return; - erofs_free_dev_context(sbi->devs); kfree(sbi->fsid); kfree(sbi->domain_id); + if (sbi->fdev) + fput(sbi->fdev); kfree(sbi); } +static void erofs_fc_free(struct fs_context *fc) +{ + struct erofs_sb_info *sbi = fc->s_fs_info; + + if (sbi) /* free here if an error occurs before transferring to sb */ + erofs_sb_free(sbi); +} + static const struct fs_context_operations erofs_context_ops = { .parse_param = erofs_fc_parse_param, .get_tree = erofs_fc_get_tree, @@ -813,15 +821,9 @@ static void erofs_kill_sb(struct super_block *sb) kill_anon_super(sb); else kill_block_super(sb); - - erofs_free_dev_context(sbi->devs); fs_put_dax(sbi->dax_dev, NULL); erofs_fscache_unregister_fs(sb); - kfree(sbi->fsid); - kfree(sbi->domain_id); - if (sbi->fdev) - fput(sbi->fdev); - kfree(sbi); + erofs_sb_free(sbi); sb->s_fs_info = NULL; } From 57e420c84f9ab55ba4c5e2ae9c5f6c8e1ea834d2 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 12 Dec 2024 10:13:29 -0700 Subject: [PATCH 202/330] blk-iocost: Avoid using clamp() on inuse in __propagate_weights() After a recent change to clamp() and its variants [1] that increases the coverage of the check that high is greater than low because it can be done through inlining, certain build configurations (such as s390 defconfig) fail to build with clang with: block/blk-iocost.c:1101:11: error: call to '__compiletime_assert_557' declared with 'error' attribute: clamp() low limit 1 greater than high limit active 1101 | inuse = clamp_t(u32, inuse, 1, active); | ^ include/linux/minmax.h:218:36: note: expanded from macro 'clamp_t' 218 | #define clamp_t(type, val, lo, hi) __careful_clamp(type, val, lo, hi) | ^ include/linux/minmax.h:195:2: note: expanded from macro '__careful_clamp' 195 | __clamp_once(type, val, lo, hi, __UNIQUE_ID(v_), __UNIQUE_ID(l_), __UNIQUE_ID(h_)) | ^ include/linux/minmax.h:188:2: note: expanded from macro '__clamp_once' 188 | BUILD_BUG_ON_MSG(statically_true(ulo > uhi), \ | ^ __propagate_weights() is called with an active value of zero in ioc_check_iocgs(), which results in the high value being less than the low value, which is undefined because the value returned depends on the order of the comparisons. The purpose of this expression is to ensure inuse is not more than active and at least 1. This could be written more simply with a ternary expression that uses min(inuse, active) as the condition so that the value of that condition can be used if it is not zero and one if it is. Do this conversion to resolve the error and add a comment to deter people from turning this back into clamp(). Fixes: 7caa47151ab2 ("blkcg: implement blk-iocost") Link: https://lore.kernel.org/r/34d53778977747f19cce2abb287bb3e6@AcuMS.aculab.com/ [1] Suggested-by: David Laight Reported-by: Linux Kernel Functional Testing Closes: https://lore.kernel.org/llvm/CA+G9fYsD7mw13wredcZn0L-KBA3yeoVSTuxnss-AEWMN3ha0cA@mail.gmail.com/ Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202412120322.3GfVe3vF-lkp@intel.com/ Signed-off-by: Nathan Chancellor Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-iocost.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 384aa15e8260b..a5894ec9696e7 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -1098,7 +1098,14 @@ static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse, inuse = DIV64_U64_ROUND_UP(active * iocg->child_inuse_sum, iocg->child_active_sum); } else { - inuse = clamp_t(u32, inuse, 1, active); + /* + * It may be tempting to turn this into a clamp expression with + * a lower limit of 1 but active may be 0, which cannot be used + * as an upper limit in that situation. This expression allows + * active to clamp inuse unless it is 0, in which case inuse + * becomes 1. + */ + inuse = min(inuse, active) ?: 1; } iocg->last_inuse = iocg->inuse; From ac6542ad92759cda383ad62b4e4cbfc28136abc1 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Wed, 11 Dec 2024 23:07:10 -0800 Subject: [PATCH 203/330] bpf: fix null dereference when computing changes_pkt_data of prog w/o subprogs bpf_prog_aux->func field might be NULL if program does not have subprograms except for main sub-program. The fixed commit does bpf_prog_aux->func access unconditionally, which might lead to null pointer dereference. The bug could be triggered by replacing the following BPF program: SEC("tc") int main_changes(struct __sk_buff *sk) { bpf_skb_pull_data(sk, 0); return 0; } With the following BPF program: SEC("freplace") long changes_pkt_data(struct __sk_buff *sk) { return bpf_skb_pull_data(sk, 0); } bpf_prog_aux instance itself represents the main sub-program, use this property to fix the bug. Fixes: 81f6d0530ba0 ("bpf: check changes_pkt_data property for extension programs") Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202412111822.qGw6tOyB-lkp@intel.com/ Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20241212070711.427443-1-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c2e5d0e6e3d01..5e541339b2f6d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -22193,6 +22193,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, } if (tgt_prog) { struct bpf_prog_aux *aux = tgt_prog->aux; + bool tgt_changes_pkt_data; if (bpf_prog_is_dev_bound(prog->aux) && !bpf_prog_dev_bound_match(prog, tgt_prog)) { @@ -22227,8 +22228,10 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, "Extension programs should be JITed\n"); return -EINVAL; } - if (prog->aux->changes_pkt_data && - !aux->func[subprog]->aux->changes_pkt_data) { + tgt_changes_pkt_data = aux->func + ? aux->func[subprog]->aux->changes_pkt_data + : aux->changes_pkt_data; + if (prog->aux->changes_pkt_data && !tgt_changes_pkt_data) { bpf_log(log, "Extension program changes packet data, while original does not\n"); return -EINVAL; From 04789af756a4a43e72986185f66f148e65b32fed Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Wed, 11 Dec 2024 23:07:11 -0800 Subject: [PATCH 204/330] selftests/bpf: extend changes_pkt_data with cases w/o subprograms Extend changes_pkt_data tests with test cases freplacing the main program that does not have subprograms. Try four combinations when both main program and replacement do and do not change packet data. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20241212070711.427443-2-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- .../bpf/prog_tests/changes_pkt_data.c | 55 +++++++++++++++---- .../selftests/bpf/progs/changes_pkt_data.c | 27 ++++++--- .../bpf/progs/changes_pkt_data_freplace.c | 6 +- 3 files changed, 66 insertions(+), 22 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/changes_pkt_data.c b/tools/testing/selftests/bpf/prog_tests/changes_pkt_data.c index c0c7202f6c5c5..7526de3790814 100644 --- a/tools/testing/selftests/bpf/prog_tests/changes_pkt_data.c +++ b/tools/testing/selftests/bpf/prog_tests/changes_pkt_data.c @@ -10,10 +10,14 @@ static void print_verifier_log(const char *log) fprintf(stdout, "VERIFIER LOG:\n=============\n%s=============\n", log); } -static void test_aux(const char *main_prog_name, const char *freplace_prog_name, bool expect_load) +static void test_aux(const char *main_prog_name, + const char *to_be_replaced, + const char *replacement, + bool expect_load) { struct changes_pkt_data_freplace *freplace = NULL; struct bpf_program *freplace_prog = NULL; + struct bpf_program *main_prog = NULL; LIBBPF_OPTS(bpf_object_open_opts, opts); struct changes_pkt_data *main = NULL; char log[16*1024]; @@ -26,6 +30,10 @@ static void test_aux(const char *main_prog_name, const char *freplace_prog_name, main = changes_pkt_data__open_opts(&opts); if (!ASSERT_OK_PTR(main, "changes_pkt_data__open")) goto out; + main_prog = bpf_object__find_program_by_name(main->obj, main_prog_name); + if (!ASSERT_OK_PTR(main_prog, "main_prog")) + goto out; + bpf_program__set_autoload(main_prog, true); err = changes_pkt_data__load(main); print_verifier_log(log); if (!ASSERT_OK(err, "changes_pkt_data__load")) @@ -33,14 +41,14 @@ static void test_aux(const char *main_prog_name, const char *freplace_prog_name, freplace = changes_pkt_data_freplace__open_opts(&opts); if (!ASSERT_OK_PTR(freplace, "changes_pkt_data_freplace__open")) goto out; - freplace_prog = bpf_object__find_program_by_name(freplace->obj, freplace_prog_name); + freplace_prog = bpf_object__find_program_by_name(freplace->obj, replacement); if (!ASSERT_OK_PTR(freplace_prog, "freplace_prog")) goto out; bpf_program__set_autoload(freplace_prog, true); bpf_program__set_autoattach(freplace_prog, true); bpf_program__set_attach_target(freplace_prog, - bpf_program__fd(main->progs.dummy), - main_prog_name); + bpf_program__fd(main_prog), + to_be_replaced); err = changes_pkt_data_freplace__load(freplace); print_verifier_log(log); if (expect_load) { @@ -62,15 +70,38 @@ static void test_aux(const char *main_prog_name, const char *freplace_prog_name, * that either do or do not. It is only ok to freplace subprograms * that do not change packet data with those that do not as well. * The below tests check outcomes for each combination of such freplace. + * Also test a case when main subprogram itself is replaced and is a single + * subprogram in a program. */ void test_changes_pkt_data_freplace(void) { - if (test__start_subtest("changes_with_changes")) - test_aux("changes_pkt_data", "changes_pkt_data", true); - if (test__start_subtest("changes_with_doesnt_change")) - test_aux("changes_pkt_data", "does_not_change_pkt_data", true); - if (test__start_subtest("doesnt_change_with_changes")) - test_aux("does_not_change_pkt_data", "changes_pkt_data", false); - if (test__start_subtest("doesnt_change_with_doesnt_change")) - test_aux("does_not_change_pkt_data", "does_not_change_pkt_data", true); + struct { + const char *main; + const char *to_be_replaced; + bool changes; + } mains[] = { + { "main_with_subprogs", "changes_pkt_data", true }, + { "main_with_subprogs", "does_not_change_pkt_data", false }, + { "main_changes", "main_changes", true }, + { "main_does_not_change", "main_does_not_change", false }, + }; + struct { + const char *func; + bool changes; + } replacements[] = { + { "changes_pkt_data", true }, + { "does_not_change_pkt_data", false } + }; + char buf[64]; + + for (int i = 0; i < ARRAY_SIZE(mains); ++i) { + for (int j = 0; j < ARRAY_SIZE(replacements); ++j) { + snprintf(buf, sizeof(buf), "%s_with_%s", + mains[i].to_be_replaced, replacements[j].func); + if (!test__start_subtest(buf)) + continue; + test_aux(mains[i].main, mains[i].to_be_replaced, replacements[j].func, + mains[i].changes || !replacements[j].changes); + } + } } diff --git a/tools/testing/selftests/bpf/progs/changes_pkt_data.c b/tools/testing/selftests/bpf/progs/changes_pkt_data.c index f87da8e9d6b33..43cada48b28ad 100644 --- a/tools/testing/selftests/bpf/progs/changes_pkt_data.c +++ b/tools/testing/selftests/bpf/progs/changes_pkt_data.c @@ -4,22 +4,35 @@ #include __noinline -long changes_pkt_data(struct __sk_buff *sk, __u32 len) +long changes_pkt_data(struct __sk_buff *sk) { - return bpf_skb_pull_data(sk, len); + return bpf_skb_pull_data(sk, 0); } __noinline __weak -long does_not_change_pkt_data(struct __sk_buff *sk, __u32 len) +long does_not_change_pkt_data(struct __sk_buff *sk) { return 0; } -SEC("tc") -int dummy(struct __sk_buff *sk) +SEC("?tc") +int main_with_subprogs(struct __sk_buff *sk) +{ + changes_pkt_data(sk); + does_not_change_pkt_data(sk); + return 0; +} + +SEC("?tc") +int main_changes(struct __sk_buff *sk) +{ + bpf_skb_pull_data(sk, 0); + return 0; +} + +SEC("?tc") +int main_does_not_change(struct __sk_buff *sk) { - changes_pkt_data(sk, 0); - does_not_change_pkt_data(sk, 0); return 0; } diff --git a/tools/testing/selftests/bpf/progs/changes_pkt_data_freplace.c b/tools/testing/selftests/bpf/progs/changes_pkt_data_freplace.c index 0e525beb8603c..f9a622705f1b3 100644 --- a/tools/testing/selftests/bpf/progs/changes_pkt_data_freplace.c +++ b/tools/testing/selftests/bpf/progs/changes_pkt_data_freplace.c @@ -4,13 +4,13 @@ #include SEC("?freplace") -long changes_pkt_data(struct __sk_buff *sk, __u32 len) +long changes_pkt_data(struct __sk_buff *sk) { - return bpf_skb_pull_data(sk, len); + return bpf_skb_pull_data(sk, 0); } SEC("?freplace") -long does_not_change_pkt_data(struct __sk_buff *sk, __u32 len) +long does_not_change_pkt_data(struct __sk_buff *sk) { return 0; } From 659b9ba7cb2d7adb64618b87ddfaa528a143766e Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 12 Dec 2024 01:20:49 -0800 Subject: [PATCH 205/330] bpf: Check size for BTF-based ctx access of pointer members Robert Morris reported the following program type which passes the verifier in [0]: SEC("struct_ops/bpf_cubic_init") void BPF_PROG(bpf_cubic_init, struct sock *sk) { asm volatile("r2 = *(u16*)(r1 + 0)"); // verifier should demand u64 asm volatile("*(u32 *)(r2 +1504) = 0"); // 1280 in some configs } The second line may or may not work, but the first instruction shouldn't pass, as it's a narrow load into the context structure of the struct ops callback. The code falls back to btf_ctx_access to ensure correctness and obtaining the types of pointers. Ensure that the size of the access is correctly checked to be 8 bytes, otherwise the verifier thinks the narrow load obtained a trusted BTF pointer and will permit loads/stores as it sees fit. Perform the check on size after we've verified that the load is for a pointer field, as for scalar values narrow loads are fine. Access to structs passed as arguments to a BPF program are also treated as scalars, therefore no adjustment is needed in their case. Existing verifier selftests are broken by this change, but because they were incorrect. Verifier tests for d_path were performing narrow load into context to obtain path pointer, had this program actually run it would cause a crash. The same holds for verifier_btf_ctx_access tests. [0]: https://lore.kernel.org/bpf/51338.1732985814@localhost Fixes: 9e15db66136a ("bpf: Implement accurate raw_tp context access via BTF") Reported-by: Robert Morris Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20241212092050.3204165-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 6 ++++++ tools/testing/selftests/bpf/progs/verifier_btf_ctx_access.c | 4 ++-- tools/testing/selftests/bpf/progs/verifier_d_path.c | 4 ++-- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index e7a59e6462a93..a63a03582f02d 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6543,6 +6543,12 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, return false; } + if (size != sizeof(u64)) { + bpf_log(log, "func '%s' size %d must be 8\n", + tname, size); + return false; + } + /* check for PTR_TO_RDONLY_BUF_OR_NULL or PTR_TO_RDWR_BUF_OR_NULL */ for (i = 0; i < prog->aux->ctx_arg_info_size; i++) { const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i]; diff --git a/tools/testing/selftests/bpf/progs/verifier_btf_ctx_access.c b/tools/testing/selftests/bpf/progs/verifier_btf_ctx_access.c index a570e48b917ac..bfc3bf18fed4f 100644 --- a/tools/testing/selftests/bpf/progs/verifier_btf_ctx_access.c +++ b/tools/testing/selftests/bpf/progs/verifier_btf_ctx_access.c @@ -11,7 +11,7 @@ __success __retval(0) __naked void btf_ctx_access_accept(void) { asm volatile (" \ - r2 = *(u32*)(r1 + 8); /* load 2nd argument value (int pointer) */\ + r2 = *(u64 *)(r1 + 8); /* load 2nd argument value (int pointer) */\ r0 = 0; \ exit; \ " ::: __clobber_all); @@ -23,7 +23,7 @@ __success __retval(0) __naked void ctx_access_u32_pointer_accept(void) { asm volatile (" \ - r2 = *(u32*)(r1 + 0); /* load 1nd argument value (u32 pointer) */\ + r2 = *(u64 *)(r1 + 0); /* load 1nd argument value (u32 pointer) */\ r0 = 0; \ exit; \ " ::: __clobber_all); diff --git a/tools/testing/selftests/bpf/progs/verifier_d_path.c b/tools/testing/selftests/bpf/progs/verifier_d_path.c index ec79cbcfde91e..87e51a215558f 100644 --- a/tools/testing/selftests/bpf/progs/verifier_d_path.c +++ b/tools/testing/selftests/bpf/progs/verifier_d_path.c @@ -11,7 +11,7 @@ __success __retval(0) __naked void d_path_accept(void) { asm volatile (" \ - r1 = *(u32*)(r1 + 0); \ + r1 = *(u64 *)(r1 + 0); \ r2 = r10; \ r2 += -8; \ r6 = 0; \ @@ -31,7 +31,7 @@ __failure __msg("helper call is not allowed in probe") __naked void d_path_reject(void) { asm volatile (" \ - r1 = *(u32*)(r1 + 0); \ + r1 = *(u64 *)(r1 + 0); \ r2 = r10; \ r2 += -8; \ r6 = 0; \ From 8025731c28beb4700dc801a1ca4504d1f78bac27 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 12 Dec 2024 01:20:50 -0800 Subject: [PATCH 206/330] selftests/bpf: Add test for narrow ctx load for pointer args Ensure that performing narrow ctx loads other than size == 8 are rejected when the argument is a pointer type. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20241212092050.3204165-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- .../bpf/progs/verifier_btf_ctx_access.c | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_btf_ctx_access.c b/tools/testing/selftests/bpf/progs/verifier_btf_ctx_access.c index bfc3bf18fed4f..28b939572cdad 100644 --- a/tools/testing/selftests/bpf/progs/verifier_btf_ctx_access.c +++ b/tools/testing/selftests/bpf/progs/verifier_btf_ctx_access.c @@ -29,4 +29,40 @@ __naked void ctx_access_u32_pointer_accept(void) " ::: __clobber_all); } +SEC("fentry/bpf_fentry_test9") +__description("btf_ctx_access u32 pointer reject u32") +__failure __msg("size 4 must be 8") +__naked void ctx_access_u32_pointer_reject_32(void) +{ + asm volatile (" \ + r2 = *(u32 *)(r1 + 0); /* load 1st argument with narrow load */\ + r0 = 0; \ + exit; \ +" ::: __clobber_all); +} + +SEC("fentry/bpf_fentry_test9") +__description("btf_ctx_access u32 pointer reject u16") +__failure __msg("size 2 must be 8") +__naked void ctx_access_u32_pointer_reject_16(void) +{ + asm volatile (" \ + r2 = *(u16 *)(r1 + 0); /* load 1st argument with narrow load */\ + r0 = 0; \ + exit; \ +" ::: __clobber_all); +} + +SEC("fentry/bpf_fentry_test9") +__description("btf_ctx_access u32 pointer reject u8") +__failure __msg("size 1 must be 8") +__naked void ctx_access_u32_pointer_reject_8(void) +{ + asm volatile (" \ + r2 = *(u8 *)(r1 + 0); /* load 1st argument with narrow load */\ + r0 = 0; \ + exit; \ +" ::: __clobber_all); +} + char _license[] SEC("license") = "GPL"; From a440a28ddbdcb861150987b4d6e828631656b92f Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:24 -0800 Subject: [PATCH 207/330] xfs: fix off-by-one error in fsmap's end_daddr usage In commit ca6448aed4f10a, we created an "end_daddr" variable to fix fsmap reporting when the end of the range requested falls in the middle of an unknown (aka free on the rmapbt) region. Unfortunately, I didn't notice that the the code sets end_daddr to the last sector of the device but then uses that quantity to compute the length of the synthesized mapping. Zizhi Wo later observed that when end_daddr isn't set, we still don't report the last fsblock on a device because in that case (aka when info->last is true), the info->high mapping that we pass to xfs_getfsmap_group_helper has a startblock that points to the last fsblock. This is also wrong because the code uses startblock to compute the length of the synthesized mapping. Fix the second problem by setting end_daddr unconditionally, and fix the first problem by setting start_daddr to one past the end of the range to query. Cc: # v6.11 Fixes: ca6448aed4f10a ("xfs: Fix missing interval for missing_owner in xfs fsmap") Signed-off-by: "Darrick J. Wong" Reported-by: Zizhi Wo Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_fsmap.c | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 82f2e0dd22499..3290dd8524a69 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -163,7 +163,8 @@ struct xfs_getfsmap_info { xfs_daddr_t next_daddr; /* next daddr we expect */ /* daddr of low fsmap key when we're using the rtbitmap */ xfs_daddr_t low_daddr; - xfs_daddr_t end_daddr; /* daddr of high fsmap key */ + /* daddr of high fsmap key, or the last daddr on the device */ + xfs_daddr_t end_daddr; u64 missing_owner; /* owner of holes */ u32 dev; /* device id */ /* @@ -387,8 +388,8 @@ xfs_getfsmap_group_helper( * we calculated from userspace's high key to synthesize the record. * Note that if the btree query found a mapping, there won't be a gap. */ - if (info->last && info->end_daddr != XFS_BUF_DADDR_NULL) - frec->start_daddr = info->end_daddr; + if (info->last) + frec->start_daddr = info->end_daddr + 1; else frec->start_daddr = xfs_gbno_to_daddr(xg, startblock); @@ -736,11 +737,10 @@ xfs_getfsmap_rtdev_rtbitmap_helper( * we calculated from userspace's high key to synthesize the record. * Note that if the btree query found a mapping, there won't be a gap. */ - if (info->last && info->end_daddr != XFS_BUF_DADDR_NULL) { - frec.start_daddr = info->end_daddr; - } else { + if (info->last) + frec.start_daddr = info->end_daddr + 1; + else frec.start_daddr = xfs_rtb_to_daddr(mp, start_rtb); - } frec.len_daddr = XFS_FSB_TO_BB(mp, rtbcount); return xfs_getfsmap_helper(tp, info, &frec); @@ -933,7 +933,10 @@ xfs_getfsmap( struct xfs_trans *tp = NULL; struct xfs_fsmap dkeys[2]; /* per-dev keys */ struct xfs_getfsmap_dev handlers[XFS_GETFSMAP_DEVS]; - struct xfs_getfsmap_info info = { NULL }; + struct xfs_getfsmap_info info = { + .fsmap_recs = fsmap_recs, + .head = head, + }; bool use_rmap; int i; int error = 0; @@ -998,9 +1001,6 @@ xfs_getfsmap( info.next_daddr = head->fmh_keys[0].fmr_physical + head->fmh_keys[0].fmr_length; - info.end_daddr = XFS_BUF_DADDR_NULL; - info.fsmap_recs = fsmap_recs; - info.head = head; /* For each device we support... */ for (i = 0; i < XFS_GETFSMAP_DEVS; i++) { @@ -1013,17 +1013,23 @@ xfs_getfsmap( break; /* - * If this device number matches the high key, we have - * to pass the high key to the handler to limit the - * query results. If the device number exceeds the - * low key, zero out the low key so that we get - * everything from the beginning. + * If this device number matches the high key, we have to pass + * the high key to the handler to limit the query results, and + * set the end_daddr so that we can synthesize records at the + * end of the query range or device. */ if (handlers[i].dev == head->fmh_keys[1].fmr_device) { dkeys[1] = head->fmh_keys[1]; info.end_daddr = min(handlers[i].nr_sectors - 1, dkeys[1].fmr_physical); + } else { + info.end_daddr = handlers[i].nr_sectors - 1; } + + /* + * If the device number exceeds the low key, zero out the low + * key so that we get everything from the beginning. + */ if (handlers[i].dev > head->fmh_keys[0].fmr_device) memset(&dkeys[0], 0, sizeof(struct xfs_fsmap)); From 9b7280010366dbe32791acd498a37dc522f568db Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:24 -0800 Subject: [PATCH 208/330] xfs: metapath scrubber should use the already loaded inodes Don't waste time in xchk_setup_metapath_dqinode doing a second lookup of the quota inodes, just grab them from the quotainfo structure. The whole point of this scrubber is to make sure that the dirents exist, so it's completely silly to do lookups. Cc: # v6.13-rc1 Fixes: 128a055291ebbc ("xfs: scrub quota file metapaths") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/metapath.c | 41 +++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/fs/xfs/scrub/metapath.c b/fs/xfs/scrub/metapath.c index b78db65134651..80467d6bc7638 100644 --- a/fs/xfs/scrub/metapath.c +++ b/fs/xfs/scrub/metapath.c @@ -196,36 +196,45 @@ xchk_setup_metapath_dqinode( struct xfs_scrub *sc, xfs_dqtype_t type) { + struct xfs_quotainfo *qi = sc->mp->m_quotainfo; struct xfs_trans *tp = NULL; struct xfs_inode *dp = NULL; struct xfs_inode *ip = NULL; - const char *path; int error; + if (!qi) + return -ENOENT; + + switch (type) { + case XFS_DQTYPE_USER: + ip = qi->qi_uquotaip; + break; + case XFS_DQTYPE_GROUP: + ip = qi->qi_gquotaip; + break; + case XFS_DQTYPE_PROJ: + ip = qi->qi_pquotaip; + break; + default: + ASSERT(0); + return -EINVAL; + } + if (!ip) + return -ENOENT; + error = xfs_trans_alloc_empty(sc->mp, &tp); if (error) return error; error = xfs_dqinode_load_parent(tp, &dp); - if (error) - goto out_cancel; - - error = xfs_dqinode_load(tp, dp, type, &ip); - if (error) - goto out_dp; - xfs_trans_cancel(tp); - tp = NULL; + if (error) + return error; - path = kasprintf(GFP_KERNEL, "%s", xfs_dqinode_path(type)); - error = xchk_setup_metapath_scan(sc, dp, path, ip); + error = xchk_setup_metapath_scan(sc, dp, + kstrdup(xfs_dqinode_path(type), GFP_KERNEL), ip); - xfs_irele(ip); -out_dp: xfs_irele(dp); -out_cancel: - if (tp) - xfs_trans_cancel(tp); return error; } #else From e1d8602b6cfb757952827d11c7d26f2a1714fe82 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:25 -0800 Subject: [PATCH 209/330] xfs: keep quota directory inode loaded In the same vein as the previous patch, there's no point in the metapath scrub setup function doing a lookup on the quota metadir just so it can validate that lookups work correctly. Instead, retain the quota directory inode in memory for the lifetime of the mount so that we can check this meaningfully. Cc: # v6.13-rc1 Fixes: 128a055291ebbc ("xfs: scrub quota file metapaths") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/metapath.c | 37 ++++++-------------------------- fs/xfs/xfs_qm.c | 47 ++++++++++++++++++++++------------------- fs/xfs/xfs_qm.h | 1 + 3 files changed, 32 insertions(+), 53 deletions(-) diff --git a/fs/xfs/scrub/metapath.c b/fs/xfs/scrub/metapath.c index 80467d6bc7638..c678cba1ffc3f 100644 --- a/fs/xfs/scrub/metapath.c +++ b/fs/xfs/scrub/metapath.c @@ -171,23 +171,13 @@ static int xchk_setup_metapath_quotadir( struct xfs_scrub *sc) { - struct xfs_trans *tp; - struct xfs_inode *dp = NULL; - int error; - - error = xfs_trans_alloc_empty(sc->mp, &tp); - if (error) - return error; + struct xfs_quotainfo *qi = sc->mp->m_quotainfo; - error = xfs_dqinode_load_parent(tp, &dp); - xfs_trans_cancel(tp); - if (error) - return error; + if (!qi || !qi->qi_dirip) + return -ENOENT; - error = xchk_setup_metapath_scan(sc, sc->mp->m_metadirip, - kasprintf(GFP_KERNEL, "quota"), dp); - xfs_irele(dp); - return error; + return xchk_setup_metapath_scan(sc, sc->mp->m_metadirip, + kstrdup("quota", GFP_KERNEL), qi->qi_dirip); } /* Scan a quota inode under the /quota directory. */ @@ -197,10 +187,7 @@ xchk_setup_metapath_dqinode( xfs_dqtype_t type) { struct xfs_quotainfo *qi = sc->mp->m_quotainfo; - struct xfs_trans *tp = NULL; - struct xfs_inode *dp = NULL; struct xfs_inode *ip = NULL; - int error; if (!qi) return -ENOENT; @@ -222,20 +209,8 @@ xchk_setup_metapath_dqinode( if (!ip) return -ENOENT; - error = xfs_trans_alloc_empty(sc->mp, &tp); - if (error) - return error; - - error = xfs_dqinode_load_parent(tp, &dp); - xfs_trans_cancel(tp); - if (error) - return error; - - error = xchk_setup_metapath_scan(sc, dp, + return xchk_setup_metapath_scan(sc, qi->qi_dirip, kstrdup(xfs_dqinode_path(type), GFP_KERNEL), ip); - - xfs_irele(dp); - return error; } #else # define xchk_setup_metapath_quotadir(...) (-ENOENT) diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 61ee110b47d7d..3c0189831f14d 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -241,6 +241,10 @@ xfs_qm_destroy_quotainos( xfs_irele(qi->qi_pquotaip); qi->qi_pquotaip = NULL; } + if (qi->qi_dirip) { + xfs_irele(qi->qi_dirip); + qi->qi_dirip = NULL; + } } /* @@ -646,8 +650,7 @@ xfs_qm_init_timelimits( static int xfs_qm_load_metadir_qinos( struct xfs_mount *mp, - struct xfs_quotainfo *qi, - struct xfs_inode **dpp) + struct xfs_quotainfo *qi) { struct xfs_trans *tp; int error; @@ -656,7 +659,7 @@ xfs_qm_load_metadir_qinos( if (error) return error; - error = xfs_dqinode_load_parent(tp, dpp); + error = xfs_dqinode_load_parent(tp, &qi->qi_dirip); if (error == -ENOENT) { /* no quota dir directory, but we'll create one later */ error = 0; @@ -666,21 +669,21 @@ xfs_qm_load_metadir_qinos( goto out_trans; if (XFS_IS_UQUOTA_ON(mp)) { - error = xfs_dqinode_load(tp, *dpp, XFS_DQTYPE_USER, + error = xfs_dqinode_load(tp, qi->qi_dirip, XFS_DQTYPE_USER, &qi->qi_uquotaip); if (error && error != -ENOENT) goto out_trans; } if (XFS_IS_GQUOTA_ON(mp)) { - error = xfs_dqinode_load(tp, *dpp, XFS_DQTYPE_GROUP, + error = xfs_dqinode_load(tp, qi->qi_dirip, XFS_DQTYPE_GROUP, &qi->qi_gquotaip); if (error && error != -ENOENT) goto out_trans; } if (XFS_IS_PQUOTA_ON(mp)) { - error = xfs_dqinode_load(tp, *dpp, XFS_DQTYPE_PROJ, + error = xfs_dqinode_load(tp, qi->qi_dirip, XFS_DQTYPE_PROJ, &qi->qi_pquotaip); if (error && error != -ENOENT) goto out_trans; @@ -696,34 +699,33 @@ xfs_qm_load_metadir_qinos( STATIC int xfs_qm_create_metadir_qinos( struct xfs_mount *mp, - struct xfs_quotainfo *qi, - struct xfs_inode **dpp) + struct xfs_quotainfo *qi) { int error; - if (!*dpp) { - error = xfs_dqinode_mkdir_parent(mp, dpp); + if (!qi->qi_dirip) { + error = xfs_dqinode_mkdir_parent(mp, &qi->qi_dirip); if (error && error != -EEXIST) return error; } if (XFS_IS_UQUOTA_ON(mp) && !qi->qi_uquotaip) { - error = xfs_dqinode_metadir_create(*dpp, XFS_DQTYPE_USER, - &qi->qi_uquotaip); + error = xfs_dqinode_metadir_create(qi->qi_dirip, + XFS_DQTYPE_USER, &qi->qi_uquotaip); if (error) return error; } if (XFS_IS_GQUOTA_ON(mp) && !qi->qi_gquotaip) { - error = xfs_dqinode_metadir_create(*dpp, XFS_DQTYPE_GROUP, - &qi->qi_gquotaip); + error = xfs_dqinode_metadir_create(qi->qi_dirip, + XFS_DQTYPE_GROUP, &qi->qi_gquotaip); if (error) return error; } if (XFS_IS_PQUOTA_ON(mp) && !qi->qi_pquotaip) { - error = xfs_dqinode_metadir_create(*dpp, XFS_DQTYPE_PROJ, - &qi->qi_pquotaip); + error = xfs_dqinode_metadir_create(qi->qi_dirip, + XFS_DQTYPE_PROJ, &qi->qi_pquotaip); if (error) return error; } @@ -768,7 +770,6 @@ xfs_qm_init_metadir_qinos( struct xfs_mount *mp) { struct xfs_quotainfo *qi = mp->m_quotainfo; - struct xfs_inode *dp = NULL; int error; if (!xfs_has_quota(mp)) { @@ -777,20 +778,22 @@ xfs_qm_init_metadir_qinos( return error; } - error = xfs_qm_load_metadir_qinos(mp, qi, &dp); + error = xfs_qm_load_metadir_qinos(mp, qi); if (error) goto out_err; - error = xfs_qm_create_metadir_qinos(mp, qi, &dp); + error = xfs_qm_create_metadir_qinos(mp, qi); if (error) goto out_err; - xfs_irele(dp); + /* The only user of the quota dir inode is online fsck */ +#if !IS_ENABLED(CONFIG_XFS_ONLINE_SCRUB) + xfs_irele(qi->qi_dirip); + qi->qi_dirip = NULL; +#endif return 0; out_err: xfs_qm_destroy_quotainos(mp->m_quotainfo); - if (dp) - xfs_irele(dp); return error; } diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index e919c7f62f578..35b64bc3a7a86 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -55,6 +55,7 @@ struct xfs_quotainfo { struct xfs_inode *qi_uquotaip; /* user quota inode */ struct xfs_inode *qi_gquotaip; /* group quota inode */ struct xfs_inode *qi_pquotaip; /* project quota inode */ + struct xfs_inode *qi_dirip; /* quota metadir */ struct list_lru qi_lru; int qi_dquots; struct mutex qi_quotaofflock;/* to serialize quotaoff */ From bd27c7bcdca25ce8067ebb94ded6ac1bd7b47317 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:26 -0800 Subject: [PATCH 210/330] xfs: return a 64-bit block count from xfs_btree_count_blocks With the nrext64 feature enabled, it's possible for a data fork to have 2^48 extent mappings. Even with a 64k fsblock size, that maps out to a bmbt containing more than 2^32 blocks. Therefore, this predicate must return a u64 count to avoid an integer wraparound that will cause scrub to do the wrong thing. It's unlikely that any such filesystem currently exists, because the incore bmbt would consume more than 64GB of kernel memory on its own, and so far nobody except me has driven a filesystem that far, judging from the lack of complaints. Cc: # v5.19 Fixes: df9ad5cc7a5240 ("xfs: Introduce macros to represent new maximum extent counts for data/attr forks") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_btree.c | 4 ++-- fs/xfs/libxfs/xfs_btree.h | 2 +- fs/xfs/libxfs/xfs_ialloc_btree.c | 4 +++- fs/xfs/scrub/agheader.c | 6 +++--- fs/xfs/scrub/agheader_repair.c | 6 +++--- fs/xfs/scrub/fscounters.c | 2 +- fs/xfs/scrub/ialloc.c | 4 ++-- fs/xfs/scrub/refcount.c | 2 +- fs/xfs/xfs_bmap_util.c | 2 +- 9 files changed, 17 insertions(+), 15 deletions(-) diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 2b5fc5fd16435..c748866ef9236 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -5144,7 +5144,7 @@ xfs_btree_count_blocks_helper( int level, void *data) { - xfs_extlen_t *blocks = data; + xfs_filblks_t *blocks = data; (*blocks)++; return 0; @@ -5154,7 +5154,7 @@ xfs_btree_count_blocks_helper( int xfs_btree_count_blocks( struct xfs_btree_cur *cur, - xfs_extlen_t *blocks) + xfs_filblks_t *blocks) { *blocks = 0; return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper, diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 3b739459ebb0f..c5bff273cae25 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -484,7 +484,7 @@ typedef int (*xfs_btree_visit_blocks_fn)(struct xfs_btree_cur *cur, int level, int xfs_btree_visit_blocks(struct xfs_btree_cur *cur, xfs_btree_visit_blocks_fn fn, unsigned int flags, void *data); -int xfs_btree_count_blocks(struct xfs_btree_cur *cur, xfs_extlen_t *blocks); +int xfs_btree_count_blocks(struct xfs_btree_cur *cur, xfs_filblks_t *blocks); union xfs_btree_rec *xfs_btree_rec_addr(struct xfs_btree_cur *cur, int n, struct xfs_btree_block *block); diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 9b34896dd1a32..6f270d8f4270c 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -744,6 +744,7 @@ xfs_finobt_count_blocks( { struct xfs_buf *agbp = NULL; struct xfs_btree_cur *cur; + xfs_filblks_t blocks; int error; error = xfs_ialloc_read_agi(pag, tp, 0, &agbp); @@ -751,9 +752,10 @@ xfs_finobt_count_blocks( return error; cur = xfs_finobt_init_cursor(pag, tp, agbp); - error = xfs_btree_count_blocks(cur, tree_blocks); + error = xfs_btree_count_blocks(cur, &blocks); xfs_btree_del_cursor(cur, error); xfs_trans_brelse(tp, agbp); + *tree_blocks = blocks; return error; } diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index 61f80a6410c73..1d41b85478da9 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -458,7 +458,7 @@ xchk_agf_xref_btreeblks( { struct xfs_agf *agf = sc->sa.agf_bp->b_addr; struct xfs_mount *mp = sc->mp; - xfs_agblock_t blocks; + xfs_filblks_t blocks; xfs_agblock_t btreeblks; int error; @@ -507,7 +507,7 @@ xchk_agf_xref_refcblks( struct xfs_scrub *sc) { struct xfs_agf *agf = sc->sa.agf_bp->b_addr; - xfs_agblock_t blocks; + xfs_filblks_t blocks; int error; if (!sc->sa.refc_cur) @@ -840,7 +840,7 @@ xchk_agi_xref_fiblocks( struct xfs_scrub *sc) { struct xfs_agi *agi = sc->sa.agi_bp->b_addr; - xfs_agblock_t blocks; + xfs_filblks_t blocks; int error = 0; if (!xfs_has_inobtcounts(sc->mp)) diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 0fad0baaba2f6..b45d2b32051a6 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -256,7 +256,7 @@ xrep_agf_calc_from_btrees( struct xfs_agf *agf = agf_bp->b_addr; struct xfs_mount *mp = sc->mp; xfs_agblock_t btreeblks; - xfs_agblock_t blocks; + xfs_filblks_t blocks; int error; /* Update the AGF counters from the bnobt. */ @@ -946,7 +946,7 @@ xrep_agi_calc_from_btrees( if (error) goto err; if (xfs_has_inobtcounts(mp)) { - xfs_agblock_t blocks; + xfs_filblks_t blocks; error = xfs_btree_count_blocks(cur, &blocks); if (error) @@ -959,7 +959,7 @@ xrep_agi_calc_from_btrees( agi->agi_freecount = cpu_to_be32(freecount); if (xfs_has_finobt(mp) && xfs_has_inobtcounts(mp)) { - xfs_agblock_t blocks; + xfs_filblks_t blocks; cur = xfs_finobt_init_cursor(sc->sa.pag, sc->tp, agi_bp); error = xfs_btree_count_blocks(cur, &blocks); diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index 4a50f8e000409..ca23cf4db6c5e 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -261,7 +261,7 @@ xchk_fscount_btreeblks( struct xchk_fscounters *fsc, xfs_agnumber_t agno) { - xfs_extlen_t blocks; + xfs_filblks_t blocks; int error; error = xchk_ag_init_existing(sc, agno, &sc->sa); diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index abad54c3621d4..4dc7c83dc08a4 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -650,8 +650,8 @@ xchk_iallocbt_xref_rmap_btreeblks( struct xfs_scrub *sc) { xfs_filblks_t blocks; - xfs_extlen_t inobt_blocks = 0; - xfs_extlen_t finobt_blocks = 0; + xfs_filblks_t inobt_blocks = 0; + xfs_filblks_t finobt_blocks = 0; int error; if (!sc->sa.ino_cur || !sc->sa.rmap_cur || diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index 2b6be75e94241..1c5e45cc64190 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -491,7 +491,7 @@ xchk_refcount_xref_rmap( struct xfs_scrub *sc, xfs_filblks_t cow_blocks) { - xfs_extlen_t refcbt_blocks = 0; + xfs_filblks_t refcbt_blocks = 0; xfs_filblks_t blocks; int error; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index a59bbe767a7dc..0836fea2d6d81 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -103,7 +103,7 @@ xfs_bmap_count_blocks( struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_cur *cur; - xfs_extlen_t btblocks = 0; + xfs_filblks_t btblocks = 0; int error; *nextents = 0; From 7ce31f20a0771d71779c3b0ec9cdf474cc3c8e9a Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:27 -0800 Subject: [PATCH 211/330] xfs: don't drop errno values when we fail to ficlone the entire range Way back when we first implemented FICLONE for XFS, life was simple -- either the the entire remapping completed, or something happened and we had to return an errno explaining what happened. Neither of those ioctls support returning partial results, so it's all or nothing. Then things got complicated when copy_file_range came along, because it actually can return the number of bytes copied, so commit 3f68c1f562f1e4 tried to make it so that we could return a partial result if the REMAP_FILE_CAN_SHORTEN flag is set. This is also how FIDEDUPERANGE can indicate that the kernel performed a partial deduplication. Unfortunately, the logic is wrong if an error stops the remapping and CAN_SHORTEN is not set. Because those callers cannot return partial results, it is an error for ->remap_file_range to return a positive quantity that is less than the @len passed in. Implementations really should be returning a negative errno in this case, because that's what btrfs (which introduced FICLONE{,RANGE}) did. Therefore, ->remap_range implementations cannot silently drop an errno that they might have when the number of bytes remapped is less than the number of bytes requested and CAN_SHORTEN is not set. Found by running generic/562 on a 64k fsblock filesystem and wondering why it reported corrupt files. Cc: # v4.20 Fixes: 3fc9f5e409319e ("xfs: remove xfs_reflink_remap_range") Really-Fixes: 3f68c1f562f1e4 ("xfs: support returning partial reflink results") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_file.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 4a0b7de4f7aed..9a435b1ff2647 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1242,6 +1242,14 @@ xfs_file_remap_range( xfs_iunlock2_remapping(src, dest); if (ret) trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); + /* + * If the caller did not set CAN_SHORTEN, then it is not prepared to + * handle partial results -- either the whole remap succeeds, or we + * must say why it did not. In this case, any error should be returned + * to the caller. + */ + if (ret && remapped < len && !(remap_flags & REMAP_FILE_CAN_SHORTEN)) + return ret; return remapped > 0 ? remapped : ret; } From aa7bfb537edf62085d7718845f6644b0e4efb9df Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:27 -0800 Subject: [PATCH 212/330] xfs: separate healthy clearing mask during repair In commit d9041681dd2f53 we introduced some XFS_SICK_*ZAPPED flags so that the inode record repair code could clean up a damaged inode record enough to iget the inode but still be able to remember that the higher level repair code needs to be called. As part of that, we introduced a xchk_mark_healthy_if_clean helper that is supposed to cause the ZAPPED state to be removed if that higher level metadata actually checks out. This was done by setting additional bits in sick_mask hoping that xchk_update_health will clear all those bits after a healthy scrub. Unfortunately, that's not quite what sick_mask means -- bits in that mask are indeed cleared if the metadata is healthy, but they're set if the metadata is NOT healthy. fsck is only intended to set the ZAPPED bits explicitly. If something else sets the CORRUPT/XCORRUPT state after the xchk_mark_healthy_if_clean call, we end up marking the metadata zapped. This can happen if the following sequence happens: 1. Scrub runs, discovers that the metadata is fine but could be optimized and calls xchk_mark_healthy_if_clean on a ZAPPED flag. That causes the ZAPPED flag to be set in sick_mask because the metadata is not CORRUPT or XCORRUPT. 2. Repair runs to optimize the metadata. 3. Some other metadata used for cross-referencing in (1) becomes corrupt. 4. Post-repair scrub runs, but this time it sets CORRUPT or XCORRUPT due to the events in (3). 5. Now the xchk_health_update sets the ZAPPED flag on the metadata we just repaired. This is not the correct state. Fix this by moving the "if healthy" mask to a separate field, and only ever using it to clear the sick state. Cc: # v6.8 Fixes: d9041681dd2f53 ("xfs: set inode sick state flags when we zap either ondisk fork") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/health.c | 57 +++++++++++++++++++++++++------------------ fs/xfs/scrub/scrub.h | 6 +++++ 2 files changed, 39 insertions(+), 24 deletions(-) diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c index ce86bdad37fa4..ccc6ca5934ca6 100644 --- a/fs/xfs/scrub/health.c +++ b/fs/xfs/scrub/health.c @@ -71,7 +71,8 @@ /* Map our scrub type to a sick mask and a set of health update functions. */ enum xchk_health_group { - XHG_FS = 1, + XHG_NONE = 1, + XHG_FS, XHG_AG, XHG_INO, XHG_RTGROUP, @@ -83,6 +84,7 @@ struct xchk_health_map { }; static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = { + [XFS_SCRUB_TYPE_PROBE] = { XHG_NONE, 0 }, [XFS_SCRUB_TYPE_SB] = { XHG_AG, XFS_SICK_AG_SB }, [XFS_SCRUB_TYPE_AGF] = { XHG_AG, XFS_SICK_AG_AGF }, [XFS_SCRUB_TYPE_AGFL] = { XHG_AG, XFS_SICK_AG_AGFL }, @@ -133,7 +135,7 @@ xchk_mark_healthy_if_clean( { if (!(sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | XFS_SCRUB_OFLAG_XCORRUPT))) - sc->sick_mask |= mask; + sc->healthy_mask |= mask; } /* @@ -189,6 +191,7 @@ xchk_update_health( { struct xfs_perag *pag; struct xfs_rtgroup *rtg; + unsigned int mask = sc->sick_mask; bool bad; /* @@ -203,50 +206,56 @@ xchk_update_health( return; } - if (!sc->sick_mask) - return; - bad = (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | XFS_SCRUB_OFLAG_XCORRUPT)); + if (!bad) + mask |= sc->healthy_mask; switch (type_to_health_flag[sc->sm->sm_type].group) { + case XHG_NONE: + break; case XHG_AG: + if (!mask) + return; pag = xfs_perag_get(sc->mp, sc->sm->sm_agno); if (bad) - xfs_group_mark_corrupt(pag_group(pag), sc->sick_mask); + xfs_group_mark_corrupt(pag_group(pag), mask); else - xfs_group_mark_healthy(pag_group(pag), sc->sick_mask); + xfs_group_mark_healthy(pag_group(pag), mask); xfs_perag_put(pag); break; case XHG_INO: if (!sc->ip) return; - if (bad) { - unsigned int mask = sc->sick_mask; - - /* - * If we're coming in for repairs then we don't want - * sickness flags to propagate to the incore health - * status if the inode gets inactivated before we can - * fix it. - */ - if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) - mask |= XFS_SICK_INO_FORGET; + /* + * If we're coming in for repairs then we don't want sickness + * flags to propagate to the incore health status if the inode + * gets inactivated before we can fix it. + */ + if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) + mask |= XFS_SICK_INO_FORGET; + if (!mask) + return; + if (bad) xfs_inode_mark_corrupt(sc->ip, mask); - } else - xfs_inode_mark_healthy(sc->ip, sc->sick_mask); + else + xfs_inode_mark_healthy(sc->ip, mask); break; case XHG_FS: + if (!mask) + return; if (bad) - xfs_fs_mark_corrupt(sc->mp, sc->sick_mask); + xfs_fs_mark_corrupt(sc->mp, mask); else - xfs_fs_mark_healthy(sc->mp, sc->sick_mask); + xfs_fs_mark_healthy(sc->mp, mask); break; case XHG_RTGROUP: + if (!mask) + return; rtg = xfs_rtgroup_get(sc->mp, sc->sm->sm_agno); if (bad) - xfs_group_mark_corrupt(rtg_group(rtg), sc->sick_mask); + xfs_group_mark_corrupt(rtg_group(rtg), mask); else - xfs_group_mark_healthy(rtg_group(rtg), sc->sick_mask); + xfs_group_mark_healthy(rtg_group(rtg), mask); xfs_rtgroup_put(rtg); break; default: diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index a7fda3e2b0137..5dbbe93cb49bf 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -184,6 +184,12 @@ struct xfs_scrub { */ unsigned int sick_mask; + /* + * Clear these XFS_SICK_* flags but only if the scan is ok. Useful for + * removing ZAPPED flags after a repair. + */ + unsigned int healthy_mask; + /* next time we want to cond_resched() */ struct xchk_relax relax; From 6f4669708a69fd21f0299c2d5c4780a6ce358ab5 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:28 -0800 Subject: [PATCH 213/330] xfs: set XFS_SICK_INO_SYMLINK_ZAPPED explicitly when zapping a symlink If we need to reset a symlink target to the "durr it's busted" string, then we clear the zapped flag as well. However, this should be using the provided helper so that we don't set the zapped state on an otherwise ok symlink. Cc: # v6.10 Fixes: 2651923d8d8db0 ("xfs: online repair of symbolic links") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/symlink_repair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/xfs/scrub/symlink_repair.c b/fs/xfs/scrub/symlink_repair.c index d015a86ef460f..953ce7be78dc2 100644 --- a/fs/xfs/scrub/symlink_repair.c +++ b/fs/xfs/scrub/symlink_repair.c @@ -36,6 +36,7 @@ #include "scrub/tempfile.h" #include "scrub/tempexch.h" #include "scrub/reap.h" +#include "scrub/health.h" /* * Symbolic Link Repair @@ -233,7 +234,7 @@ xrep_symlink_salvage( * target zapped flag. */ if (buflen == 0) { - sc->sick_mask |= XFS_SICK_INO_SYMLINK_ZAPPED; + xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_SYMLINK_ZAPPED); sprintf(target_buf, DUMMY_TARGET); } From dc5a0527398d42e4d3e47abe8a43960fca0314ed Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:29 -0800 Subject: [PATCH 214/330] xfs: mark metadir repair tempfiles with IRECOVERY Once in a long while, xfs/566 and xfs/801 report directory corruption in one of the metadata subdirectories while it's forcibly rebuilding all filesystem metadata. I observed the following sequence of events: 1. Initiate a repair of the parent pointers for the /quota/user file. This is the secret file containing user quota data. 2. The pptr repair thread creates a temporary file and begins staging parent pointers in the ondisk metadata in preparation for an exchange-range to commit the new pptr data. 3. At the same time, initiate a repair of the /quota directory itself. 4. The dir repair thread finds the temporary file from (2), scans it for parent pointers, and stages a dirent in its own temporary dir in preparation to commit the fixed directory. 5. The parent pointer repair completes and frees the temporary file. 6. The dir repair commits the new directory and scans it again. It finds the dirent that points to the old temporary file in (2) and marks the directory corrupt. Oops! Repair code must never scan the temporary files that other repair functions create to stage new metadata. They're not supposed to do that, but the predicate function xrep_is_tempfile is incorrect because it assumes that any XFS_DIFLAG2_METADATA file cannot ever be a temporary file, but xrep_tempfile_adjust_directory_tree creates exactly that. Fix this by setting the IRECOVERY flag on temporary metadata directory inodes and using that to correct the predicate. Repair code is supposed to erase all the data in temporary files before releasing them, so it's ok if a thread scans the temporary file after we drop IRECOVERY. Cc: # v6.13-rc1 Fixes: bb6cdd5529ff67 ("xfs: hide metadata inodes from everyone because they are special") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/tempfile.c | 10 ++++++++-- fs/xfs/xfs_inode.h | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/xfs/scrub/tempfile.c b/fs/xfs/scrub/tempfile.c index 4b7f7860e37ec..dc3802c7f678c 100644 --- a/fs/xfs/scrub/tempfile.c +++ b/fs/xfs/scrub/tempfile.c @@ -223,6 +223,7 @@ xrep_tempfile_adjust_directory_tree( if (error) goto out_ilock; + xfs_iflags_set(sc->tempip, XFS_IRECOVERY); xfs_qm_dqdetach(sc->tempip); out_ilock: xrep_tempfile_iunlock(sc); @@ -246,6 +247,8 @@ xrep_tempfile_remove_metadir( ASSERT(sc->tp == NULL); + xfs_iflags_clear(sc->tempip, XFS_IRECOVERY); + xfs_ilock(sc->tempip, XFS_IOLOCK_EXCL); sc->temp_ilock_flags |= XFS_IOLOCK_EXCL; @@ -945,10 +948,13 @@ xrep_is_tempfile( /* * Files in the metadata directory tree also have S_PRIVATE set and - * IOP_XATTR unset, so we must distinguish them separately. + * IOP_XATTR unset, so we must distinguish them separately. We (ab)use + * the IRECOVERY flag to mark temporary metadir inodes knowing that the + * end of log recovery clears IRECOVERY, so the only ones that can + * exist during online repair are the ones we create. */ if (xfs_has_metadir(mp) && (ip->i_diflags2 & XFS_DIFLAG2_METADATA)) - return false; + return __xfs_iflags_test(ip, XFS_IRECOVERY); if (IS_PRIVATE(inode) && !(inode->i_opflags & IOP_XATTR)) return true; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index b0de3d924d4c1..1648dc5a80688 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -231,7 +231,7 @@ xfs_iflags_clear(xfs_inode_t *ip, unsigned long flags) } static inline int -__xfs_iflags_test(xfs_inode_t *ip, unsigned long flags) +__xfs_iflags_test(const struct xfs_inode *ip, unsigned long flags) { return (ip->i_flags & flags); } From af9f02457f461b23307fe826a37be61ba6e32c92 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:30 -0800 Subject: [PATCH 215/330] xfs: fix null bno_hint handling in xfs_rtallocate_rtg xfs_bmap_rtalloc initializes the bno_hint variable to NULLRTBLOCK (aka NULLFSBLOCK). If the allocation request is for a file range that's adjacent to an existing mapping, it will then change bno_hint to the blkno hint in the bmalloca structure. In other words, bno_hint is either a rt block number, or it's all 1s. Unfortunately, commit ec12f97f1b8a8f didn't take the NULLRTBLOCK state into account, which means that it tries to translate that into a realtime extent number. We then end up with an obnoxiously high rtx number and pointlessly feed that to the near allocator. This often fails and falls back to the by-size allocator. Seeing as we had no locality hint anyway, this is a waste of time. Fix the code to detect a lack of bno_hint correctly. This was detected by running xfs/009 with metadir enabled and a 28k rt extent size. Cc: # v6.12 Fixes: ec12f97f1b8a8f ("xfs: make the rtalloc start hint a xfs_rtblock_t") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_rtalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 0cb534d71119a..fcfa6e0eb3ad2 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1827,7 +1827,7 @@ xfs_rtallocate_rtg( * For an allocation to an empty file at offset 0, pick an extent that * will space things out in the rt area. */ - if (bno_hint) + if (bno_hint != NULLFSBLOCK) start = xfs_rtb_to_rtx(args.mp, bno_hint); else if (!xfs_has_rtgroups(args.mp) && initial_user_data) start = xfs_rtpick_extent(args.rtg, tp, maxlen); From 23bee6f390a12d0c4c51fefc083704bc5dac377e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:30 -0800 Subject: [PATCH 216/330] xfs: fix error bailout in xfs_rtginode_create smatch reported that we screwed up the error cleanup in this function. Fix it. Cc: # v6.13-rc1 Fixes: ae897e0bed0f54 ("xfs: support creating per-RTG files in growfs") Reported-by: Dan Carpenter Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_rtgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c index e74bb059f24fa..4f3bfc884aff2 100644 --- a/fs/xfs/libxfs/xfs_rtgroup.c +++ b/fs/xfs/libxfs/xfs_rtgroup.c @@ -496,7 +496,7 @@ xfs_rtginode_create( error = xfs_metadir_create(&upd, S_IFREG); if (error) - return error; + goto out_cancel; xfs_rtginode_lockdep_setup(upd.ip, rtg_rgno(rtg), type); From 6d7b4bc1c3e00b1a25b7a05141a64337b4629337 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:31 -0800 Subject: [PATCH 217/330] xfs: update btree keys correctly when _insrec splits an inode root block In commit 2c813ad66a72, I partially fixed a bug wherein xfs_btree_insrec would erroneously try to update the parent's key for a block that had been split if we decided to insert the new record into the new block. The solution was to detect this situation and update the in-core key value that we pass up to the caller so that the caller will (eventually) add the new block to the parent level of the tree with the correct key. However, I missed a subtlety about the way inode-rooted btrees work. If the full block was a maximally sized inode root block, we'll solve that fullness by moving the root block's records to a new block, resizing the root block, and updating the root to point to the new block. We don't pass a pointer to the new block to the caller because that work has already been done. The new record will /always/ land in the new block, so in this case we need to use xfs_btree_update_keys to update the keys. This bug can theoretically manifest itself in the very rare case that we split a bmbt root block and the new record lands in the very first slot of the new block, though I've never managed to trigger it in practice. However, it is very easy to reproduce by running generic/522 with the realtime rmapbt patchset if rtinherit=1. Cc: # v4.8 Fixes: 2c813ad66a7218 ("xfs: support btrees with overlapping intervals for keys") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_btree.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index c748866ef9236..68ee1c299c25f 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -3557,14 +3557,31 @@ xfs_btree_insrec( xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS); /* - * If we just inserted into a new tree block, we have to - * recalculate nkey here because nkey is out of date. + * Update btree keys to reflect the newly added record or keyptr. + * There are three cases here to be aware of. Normally, all we have to + * do is walk towards the root, updating keys as necessary. * - * Otherwise we're just updating an existing block (having shoved - * some records into the new tree block), so use the regular key - * update mechanism. + * If the caller had us target a full block for the insertion, we dealt + * with that by calling the _make_block_unfull function. If the + * "make unfull" function splits the block, it'll hand us back the key + * and pointer of the new block. We haven't yet added the new block to + * the next level up, so if we decide to add the new record to the new + * block (bp->b_bn != old_bn), we have to update the caller's pointer + * so that the caller adds the new block with the correct key. + * + * However, there is a third possibility-- if the selected block is the + * root block of an inode-rooted btree and cannot be expanded further, + * the "make unfull" function moves the root block contents to a new + * block and updates the root block to point to the new block. In this + * case, no block pointer is passed back because the block has already + * been added to the btree. In this case, we need to use the regular + * key update function, just like the first case. This is critical for + * overlapping btrees, because the high key must be updated to reflect + * the entire tree, not just the subtree accessible through the first + * child of the root (which is now two levels down from the root). */ - if (bp && xfs_buf_daddr(bp) != old_bn) { + if (!xfs_btree_ptr_is_null(cur, &nptr) && + bp && xfs_buf_daddr(bp) != old_bn) { xfs_btree_get_keys(cur, block, lkey); } else if (xfs_btree_needs_key_update(cur, optr)) { error = xfs_btree_update_keys(cur, level); From ffc3ea4f3c1cc83a86b7497b0c4b0aee7de5480d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:32 -0800 Subject: [PATCH 218/330] xfs: fix scrub tracepoints when inode-rooted btrees are involved Fix a minor mistakes in the scrub tracepoints that can manifest when inode-rooted btrees are enabled. The existing code worked fine for bmap btrees, but we should tighten the code up to be less sloppy. Cc: # v5.7 Fixes: 92219c292af8dd ("xfs: convert btree cursor inode-private member names") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 9b38f5ad1eaf0..d2ae7e93acb08 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -605,7 +605,7 @@ TRACE_EVENT(xchk_ifork_btree_op_error, TP_fast_assign( xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level); __entry->dev = sc->mp->m_super->s_dev; - __entry->ino = sc->ip->i_ino; + __entry->ino = cur->bc_ino.ip->i_ino; __entry->whichfork = cur->bc_ino.whichfork; __entry->type = sc->sm->sm_type; __assign_str(name); From 53b001a21c9dff73b64e8c909c41991f01d5d00f Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:33 -0800 Subject: [PATCH 219/330] xfs: unlock inodes when erroring out of xfs_trans_alloc_dir Debugging a filesystem patch with generic/475 caused the system to hang after observing the following sequences in dmesg: XFS (dm-0): metadata I/O error in "xfs_imap_to_bp+0x61/0xe0 [xfs]" at daddr 0x491520 len 32 error 5 XFS (dm-0): metadata I/O error in "xfs_btree_read_buf_block+0xba/0x160 [xfs]" at daddr 0x3445608 len 8 error 5 XFS (dm-0): metadata I/O error in "xfs_imap_to_bp+0x61/0xe0 [xfs]" at daddr 0x138e1c0 len 32 error 5 XFS (dm-0): log I/O error -5 XFS (dm-0): Metadata I/O Error (0x1) detected at xfs_trans_read_buf_map+0x1ea/0x4b0 [xfs] (fs/xfs/xfs_trans_buf.c:311). Shutting down filesystem. XFS (dm-0): Please unmount the filesystem and rectify the problem(s) XFS (dm-0): Internal error dqp->q_ino.reserved < dqp->q_ino.count at line 869 of file fs/xfs/xfs_trans_dquot.c. Caller xfs_trans_dqresv+0x236/0x440 [xfs] XFS (dm-0): Corruption detected. Unmount and run xfs_repair XFS (dm-0): Unmounting Filesystem be6bcbcc-9921-4deb-8d16-7cc94e335fa7 The system is stuck in unmount trying to lock a couple of inodes so that they can be purged. The dquot corruption notice above is a clue to what happened -- a link() call tried to set up a transaction to link a child into a directory. Quota reservation for the transaction failed after IO errors shut down the filesystem, but then we forgot to unlock the inodes on our way out. Fix that. Cc: # v6.10 Fixes: bd5562111d5839 ("xfs: Hold inode locks in xfs_trans_alloc_dir") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_trans.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 30fbed27cf05c..05b18e30368e4 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1435,5 +1435,8 @@ xfs_trans_alloc_dir( out_cancel: xfs_trans_cancel(tp); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + if (dp != ip) + xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } From 44d9b07e52db25035680713c3428016cadcd2ea1 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:33 -0800 Subject: [PATCH 220/330] xfs: only run precommits once per transaction object Committing a transaction tx0 with a defer ops chain of (A, B, C) creates a chain of transactions that looks like this: tx0 -> txA -> txB -> txC Prior to commit cb042117488dbf, __xfs_trans_commit would run precommits on tx0, then call xfs_defer_finish_noroll to convert A-C to tx[A-C]. Unfortunately, after the finish_noroll loop we forgot to run precommits on txC. That was fixed by adding the second precommit call. Unfortunately, none of us remembered that xfs_defer_finish_noroll calls __xfs_trans_commit a second time to commit tx0 before finishing work A in txA and committing that. In other words, we run precommits twice on tx0: xfs_trans_commit(tx0) __xfs_trans_commit(tx0, false) xfs_trans_run_precommits(tx0) xfs_defer_finish_noroll(tx0) xfs_trans_roll(tx0) txA = xfs_trans_dup(tx0) __xfs_trans_commit(tx0, true) xfs_trans_run_precommits(tx0) This currently isn't an issue because the inode item precommit is idempotent; the iunlink item precommit deletes itself so it can't be called again; and the buffer/dquot item precommits only check the incore objects for corruption. However, it doesn't make sense to run precommits twice. Fix this situation by only running precommits after finish_noroll. Cc: # v6.4 Fixes: cb042117488dbf ("xfs: defered work could create precommits") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_trans.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 05b18e30368e4..4a517250efc91 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -860,13 +860,6 @@ __xfs_trans_commit( trace_xfs_trans_commit(tp, _RET_IP_); - error = xfs_trans_run_precommits(tp); - if (error) { - if (tp->t_flags & XFS_TRANS_PERM_LOG_RES) - xfs_defer_cancel(tp); - goto out_unreserve; - } - /* * Finish deferred items on final commit. Only permanent transactions * should ever have deferred ops. @@ -877,13 +870,12 @@ __xfs_trans_commit( error = xfs_defer_finish_noroll(&tp); if (error) goto out_unreserve; - - /* Run precommits from final tx in defer chain. */ - error = xfs_trans_run_precommits(tp); - if (error) - goto out_unreserve; } + error = xfs_trans_run_precommits(tp); + if (error) + goto out_unreserve; + /* * If there is nothing to be logged by the transaction, * then unlock all of the items associated with the From a004afdc62946d3261f724c6472997085c4f0735 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:34 -0800 Subject: [PATCH 221/330] xfs: avoid nested calls to __xfs_trans_commit Currently, __xfs_trans_commit calls xfs_defer_finish_noroll, which calls __xfs_trans_commit again on the same transaction. In other words, there's a nested function call (albeit with slightly different arguments) that has caused minor amounts of confusion in the past. There's no reason to keep this around, since there's only one place where we actually want the xfs_defer_finish_noroll, and that is in the top level xfs_trans_commit call. This also reduces stack usage a little bit. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_trans.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 4a517250efc91..26bb2343082af 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -860,18 +860,6 @@ __xfs_trans_commit( trace_xfs_trans_commit(tp, _RET_IP_); - /* - * Finish deferred items on final commit. Only permanent transactions - * should ever have deferred ops. - */ - WARN_ON_ONCE(!list_empty(&tp->t_dfops) && - !(tp->t_flags & XFS_TRANS_PERM_LOG_RES)); - if (!regrant && (tp->t_flags & XFS_TRANS_PERM_LOG_RES)) { - error = xfs_defer_finish_noroll(&tp); - if (error) - goto out_unreserve; - } - error = xfs_trans_run_precommits(tp); if (error) goto out_unreserve; @@ -950,6 +938,20 @@ int xfs_trans_commit( struct xfs_trans *tp) { + /* + * Finish deferred items on final commit. Only permanent transactions + * should ever have deferred ops. + */ + WARN_ON_ONCE(!list_empty(&tp->t_dfops) && + !(tp->t_flags & XFS_TRANS_PERM_LOG_RES)); + if (tp->t_flags & XFS_TRANS_PERM_LOG_RES) { + int error = xfs_defer_finish_noroll(&tp); + if (error) { + xfs_trans_cancel(tp); + return error; + } + } + return __xfs_trans_commit(tp, false); } From 3762113b597fa600d4e03300eec048256c546b1c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:35 -0800 Subject: [PATCH 222/330] xfs: don't lose solo superblock counter update transactions Superblock counter updates are tracked via per-transaction counters in the xfs_trans object. These changes are then turned into dirty log items in xfs_trans_apply_sb_deltas just prior to commiting the log items to the CIL. However, updating the per-transaction counter deltas do not cause XFS_TRANS_DIRTY to be set on the transaction. In other words, a pure sb counter update will be silently discarded if there are no other dirty log items attached to the transaction. This is currently not the case anywhere in the filesystem because sb counter updates always dirty at least one other metadata item, but let's not leave a logic bomb. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_trans.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 26bb2343082af..427a8ba0ab99e 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -860,6 +860,13 @@ __xfs_trans_commit( trace_xfs_trans_commit(tp, _RET_IP_); + /* + * Commit per-transaction changes that are not already tracked through + * log items. This can add dirty log items to the transaction. + */ + if (tp->t_flags & XFS_TRANS_SB_DIRTY) + xfs_trans_apply_sb_deltas(tp); + error = xfs_trans_run_precommits(tp); if (error) goto out_unreserve; @@ -890,8 +897,6 @@ __xfs_trans_commit( /* * If we need to update the superblock, then do it now. */ - if (tp->t_flags & XFS_TRANS_SB_DIRTY) - xfs_trans_apply_sb_deltas(tp); xfs_trans_apply_dquot_deltas(tp); xlog_cil_commit(log, tp, &commit_seq, regrant); From 07137e925fa951646325762bda6bd2503dfe64c6 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:36 -0800 Subject: [PATCH 223/330] xfs: don't lose solo dquot update transactions Quota counter updates are tracked via incore objects which hang off the xfs_trans object. These changes are then turned into dirty log items in xfs_trans_apply_dquot_deltas just prior to commiting the log items to the CIL. However, updating the incore deltas do not cause XFS_TRANS_DIRTY to be set on the transaction. In other words, a pure quota counter update will be silently discarded if there are no other dirty log items attached to the transaction. This is currently not the case anywhere in the filesystem because quota updates always dirty at least one other metadata item, but a subsequent bug fix will add dquot log item precommits, so we actually need a dirty dquot log item prior to xfs_trans_run_precommits. Also let's not leave a logic bomb. Cc: # v2.6.35 Fixes: 0924378a689ccb ("xfs: split out iclog writing from xfs_trans_commit()") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_quota.h | 5 +++-- fs/xfs/xfs_trans.c | 10 +++------- fs/xfs/xfs_trans_dquot.c | 31 ++++++++++++++++++++++++++----- 3 files changed, 32 insertions(+), 14 deletions(-) diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index fa1317cc396c9..d7565462af3dc 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -101,7 +101,8 @@ extern void xfs_trans_free_dqinfo(struct xfs_trans *); extern void xfs_trans_mod_dquot_byino(struct xfs_trans *, struct xfs_inode *, uint, int64_t); extern void xfs_trans_apply_dquot_deltas(struct xfs_trans *); -extern void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *); +void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *tp, + bool already_locked); int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, struct xfs_inode *ip, int64_t dblocks, int64_t rblocks, bool force); extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *, @@ -173,7 +174,7 @@ static inline void xfs_trans_mod_dquot_byino(struct xfs_trans *tp, { } #define xfs_trans_apply_dquot_deltas(tp) -#define xfs_trans_unreserve_and_mod_dquots(tp) +#define xfs_trans_unreserve_and_mod_dquots(tp, a) static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, struct xfs_inode *ip, int64_t dblocks, int64_t rblocks, bool force) diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 427a8ba0ab99e..4cd25717c9d13 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -866,6 +866,7 @@ __xfs_trans_commit( */ if (tp->t_flags & XFS_TRANS_SB_DIRTY) xfs_trans_apply_sb_deltas(tp); + xfs_trans_apply_dquot_deltas(tp); error = xfs_trans_run_precommits(tp); if (error) @@ -894,11 +895,6 @@ __xfs_trans_commit( ASSERT(tp->t_ticket != NULL); - /* - * If we need to update the superblock, then do it now. - */ - xfs_trans_apply_dquot_deltas(tp); - xlog_cil_commit(log, tp, &commit_seq, regrant); xfs_trans_free(tp); @@ -924,7 +920,7 @@ __xfs_trans_commit( * the dqinfo portion to be. All that means is that we have some * (non-persistent) quota reservations that need to be unreserved. */ - xfs_trans_unreserve_and_mod_dquots(tp); + xfs_trans_unreserve_and_mod_dquots(tp, true); if (tp->t_ticket) { if (regrant && !xlog_is_shutdown(log)) xfs_log_ticket_regrant(log, tp->t_ticket); @@ -1018,7 +1014,7 @@ xfs_trans_cancel( } #endif xfs_trans_unreserve_and_mod_sb(tp); - xfs_trans_unreserve_and_mod_dquots(tp); + xfs_trans_unreserve_and_mod_dquots(tp, false); if (tp->t_ticket) { xfs_log_ticket_ungrant(log, tp->t_ticket); diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 481ba3dc9f190..713b6d243e563 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -606,6 +606,24 @@ xfs_trans_apply_dquot_deltas( ASSERT(dqp->q_blk.reserved >= dqp->q_blk.count); ASSERT(dqp->q_ino.reserved >= dqp->q_ino.count); ASSERT(dqp->q_rtb.reserved >= dqp->q_rtb.count); + + /* + * We've applied the count changes and given back + * whatever reservation we didn't use. Zero out the + * dqtrx fields. + */ + qtrx->qt_blk_res = 0; + qtrx->qt_bcount_delta = 0; + qtrx->qt_delbcnt_delta = 0; + + qtrx->qt_rtblk_res = 0; + qtrx->qt_rtblk_res_used = 0; + qtrx->qt_rtbcount_delta = 0; + qtrx->qt_delrtb_delta = 0; + + qtrx->qt_ino_res = 0; + qtrx->qt_ino_res_used = 0; + qtrx->qt_icount_delta = 0; } } } @@ -642,7 +660,8 @@ xfs_trans_unreserve_and_mod_dquots_hook( */ void xfs_trans_unreserve_and_mod_dquots( - struct xfs_trans *tp) + struct xfs_trans *tp, + bool already_locked) { int i, j; struct xfs_dquot *dqp; @@ -671,10 +690,12 @@ xfs_trans_unreserve_and_mod_dquots( * about the number of blocks used field, or deltas. * Also we don't bother to zero the fields. */ - locked = false; + locked = already_locked; if (qtrx->qt_blk_res) { - xfs_dqlock(dqp); - locked = true; + if (!locked) { + xfs_dqlock(dqp); + locked = true; + } dqp->q_blk.reserved -= (xfs_qcnt_t)qtrx->qt_blk_res; } @@ -695,7 +716,7 @@ xfs_trans_unreserve_and_mod_dquots( dqp->q_rtb.reserved -= (xfs_qcnt_t)qtrx->qt_rtblk_res; } - if (locked) + if (locked && !already_locked) xfs_dqunlock(dqp); } From a40fe30868ba433ac08376e30132400bec067583 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:36 -0800 Subject: [PATCH 224/330] xfs: separate dquot buffer reads from xfs_dqflush The first step towards holding the dquot buffer in the li_buf instead of reading it in the AIL is to separate the part that reads the buffer from the actual flush code. There should be no functional changes. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_dquot.c | 57 ++++++++++++++++++++++++++--------------- fs/xfs/xfs_dquot.h | 4 ++- fs/xfs/xfs_dquot_item.c | 20 ++++++++++++--- fs/xfs/xfs_qm.c | 37 +++++++++++++++++++++----- 4 files changed, 86 insertions(+), 32 deletions(-) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index ff982d983989b..6ec4087e38dfc 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -1238,6 +1238,42 @@ xfs_qm_dqflush_check( return NULL; } +/* + * Get the buffer containing the on-disk dquot. + * + * Requires dquot flush lock, will clear the dirty flag, delete the quota log + * item from the AIL, and shut down the system if something goes wrong. + */ +int +xfs_dquot_read_buf( + struct xfs_trans *tp, + struct xfs_dquot *dqp, + struct xfs_buf **bpp) +{ + struct xfs_mount *mp = dqp->q_mount; + struct xfs_buf *bp = NULL; + int error; + + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK, + &bp, &xfs_dquot_buf_ops); + if (error == -EAGAIN) + return error; + if (xfs_metadata_is_sick(error)) + xfs_dquot_mark_sick(dqp); + if (error) + goto out_abort; + + *bpp = bp; + return 0; + +out_abort: + dqp->q_flags &= ~XFS_DQFLAG_DIRTY; + xfs_trans_ail_delete(&dqp->q_logitem.qli_item, 0); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return error; +} + /* * Write a modified dquot to disk. * The dquot must be locked and the flush lock too taken by caller. @@ -1249,11 +1285,10 @@ xfs_qm_dqflush_check( int xfs_qm_dqflush( struct xfs_dquot *dqp, - struct xfs_buf **bpp) + struct xfs_buf *bp) { struct xfs_mount *mp = dqp->q_mount; struct xfs_log_item *lip = &dqp->q_logitem.qli_item; - struct xfs_buf *bp; struct xfs_dqblk *dqblk; xfs_failaddr_t fa; int error; @@ -1263,28 +1298,12 @@ xfs_qm_dqflush( trace_xfs_dqflush(dqp); - *bpp = NULL; - xfs_qm_dqunpin_wait(dqp); - /* - * Get the buffer containing the on-disk dquot - */ - error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK, - &bp, &xfs_dquot_buf_ops); - if (error == -EAGAIN) - goto out_unlock; - if (xfs_metadata_is_sick(error)) - xfs_dquot_mark_sick(dqp); - if (error) - goto out_abort; - fa = xfs_qm_dqflush_check(dqp); if (fa) { xfs_alert(mp, "corrupt dquot ID 0x%x in memory at %pS", dqp->q_id, fa); - xfs_buf_relse(bp); xfs_dquot_mark_sick(dqp); error = -EFSCORRUPTED; goto out_abort; @@ -1334,14 +1353,12 @@ xfs_qm_dqflush( } trace_xfs_dqflush_done(dqp); - *bpp = bp; return 0; out_abort: dqp->q_flags &= ~XFS_DQFLAG_DIRTY; xfs_trans_ail_delete(lip, 0); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); -out_unlock: xfs_dqfunlock(dqp); return error; } diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index d73d179df0095..50f8404c41176 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -214,7 +214,9 @@ void xfs_dquot_to_disk(struct xfs_disk_dquot *ddqp, struct xfs_dquot *dqp); #define XFS_DQ_IS_DIRTY(dqp) ((dqp)->q_flags & XFS_DQFLAG_DIRTY) void xfs_qm_dqdestroy(struct xfs_dquot *dqp); -int xfs_qm_dqflush(struct xfs_dquot *dqp, struct xfs_buf **bpp); +int xfs_dquot_read_buf(struct xfs_trans *tp, struct xfs_dquot *dqp, + struct xfs_buf **bpp); +int xfs_qm_dqflush(struct xfs_dquot *dqp, struct xfs_buf *bp); void xfs_qm_dqunpin_wait(struct xfs_dquot *dqp); void xfs_qm_adjust_dqtimers(struct xfs_dquot *d); void xfs_qm_adjust_dqlimits(struct xfs_dquot *d); diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 7d19091215b08..56ecc5ed01934 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -155,14 +155,26 @@ xfs_qm_dquot_logitem_push( spin_unlock(&lip->li_ailp->ail_lock); - error = xfs_qm_dqflush(dqp, &bp); + error = xfs_dquot_read_buf(NULL, dqp, &bp); + if (error) { + if (error == -EAGAIN) + rval = XFS_ITEM_LOCKED; + xfs_dqfunlock(dqp); + goto out_relock_ail; + } + + /* + * dqflush completes dqflock on error, and the delwri ioend does it on + * success. + */ + error = xfs_qm_dqflush(dqp, bp); if (!error) { if (!xfs_buf_delwri_queue(bp, buffer_list)) rval = XFS_ITEM_FLUSHING; - xfs_buf_relse(bp); - } else if (error == -EAGAIN) - rval = XFS_ITEM_LOCKED; + } + xfs_buf_relse(bp); +out_relock_ail: spin_lock(&lip->li_ailp->ail_lock); out_unlock: xfs_dqunlock(dqp); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 3c0189831f14d..d9ac50a33c57f 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -148,17 +148,28 @@ xfs_qm_dqpurge( * We don't care about getting disk errors here. We need * to purge this dquot anyway, so we go ahead regardless. */ - error = xfs_qm_dqflush(dqp, &bp); + error = xfs_dquot_read_buf(NULL, dqp, &bp); + if (error == -EAGAIN) { + xfs_dqfunlock(dqp); + dqp->q_flags &= ~XFS_DQFLAG_FREEING; + goto out_unlock; + } + if (error) + goto out_funlock; + + /* + * dqflush completes dqflock on error, and the bwrite ioend + * does it on success. + */ + error = xfs_qm_dqflush(dqp, bp); if (!error) { error = xfs_bwrite(bp); xfs_buf_relse(bp); - } else if (error == -EAGAIN) { - dqp->q_flags &= ~XFS_DQFLAG_FREEING; - goto out_unlock; } xfs_dqflock(dqp); } +out_funlock: ASSERT(atomic_read(&dqp->q_pincount) == 0); ASSERT(xlog_is_shutdown(dqp->q_logitem.qli_item.li_log) || !test_bit(XFS_LI_IN_AIL, &dqp->q_logitem.qli_item.li_flags)); @@ -494,7 +505,17 @@ xfs_qm_dquot_isolate( /* we have to drop the LRU lock to flush the dquot */ spin_unlock(&lru->lock); - error = xfs_qm_dqflush(dqp, &bp); + error = xfs_dquot_read_buf(NULL, dqp, &bp); + if (error) { + xfs_dqfunlock(dqp); + goto out_unlock_dirty; + } + + /* + * dqflush completes dqflock on error, and the delwri ioend + * does it on success. + */ + error = xfs_qm_dqflush(dqp, bp); if (error) goto out_unlock_dirty; @@ -1489,11 +1510,13 @@ xfs_qm_flush_one( goto out_unlock; } - error = xfs_qm_dqflush(dqp, &bp); + error = xfs_dquot_read_buf(NULL, dqp, &bp); if (error) goto out_unlock; - xfs_buf_delwri_queue(bp, buffer_list); + error = xfs_qm_dqflush(dqp, bp); + if (!error) + xfs_buf_delwri_queue(bp, buffer_list); xfs_buf_relse(bp); out_unlock: xfs_dqunlock(dqp); From ec88b41b932d5731291dcc0d0d63ea13ab8e07d5 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:37 -0800 Subject: [PATCH 225/330] xfs: clean up log item accesses in xfs_qm_dqflush{,_done} Clean up these functions a little bit before we move on to the real modifications, and make the variable naming consistent for dquot log items. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_dquot.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 6ec4087e38dfc..1dc85de58e59e 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -1142,8 +1142,9 @@ static void xfs_qm_dqflush_done( struct xfs_log_item *lip) { - struct xfs_dq_logitem *qip = (struct xfs_dq_logitem *)lip; - struct xfs_dquot *dqp = qip->qli_dquot; + struct xfs_dq_logitem *qlip = + container_of(lip, struct xfs_dq_logitem, qli_item); + struct xfs_dquot *dqp = qlip->qli_dquot; struct xfs_ail *ailp = lip->li_ailp; xfs_lsn_t tail_lsn; @@ -1156,12 +1157,12 @@ xfs_qm_dqflush_done( * holding the lock before removing the dquot from the AIL. */ if (test_bit(XFS_LI_IN_AIL, &lip->li_flags) && - ((lip->li_lsn == qip->qli_flush_lsn) || + (lip->li_lsn == qlip->qli_flush_lsn || test_bit(XFS_LI_FAILED, &lip->li_flags))) { spin_lock(&ailp->ail_lock); xfs_clear_li_failed(lip); - if (lip->li_lsn == qip->qli_flush_lsn) { + if (lip->li_lsn == qlip->qli_flush_lsn) { /* xfs_ail_update_finish() drops the AIL lock */ tail_lsn = xfs_ail_delete_one(ailp, lip); xfs_ail_update_finish(ailp, tail_lsn); @@ -1319,7 +1320,7 @@ xfs_qm_dqflush( dqp->q_flags &= ~XFS_DQFLAG_DIRTY; xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn, - &dqp->q_logitem.qli_item.li_lsn); + &lip->li_lsn); /* * copy the lsn into the on-disk dquot now while we have the in memory @@ -1331,7 +1332,7 @@ xfs_qm_dqflush( * of a dquot without an up-to-date CRC getting to disk. */ if (xfs_has_crc(mp)) { - dqblk->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn); + dqblk->dd_lsn = cpu_to_be64(lip->li_lsn); xfs_update_cksum((char *)dqblk, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); } @@ -1341,7 +1342,7 @@ xfs_qm_dqflush( * the AIL and release the flush lock once the dquot is synced to disk. */ bp->b_flags |= _XBF_DQUOTS; - list_add_tail(&dqp->q_logitem.qli_item.li_bio_list, &bp->b_li_list); + list_add_tail(&lip->li_bio_list, &bp->b_li_list); /* * If the buffer is pinned then push on the log so we won't From acc8f8628c3737108f36e5637f4d5daeaf96d90e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:38 -0800 Subject: [PATCH 226/330] xfs: attach dquot buffer to dquot log item buffer Ever since 6.12-rc1, I've observed a pile of warnings from the kernel when running fstests with quotas enabled: WARNING: CPU: 1 PID: 458580 at mm/page_alloc.c:4221 __alloc_pages_noprof+0xc9c/0xf18 CPU: 1 UID: 0 PID: 458580 Comm: xfsaild/sda3 Tainted: G W 6.12.0-rc6-djwa #rc6 6ee3e0e531f6457e2d26aa008a3b65ff184b377c Call trace: __alloc_pages_noprof+0xc9c/0xf18 alloc_pages_mpol_noprof+0x94/0x240 alloc_pages_noprof+0x68/0xf8 new_slab+0x3e0/0x568 ___slab_alloc+0x5a0/0xb88 __slab_alloc.constprop.0+0x7c/0xf8 __kmalloc_noprof+0x404/0x4d0 xfs_buf_get_map+0x594/0xde0 [xfs 384cb02810558b4c490343c164e9407332118f88] xfs_buf_read_map+0x64/0x2e0 [xfs 384cb02810558b4c490343c164e9407332118f88] xfs_trans_read_buf_map+0x1dc/0x518 [xfs 384cb02810558b4c490343c164e9407332118f88] xfs_qm_dqflush+0xac/0x468 [xfs 384cb02810558b4c490343c164e9407332118f88] xfs_qm_dquot_logitem_push+0xe4/0x148 [xfs 384cb02810558b4c490343c164e9407332118f88] xfsaild+0x3f4/0xde8 [xfs 384cb02810558b4c490343c164e9407332118f88] kthread+0x110/0x128 ret_from_fork+0x10/0x20 ---[ end trace 0000000000000000 ]--- This corresponds to the line: WARN_ON_ONCE(current->flags & PF_MEMALLOC); within the NOFAIL checks. What's happening here is that the XFS AIL is trying to write a disk quota update back into the filesystem, but for that it needs to read the ondisk buffer for the dquot. The buffer is not in memory anymore, probably because it was evicted. Regardless, the buffer cache tries to allocate a new buffer, but those allocations are NOFAIL. The AIL thread has marked itself PF_MEMALLOC (aka noreclaim) since commit 43ff2122e6492b ("xfs: on-stack delayed write buffer lists") presumably because reclaim can push on XFS to push on the AIL. An easy way to fix this probably would have been to drop the NOFAIL flag from the xfs_buf allocation and open code a retry loop, but then there's still the problem that for bs>ps filesystems, the buffer itself could require up to 64k worth of pages. Inode items had similar behavior (multi-page cluster buffers that we don't want to allocate in the AIL) which we solved by making transaction precommit attach the inode cluster buffers to the dirty log item. Let's solve the dquot problem in the same way. So: Make a real precommit handler to read the dquot buffer and attach it to the log item; pass it to dqflush in the push method; and have the iodone function detach the buffer once we've flushed everything. Add a state flag to the log item to track when a thread has entered the precommit -> push mechanism to skip the detaching if it turns out that the dquot is very busy, as we don't hold the dquot lock between log item commit and AIL push). Reading and attaching the dquot buffer in the precommit hook is inspired by the work done for inode cluster buffers some time ago. Cc: # v6.12 Fixes: 903edea6c53f09 ("mm: warn about illegal __GFP_NOFAIL usage in a more appropriate location and manner") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_dquot.c | 130 ++++++++++++++++++++++++++++++++++++++-- fs/xfs/xfs_dquot.h | 6 +- fs/xfs/xfs_dquot_item.c | 39 +++++++----- fs/xfs/xfs_dquot_item.h | 7 +++ fs/xfs/xfs_qm.c | 9 ++- fs/xfs/xfs_trans_ail.c | 2 +- 6 files changed, 169 insertions(+), 24 deletions(-) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 1dc85de58e59e..708fd3358375f 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -68,6 +68,30 @@ xfs_dquot_mark_sick( } } +/* + * Detach the dquot buffer if it's still attached, because we can get called + * through dqpurge after a log shutdown. Caller must hold the dqflock or have + * otherwise isolated the dquot. + */ +void +xfs_dquot_detach_buf( + struct xfs_dquot *dqp) +{ + struct xfs_dq_logitem *qlip = &dqp->q_logitem; + struct xfs_buf *bp = NULL; + + spin_lock(&qlip->qli_lock); + if (qlip->qli_item.li_buf) { + bp = qlip->qli_item.li_buf; + qlip->qli_item.li_buf = NULL; + } + spin_unlock(&qlip->qli_lock); + if (bp) { + list_del_init(&qlip->qli_item.li_bio_list); + xfs_buf_rele(bp); + } +} + /* * This is called to free all the memory associated with a dquot */ @@ -76,6 +100,7 @@ xfs_qm_dqdestroy( struct xfs_dquot *dqp) { ASSERT(list_empty(&dqp->q_lru)); + ASSERT(dqp->q_logitem.qli_item.li_buf == NULL); kvfree(dqp->q_logitem.qli_item.li_lv_shadow); mutex_destroy(&dqp->q_qlock); @@ -1146,6 +1171,7 @@ xfs_qm_dqflush_done( container_of(lip, struct xfs_dq_logitem, qli_item); struct xfs_dquot *dqp = qlip->qli_dquot; struct xfs_ail *ailp = lip->li_ailp; + struct xfs_buf *bp = NULL; xfs_lsn_t tail_lsn; /* @@ -1171,6 +1197,20 @@ xfs_qm_dqflush_done( } } + /* + * If this dquot hasn't been dirtied since initiating the last dqflush, + * release the buffer reference. We already unlinked this dquot item + * from the buffer. + */ + spin_lock(&qlip->qli_lock); + if (!qlip->qli_dirty) { + bp = lip->li_buf; + lip->li_buf = NULL; + } + spin_unlock(&qlip->qli_lock); + if (bp) + xfs_buf_rele(bp); + /* * Release the dq's flush lock since we're done with it. */ @@ -1197,7 +1237,7 @@ xfs_buf_dquot_io_fail( spin_lock(&bp->b_mount->m_ail->ail_lock); list_for_each_entry(lip, &bp->b_li_list, li_bio_list) - xfs_set_li_failed(lip, bp); + set_bit(XFS_LI_FAILED, &lip->li_flags); spin_unlock(&bp->b_mount->m_ail->ail_lock); } @@ -1249,6 +1289,7 @@ int xfs_dquot_read_buf( struct xfs_trans *tp, struct xfs_dquot *dqp, + xfs_buf_flags_t xbf_flags, struct xfs_buf **bpp) { struct xfs_mount *mp = dqp->q_mount; @@ -1256,7 +1297,7 @@ xfs_dquot_read_buf( int error; error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK, + mp->m_quotainfo->qi_dqchunklen, xbf_flags, &bp, &xfs_dquot_buf_ops); if (error == -EAGAIN) return error; @@ -1275,6 +1316,77 @@ xfs_dquot_read_buf( return error; } +/* + * Attach a dquot buffer to this dquot to avoid allocating a buffer during a + * dqflush, since dqflush can be called from reclaim context. + */ +int +xfs_dquot_attach_buf( + struct xfs_trans *tp, + struct xfs_dquot *dqp) +{ + struct xfs_dq_logitem *qlip = &dqp->q_logitem; + struct xfs_log_item *lip = &qlip->qli_item; + int error; + + spin_lock(&qlip->qli_lock); + if (!lip->li_buf) { + struct xfs_buf *bp = NULL; + + spin_unlock(&qlip->qli_lock); + error = xfs_dquot_read_buf(tp, dqp, 0, &bp); + if (error) + return error; + + /* + * Attach the dquot to the buffer so that the AIL does not have + * to read the dquot buffer to push this item. + */ + xfs_buf_hold(bp); + spin_lock(&qlip->qli_lock); + lip->li_buf = bp; + xfs_trans_brelse(tp, bp); + } + qlip->qli_dirty = true; + spin_unlock(&qlip->qli_lock); + + return 0; +} + +/* + * Get a new reference the dquot buffer attached to this dquot for a dqflush + * operation. + * + * Returns 0 and a NULL bp if none was attached to the dquot; 0 and a locked + * bp; or -EAGAIN if the buffer could not be locked. + */ +int +xfs_dquot_use_attached_buf( + struct xfs_dquot *dqp, + struct xfs_buf **bpp) +{ + struct xfs_buf *bp = dqp->q_logitem.qli_item.li_buf; + + /* + * A NULL buffer can happen if the dquot dirty flag was set but the + * filesystem shut down before transaction commit happened. In that + * case we're not going to flush anyway. + */ + if (!bp) { + ASSERT(xfs_is_shutdown(dqp->q_mount)); + + *bpp = NULL; + return 0; + } + + if (!xfs_buf_trylock(bp)) + return -EAGAIN; + + xfs_buf_hold(bp); + *bpp = bp; + return 0; +} + /* * Write a modified dquot to disk. * The dquot must be locked and the flush lock too taken by caller. @@ -1289,7 +1401,8 @@ xfs_qm_dqflush( struct xfs_buf *bp) { struct xfs_mount *mp = dqp->q_mount; - struct xfs_log_item *lip = &dqp->q_logitem.qli_item; + struct xfs_dq_logitem *qlip = &dqp->q_logitem; + struct xfs_log_item *lip = &qlip->qli_item; struct xfs_dqblk *dqblk; xfs_failaddr_t fa; int error; @@ -1319,8 +1432,15 @@ xfs_qm_dqflush( */ dqp->q_flags &= ~XFS_DQFLAG_DIRTY; - xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn, - &lip->li_lsn); + /* + * We hold the dquot lock, so nobody can dirty it while we're + * scheduling the write out. Clear the dirty-since-flush flag. + */ + spin_lock(&qlip->qli_lock); + qlip->qli_dirty = false; + spin_unlock(&qlip->qli_lock); + + xfs_trans_ail_copy_lsn(mp->m_ail, &qlip->qli_flush_lsn, &lip->li_lsn); /* * copy the lsn into the on-disk dquot now while we have the in memory diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 50f8404c41176..c7e80fc908238 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -215,7 +215,7 @@ void xfs_dquot_to_disk(struct xfs_disk_dquot *ddqp, struct xfs_dquot *dqp); void xfs_qm_dqdestroy(struct xfs_dquot *dqp); int xfs_dquot_read_buf(struct xfs_trans *tp, struct xfs_dquot *dqp, - struct xfs_buf **bpp); + xfs_buf_flags_t flags, struct xfs_buf **bpp); int xfs_qm_dqflush(struct xfs_dquot *dqp, struct xfs_buf *bp); void xfs_qm_dqunpin_wait(struct xfs_dquot *dqp); void xfs_qm_adjust_dqtimers(struct xfs_dquot *d); @@ -239,6 +239,10 @@ void xfs_dqlockn(struct xfs_dqtrx *q); void xfs_dquot_set_prealloc_limits(struct xfs_dquot *); +int xfs_dquot_attach_buf(struct xfs_trans *tp, struct xfs_dquot *dqp); +int xfs_dquot_use_attached_buf(struct xfs_dquot *dqp, struct xfs_buf **bpp); +void xfs_dquot_detach_buf(struct xfs_dquot *dqp); + static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) { xfs_dqlock(dqp); diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 56ecc5ed01934..271b195ebb932 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -123,8 +123,9 @@ xfs_qm_dquot_logitem_push( __releases(&lip->li_ailp->ail_lock) __acquires(&lip->li_ailp->ail_lock) { - struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; - struct xfs_buf *bp = lip->li_buf; + struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); + struct xfs_dquot *dqp = qlip->qli_dquot; + struct xfs_buf *bp; uint rval = XFS_ITEM_SUCCESS; int error; @@ -155,11 +156,10 @@ xfs_qm_dquot_logitem_push( spin_unlock(&lip->li_ailp->ail_lock); - error = xfs_dquot_read_buf(NULL, dqp, &bp); - if (error) { - if (error == -EAGAIN) - rval = XFS_ITEM_LOCKED; + error = xfs_dquot_use_attached_buf(dqp, &bp); + if (error == -EAGAIN) { xfs_dqfunlock(dqp); + rval = XFS_ITEM_LOCKED; goto out_relock_ail; } @@ -207,12 +207,10 @@ xfs_qm_dquot_logitem_committing( } #ifdef DEBUG_EXPENSIVE -static int -xfs_qm_dquot_logitem_precommit( - struct xfs_trans *tp, - struct xfs_log_item *lip) +static void +xfs_qm_dquot_logitem_precommit_check( + struct xfs_dquot *dqp) { - struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; struct xfs_mount *mp = dqp->q_mount; struct xfs_disk_dquot ddq = { }; xfs_failaddr_t fa; @@ -228,13 +226,24 @@ xfs_qm_dquot_logitem_precommit( xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); ASSERT(fa == NULL); } - - return 0; } #else -# define xfs_qm_dquot_logitem_precommit NULL +# define xfs_qm_dquot_logitem_precommit_check(...) ((void)0) #endif +static int +xfs_qm_dquot_logitem_precommit( + struct xfs_trans *tp, + struct xfs_log_item *lip) +{ + struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); + struct xfs_dquot *dqp = qlip->qli_dquot; + + xfs_qm_dquot_logitem_precommit_check(dqp); + + return xfs_dquot_attach_buf(tp, dqp); +} + static const struct xfs_item_ops xfs_dquot_item_ops = { .iop_size = xfs_qm_dquot_logitem_size, .iop_precommit = xfs_qm_dquot_logitem_precommit, @@ -259,5 +268,7 @@ xfs_qm_dquot_logitem_init( xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT, &xfs_dquot_item_ops); + spin_lock_init(&lp->qli_lock); lp->qli_dquot = dqp; + lp->qli_dirty = false; } diff --git a/fs/xfs/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h index 794710c244749..d66e52807d76d 100644 --- a/fs/xfs/xfs_dquot_item.h +++ b/fs/xfs/xfs_dquot_item.h @@ -14,6 +14,13 @@ struct xfs_dq_logitem { struct xfs_log_item qli_item; /* common portion */ struct xfs_dquot *qli_dquot; /* dquot ptr */ xfs_lsn_t qli_flush_lsn; /* lsn at last flush */ + + /* + * We use this spinlock to coordinate access to the li_buf pointer in + * the log item and the qli_dirty flag. + */ + spinlock_t qli_lock; + bool qli_dirty; /* dirtied since last flush? */ }; void xfs_qm_dquot_logitem_init(struct xfs_dquot *dqp); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index d9ac50a33c57f..7d07d4b5c3398 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -148,7 +148,7 @@ xfs_qm_dqpurge( * We don't care about getting disk errors here. We need * to purge this dquot anyway, so we go ahead regardless. */ - error = xfs_dquot_read_buf(NULL, dqp, &bp); + error = xfs_dquot_read_buf(NULL, dqp, XBF_TRYLOCK, &bp); if (error == -EAGAIN) { xfs_dqfunlock(dqp); dqp->q_flags &= ~XFS_DQFLAG_FREEING; @@ -168,6 +168,7 @@ xfs_qm_dqpurge( } xfs_dqflock(dqp); } + xfs_dquot_detach_buf(dqp); out_funlock: ASSERT(atomic_read(&dqp->q_pincount) == 0); @@ -505,7 +506,7 @@ xfs_qm_dquot_isolate( /* we have to drop the LRU lock to flush the dquot */ spin_unlock(&lru->lock); - error = xfs_dquot_read_buf(NULL, dqp, &bp); + error = xfs_dquot_read_buf(NULL, dqp, XBF_TRYLOCK, &bp); if (error) { xfs_dqfunlock(dqp); goto out_unlock_dirty; @@ -523,6 +524,8 @@ xfs_qm_dquot_isolate( xfs_buf_relse(bp); goto out_unlock_dirty; } + + xfs_dquot_detach_buf(dqp); xfs_dqfunlock(dqp); /* @@ -1510,7 +1513,7 @@ xfs_qm_flush_one( goto out_unlock; } - error = xfs_dquot_read_buf(NULL, dqp, &bp); + error = xfs_dquot_read_buf(NULL, dqp, XBF_TRYLOCK, &bp); if (error) goto out_unlock; diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 8ede9d099d1fe..f56d62dced97b 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -360,7 +360,7 @@ xfsaild_resubmit_item( /* protected by ail_lock */ list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { - if (bp->b_flags & _XBF_INODES) + if (bp->b_flags & (_XBF_INODES | _XBF_DQUOTS)) clear_bit(XFS_LI_FAILED, &lip->li_flags); else xfs_clear_li_failed(lip); From ca378189fdfa890a4f0622f85ee41b710bbac271 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:39 -0800 Subject: [PATCH 227/330] xfs: convert quotacheck to attach dquot buffers Now that we've converted the dquot logging machinery to attach the dquot buffer to the li_buf pointer so that the AIL dqflush doesn't have to allocate or read buffers in a reclaim path, do the same for the quotacheck code so that the reclaim shrinker dqflush call doesn't have to do that either. Cc: # v6.12 Fixes: 903edea6c53f09 ("mm: warn about illegal __GFP_NOFAIL usage in a more appropriate location and manner") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_dquot.c | 9 +++------ fs/xfs/xfs_dquot.h | 2 -- fs/xfs/xfs_qm.c | 18 +++++++++++++----- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 708fd3358375f..f11d475898f28 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -1285,11 +1285,10 @@ xfs_qm_dqflush_check( * Requires dquot flush lock, will clear the dirty flag, delete the quota log * item from the AIL, and shut down the system if something goes wrong. */ -int +static int xfs_dquot_read_buf( struct xfs_trans *tp, struct xfs_dquot *dqp, - xfs_buf_flags_t xbf_flags, struct xfs_buf **bpp) { struct xfs_mount *mp = dqp->q_mount; @@ -1297,10 +1296,8 @@ xfs_dquot_read_buf( int error; error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen, xbf_flags, + mp->m_quotainfo->qi_dqchunklen, 0, &bp, &xfs_dquot_buf_ops); - if (error == -EAGAIN) - return error; if (xfs_metadata_is_sick(error)) xfs_dquot_mark_sick(dqp); if (error) @@ -1334,7 +1331,7 @@ xfs_dquot_attach_buf( struct xfs_buf *bp = NULL; spin_unlock(&qlip->qli_lock); - error = xfs_dquot_read_buf(tp, dqp, 0, &bp); + error = xfs_dquot_read_buf(tp, dqp, &bp); if (error) return error; diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index c7e80fc908238..c617bac75361b 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -214,8 +214,6 @@ void xfs_dquot_to_disk(struct xfs_disk_dquot *ddqp, struct xfs_dquot *dqp); #define XFS_DQ_IS_DIRTY(dqp) ((dqp)->q_flags & XFS_DQFLAG_DIRTY) void xfs_qm_dqdestroy(struct xfs_dquot *dqp); -int xfs_dquot_read_buf(struct xfs_trans *tp, struct xfs_dquot *dqp, - xfs_buf_flags_t flags, struct xfs_buf **bpp); int xfs_qm_dqflush(struct xfs_dquot *dqp, struct xfs_buf *bp); void xfs_qm_dqunpin_wait(struct xfs_dquot *dqp); void xfs_qm_adjust_dqtimers(struct xfs_dquot *d); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 7d07d4b5c3398..69b70c3e999d7 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -148,13 +148,13 @@ xfs_qm_dqpurge( * We don't care about getting disk errors here. We need * to purge this dquot anyway, so we go ahead regardless. */ - error = xfs_dquot_read_buf(NULL, dqp, XBF_TRYLOCK, &bp); + error = xfs_dquot_use_attached_buf(dqp, &bp); if (error == -EAGAIN) { xfs_dqfunlock(dqp); dqp->q_flags &= ~XFS_DQFLAG_FREEING; goto out_unlock; } - if (error) + if (!bp) goto out_funlock; /* @@ -506,8 +506,8 @@ xfs_qm_dquot_isolate( /* we have to drop the LRU lock to flush the dquot */ spin_unlock(&lru->lock); - error = xfs_dquot_read_buf(NULL, dqp, XBF_TRYLOCK, &bp); - if (error) { + error = xfs_dquot_use_attached_buf(dqp, &bp); + if (!bp || error == -EAGAIN) { xfs_dqfunlock(dqp); goto out_unlock_dirty; } @@ -1331,6 +1331,10 @@ xfs_qm_quotacheck_dqadjust( return error; } + error = xfs_dquot_attach_buf(NULL, dqp); + if (error) + return error; + trace_xfs_dqadjust(dqp); /* @@ -1513,9 +1517,13 @@ xfs_qm_flush_one( goto out_unlock; } - error = xfs_dquot_read_buf(NULL, dqp, XBF_TRYLOCK, &bp); + error = xfs_dquot_use_attached_buf(dqp, &bp); if (error) goto out_unlock; + if (!bp) { + error = -EFSCORRUPTED; + goto out_unlock; + } error = xfs_qm_dqflush(dqp, bp); if (!error) From 7f8a44f37229fc76bfcafa341a4b8862368ef44a Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:39 -0800 Subject: [PATCH 228/330] xfs: fix sb_spino_align checks for large fsblock sizes For a sparse inodes filesystem, mkfs.xfs computes the values of sb_spino_align and sb_inoalignmt with the following code: int cluster_size = XFS_INODE_BIG_CLUSTER_SIZE; if (cfg->sb_feat.crcs_enabled) cluster_size *= cfg->inodesize / XFS_DINODE_MIN_SIZE; sbp->sb_spino_align = cluster_size >> cfg->blocklog; sbp->sb_inoalignmt = XFS_INODES_PER_CHUNK * cfg->inodesize >> cfg->blocklog; On a V5 filesystem with 64k fsblocks and 512 byte inodes, this results in cluster_size = 8192 * (512 / 256) = 16384. As a result, sb_spino_align and sb_inoalignmt are both set to zero. Unfortunately, this trips the new sb_spino_align check that was just added to xfs_validate_sb_common, and the mkfs fails: # mkfs.xfs -f -b size=64k, /dev/sda meta-data=/dev/sda isize=512 agcount=4, agsize=81136 blks = sectsz=512 attr=2, projid32bit=1 = crc=1 finobt=1, sparse=1, rmapbt=1 = reflink=1 bigtime=1 inobtcount=1 nrext64=1 = exchange=0 metadir=0 data = bsize=65536 blocks=324544, imaxpct=25 = sunit=0 swidth=0 blks naming =version 2 bsize=65536 ascii-ci=0, ftype=1, parent=0 log =internal log bsize=65536 blocks=5006, version=2 = sectsz=512 sunit=0 blks, lazy-count=1 realtime =none extsz=65536 blocks=0, rtextents=0 = rgcount=0 rgsize=0 extents Discarding blocks...Sparse inode alignment (0) is invalid. Metadata corruption detected at 0x560ac5a80bbe, xfs_sb block 0x0/0x200 libxfs_bwrite: write verifier failed on xfs_sb bno 0x0/0x1 mkfs.xfs: Releasing dirty buffer to free list! found dirty buffer (bulk) on free list! Sparse inode alignment (0) is invalid. Metadata corruption detected at 0x560ac5a80bbe, xfs_sb block 0x0/0x200 libxfs_bwrite: write verifier failed on xfs_sb bno 0x0/0x1 mkfs.xfs: writing AG headers failed, err=22 Prior to commit 59e43f5479cce1 this all worked fine, even if "sparse" inodes are somewhat meaningless when everything fits in a single fsblock. Adjust the checks to handle existing filesystems. Cc: # v6.13-rc1 Fixes: 59e43f5479cce1 ("xfs: sb_spino_align is not verified") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_sb.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index a809513a290cf..3b5623611eba0 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -494,12 +494,13 @@ xfs_validate_sb_common( return -EINVAL; } - if (!sbp->sb_spino_align || - sbp->sb_spino_align > sbp->sb_inoalignmt || - (sbp->sb_inoalignmt % sbp->sb_spino_align) != 0) { + if (sbp->sb_spino_align && + (sbp->sb_spino_align > sbp->sb_inoalignmt || + (sbp->sb_inoalignmt % sbp->sb_spino_align) != 0)) { xfs_warn(mp, - "Sparse inode alignment (%u) is invalid.", - sbp->sb_spino_align); +"Sparse inode alignment (%u) is invalid, must be integer factor of (%u).", + sbp->sb_spino_align, + sbp->sb_inoalignmt); return -EINVAL; } } else if (sbp->sb_spino_align) { From 3853b5e1d7ccb83f572df8a12619d1a58d266d6d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:40 -0800 Subject: [PATCH 229/330] xfs: don't move nondir/nonreg temporary repair files to the metadir namespace Only directories or regular files are allowed in the metadata directory tree. Don't move the repair tempfile to the metadir namespace if this is not true; this will cause the inode verifiers to trip. xrep_tempfile_adjust_directory_tree opportunistically moves sc->tempip from the regular directory tree to the metadata directory tree if sc->ip is part of the metadata directory tree. However, the scrub setup functions grab sc->ip and create sc->tempip before we actually get around to checking if the file mode is the right type for the scrubber. IOWs, you can invoke the symlink scrubber with the file handle of a subdirectory in the metadir. xrep_setup_symlink will create a temporary symlink file, xrep_tempfile_adjust_directory_tree will foolishly try to set the METADATA flag on the temp symlink, which trips the inode verifier in the inode item precommit, which shuts down the filesystem when expensive checks are turned on. If they're /not/ turned on, then xchk_symlink will return ENOENT when it sees that it's been passed a symlink, but the invalid inode could still get flushed to disk. We don't want that. Cc: # v6.13-rc1 Fixes: 9dc31acb01a1c7 ("xfs: move repair temporary files to the metadata directory tree") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/tempfile.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/xfs/scrub/tempfile.c b/fs/xfs/scrub/tempfile.c index dc3802c7f678c..2d7ca7e1bbca0 100644 --- a/fs/xfs/scrub/tempfile.c +++ b/fs/xfs/scrub/tempfile.c @@ -184,11 +184,18 @@ xrep_tempfile_create( } /* + * Move sc->tempip from the regular directory tree to the metadata directory + * tree if sc->ip is part of the metadata directory tree and tempip has an + * eligible file mode. + * * Temporary files have to be created before we even know which inode we're * going to scrub, so we assume that they will be part of the regular directory * tree. If it turns out that we're actually scrubbing a file from the * metadata directory tree, we have to subtract the temp file from the root - * dquots and detach the dquots. + * dquots and detach the dquots prior to setting the METADATA iflag. However, + * the scrub setup functions grab sc->ip and create sc->tempip before we + * actually get around to checking if the file mode is the right type for the + * scrubber. */ int xrep_tempfile_adjust_directory_tree( @@ -204,6 +211,9 @@ xrep_tempfile_adjust_directory_tree( if (!sc->ip || !xfs_is_metadir_inode(sc->ip)) return 0; + if (!S_ISDIR(VFS_I(sc->tempip)->i_mode) && + !S_ISREG(VFS_I(sc->tempip)->i_mode)) + return 0; xfs_ilock(sc->tempip, XFS_IOLOCK_EXCL); sc->temp_ilock_flags |= XFS_IOLOCK_EXCL; From e57e083be9b9bc5c341e1245b988f290c09a5ed7 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:41 -0800 Subject: [PATCH 230/330] xfs: don't crash on corrupt /quotas dirent If the /quotas dirent points to an inode but the inode isn't loadable (and hence mkdir returns -EEXIST), don't crash, just bail out. Cc: # v6.13-rc1 Fixes: e80fbe1ad8eff7 ("xfs: use metadir for quota inodes") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_qm.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 69b70c3e999d7..dc8b1010d4d33 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -731,6 +731,13 @@ xfs_qm_create_metadir_qinos( error = xfs_dqinode_mkdir_parent(mp, &qi->qi_dirip); if (error && error != -EEXIST) return error; + /* + * If the /quotas dirent points to an inode that isn't + * loadable, qi_dirip will be NULL but mkdir_parent will return + * -EEXIST. In this case the metadir is corrupt, so bail out. + */ + if (XFS_IS_CORRUPT(mp, qi->qi_dirip == NULL)) + return -EFSCORRUPTED; } if (XFS_IS_UQUOTA_ON(mp) && !qi->qi_uquotaip) { From 06b20ef09ba16374e1e68f1e8dbe434c4ad4e6fd Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:42 -0800 Subject: [PATCH 231/330] xfs: check pre-metadir fields correctly The checks that were added to the superblock scrubber for metadata directories aren't quite right -- the old inode pointers are now defined to be zeroes until someone else reuses them. Also consolidate the new metadir field checks to one place; they were inexplicably scattered around. Cc: # v6.13-rc1 Fixes: 28d756d4d562dc ("xfs: update sb field checks when metadir is turned on") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/agheader.c | 40 +++++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index 1d41b85478da9..88063d67cb5fd 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -145,8 +145,11 @@ xchk_superblock( xchk_block_set_preen(sc, bp); if (xfs_has_metadir(sc->mp)) { - if (sb->sb_metadirino != cpu_to_be64(mp->m_sb.sb_metadirino)) - xchk_block_set_preen(sc, bp); + if (sb->sb_rbmino != cpu_to_be64(0)) + xchk_block_set_corrupt(sc, bp); + + if (sb->sb_rsumino != cpu_to_be64(0)) + xchk_block_set_corrupt(sc, bp); } else { if (sb->sb_rbmino != cpu_to_be64(mp->m_sb.sb_rbmino)) xchk_block_set_preen(sc, bp); @@ -229,7 +232,13 @@ xchk_superblock( * sb_icount, sb_ifree, sb_fdblocks, sb_frexents */ - if (!xfs_has_metadir(mp)) { + if (xfs_has_metadir(mp)) { + if (sb->sb_uquotino != cpu_to_be64(0)) + xchk_block_set_corrupt(sc, bp); + + if (sb->sb_gquotino != cpu_to_be64(0)) + xchk_block_set_preen(sc, bp); + } else { if (sb->sb_uquotino != cpu_to_be64(mp->m_sb.sb_uquotino)) xchk_block_set_preen(sc, bp); @@ -281,15 +290,8 @@ xchk_superblock( if (!!(sb->sb_features2 & cpu_to_be32(~v2_ok))) xchk_block_set_corrupt(sc, bp); - if (xfs_has_metadir(mp)) { - if (sb->sb_rgblklog != mp->m_sb.sb_rgblklog) - xchk_block_set_corrupt(sc, bp); - if (memchr_inv(sb->sb_pad, 0, sizeof(sb->sb_pad))) - xchk_block_set_preen(sc, bp); - } else { - if (sb->sb_features2 != sb->sb_bad_features2) - xchk_block_set_preen(sc, bp); - } + if (sb->sb_features2 != sb->sb_bad_features2) + xchk_block_set_preen(sc, bp); } /* Check sb_features2 flags that are set at mkfs time. */ @@ -351,7 +353,10 @@ xchk_superblock( if (sb->sb_spino_align != cpu_to_be32(mp->m_sb.sb_spino_align)) xchk_block_set_corrupt(sc, bp); - if (!xfs_has_metadir(mp)) { + if (xfs_has_metadir(mp)) { + if (sb->sb_pquotino != cpu_to_be64(0)) + xchk_block_set_corrupt(sc, bp); + } else { if (sb->sb_pquotino != cpu_to_be64(mp->m_sb.sb_pquotino)) xchk_block_set_preen(sc, bp); } @@ -366,11 +371,20 @@ xchk_superblock( } if (xfs_has_metadir(mp)) { + if (sb->sb_metadirino != cpu_to_be64(mp->m_sb.sb_metadirino)) + xchk_block_set_preen(sc, bp); + if (sb->sb_rgcount != cpu_to_be32(mp->m_sb.sb_rgcount)) xchk_block_set_corrupt(sc, bp); if (sb->sb_rgextents != cpu_to_be32(mp->m_sb.sb_rgextents)) xchk_block_set_corrupt(sc, bp); + + if (sb->sb_rgblklog != mp->m_sb.sb_rgblklog) + xchk_block_set_corrupt(sc, bp); + + if (memchr_inv(sb->sb_pad, 0, sizeof(sb->sb_pad))) + xchk_block_set_corrupt(sc, bp); } /* Everything else must be zero. */ From c004a793e0ec34047c3bd423bcd8966f5fac88dc Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:42 -0800 Subject: [PATCH 232/330] xfs: fix zero byte checking in the superblock scrubber The logic to check that the region past the end of the superblock is all zeroes is wrong -- we don't want to check only the bytes past the end of the maximally sized ondisk superblock structure as currently defined in xfs_format.h; we want to check the bytes beyond the end of the ondisk as defined by the feature bits. Port the superblock size logic from xfs_repair and then put it to use in xfs_scrub. Cc: # v4.15 Fixes: 21fb4cb1981ef7 ("xfs: scrub the secondary superblocks") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/agheader.c | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index 88063d67cb5fd..9f8c312dfd3c8 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -59,6 +59,32 @@ xchk_superblock_xref( /* scrub teardown will take care of sc->sa for us */ } +/* + * Calculate the ondisk superblock size in bytes given the feature set of the + * mounted filesystem (aka the primary sb). This is subtlely different from + * the logic in xfs_repair, which computes the size of a secondary sb given the + * featureset listed in the secondary sb. + */ +STATIC size_t +xchk_superblock_ondisk_size( + struct xfs_mount *mp) +{ + if (xfs_has_metadir(mp)) + return offsetofend(struct xfs_dsb, sb_pad); + if (xfs_has_metauuid(mp)) + return offsetofend(struct xfs_dsb, sb_meta_uuid); + if (xfs_has_crc(mp)) + return offsetofend(struct xfs_dsb, sb_lsn); + if (xfs_sb_version_hasmorebits(&mp->m_sb)) + return offsetofend(struct xfs_dsb, sb_bad_features2); + if (xfs_has_logv2(mp)) + return offsetofend(struct xfs_dsb, sb_logsunit); + if (xfs_has_sector(mp)) + return offsetofend(struct xfs_dsb, sb_logsectsize); + /* only support dirv2 or more recent */ + return offsetofend(struct xfs_dsb, sb_dirblklog); +} + /* * Scrub the filesystem superblock. * @@ -75,6 +101,7 @@ xchk_superblock( struct xfs_buf *bp; struct xfs_dsb *sb; struct xfs_perag *pag; + size_t sblen; xfs_agnumber_t agno; uint32_t v2_ok; __be32 features_mask; @@ -388,8 +415,8 @@ xchk_superblock( } /* Everything else must be zero. */ - if (memchr_inv(sb + 1, 0, - BBTOB(bp->b_length) - sizeof(struct xfs_dsb))) + sblen = xchk_superblock_ondisk_size(mp); + if (memchr_inv((char *)sb + sblen, 0, BBTOB(bp->b_length) - sblen)) xchk_block_set_corrupt(sc, bp); xchk_superblock_xref(sc, bp); From 7f8b718c58783f3ff0810b39e2f62f50ba2549f6 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:57:43 -0800 Subject: [PATCH 233/330] xfs: return from xfs_symlink_verify early on V4 filesystems V4 symlink blocks didn't have headers, so return early if this is a V4 filesystem. Cc: # v5.1 Fixes: 39708c20ab5133 ("xfs: miscellaneous verifier magic value fixups") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_symlink_remote.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index f228127a88ff2..fb47a76ead18c 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -92,8 +92,10 @@ xfs_symlink_verify( struct xfs_mount *mp = bp->b_mount; struct xfs_dsymlink_hdr *dsl = bp->b_addr; + /* no verification of non-crc buffers */ if (!xfs_has_crc(mp)) - return __this_address; + return NULL; + if (!xfs_verify_magic(bp, dsl->sl_magic)) return __this_address; if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid)) From 12f2930f5f91bc0d67794c69d1961098c7c72040 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Dec 2024 10:58:11 -0800 Subject: [PATCH 234/330] xfs: port xfs_ioc_start_commit to multigrain timestamps Take advantage of the multigrain timestamp APIs to ensure that nobody can sneak in and write things to a file between starting a file update operation and committing the results. This should have been part of the multigrain timestamp merge, but I forgot to fling it at jlayton when he resubmitted the patchset due to developer bandwidth problems. Cc: # v6.13-rc1 Fixes: 4e40eff0b5737c ("fs: add infrastructure for multigrain timestamps") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Jeff Layton --- fs/xfs/xfs_exchrange.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/xfs/xfs_exchrange.c b/fs/xfs/xfs_exchrange.c index 9ab05ad224d12..265c424498933 100644 --- a/fs/xfs/xfs_exchrange.c +++ b/fs/xfs/xfs_exchrange.c @@ -854,7 +854,7 @@ xfs_ioc_start_commit( struct xfs_commit_range __user *argp) { struct xfs_commit_range args = { }; - struct timespec64 ts; + struct kstat kstat = { }; struct xfs_commit_range_fresh *kern_f; struct xfs_commit_range_fresh __user *user_f; struct inode *inode2 = file_inode(file); @@ -871,12 +871,12 @@ xfs_ioc_start_commit( memcpy(&kern_f->fsid, ip2->i_mount->m_fixedfsid, sizeof(xfs_fsid_t)); xfs_ilock(ip2, lockflags); - ts = inode_get_ctime(inode2); - kern_f->file2_ctime = ts.tv_sec; - kern_f->file2_ctime_nsec = ts.tv_nsec; - ts = inode_get_mtime(inode2); - kern_f->file2_mtime = ts.tv_sec; - kern_f->file2_mtime_nsec = ts.tv_nsec; + /* Force writing of a distinct ctime if any writes happen. */ + fill_mg_cmtime(&kstat, STATX_CTIME | STATX_MTIME, inode2); + kern_f->file2_ctime = kstat.ctime.tv_sec; + kern_f->file2_ctime_nsec = kstat.ctime.tv_nsec; + kern_f->file2_mtime = kstat.mtime.tv_sec; + kern_f->file2_mtime_nsec = kstat.mtime.tv_nsec; kern_f->file2_ino = ip2->i_ino; kern_f->file2_gen = inode2->i_generation; kern_f->magic = XCR_FRESH_MAGIC; From 145ac100b63732291c0612528444d7f5ab593fb2 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Wed, 11 Dec 2024 08:05:03 +0100 Subject: [PATCH 235/330] efi/esrt: remove esre_attribute::store() esre_attribute::store() is not needed since commit af97a77bc01c (efi: Move some sysfs files to be read-only by root). Drop it. Found by https://github.com/jirislaby/clang-struct. Signed-off-by: Jiri Slaby (SUSE) Cc: Ard Biesheuvel Cc: linux-efi@vger.kernel.org Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/esrt.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/firmware/efi/esrt.c b/drivers/firmware/efi/esrt.c index 7a81c0ce47805..4bb7b0584bc90 100644 --- a/drivers/firmware/efi/esrt.c +++ b/drivers/firmware/efi/esrt.c @@ -75,8 +75,6 @@ static LIST_HEAD(entry_list); struct esre_attribute { struct attribute attr; ssize_t (*show)(struct esre_entry *entry, char *buf); - ssize_t (*store)(struct esre_entry *entry, - const char *buf, size_t count); }; static struct esre_entry *to_entry(struct kobject *kobj) From f9244fb55f37356f75c739c57323d9422d7aa0f8 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 7 Nov 2024 16:17:00 +0100 Subject: [PATCH 236/330] xen/netfront: fix crash when removing device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When removing a netfront device directly after a suspend/resume cycle it might happen that the queues have not been setup again, causing a crash during the attempt to stop the queues another time. Fix that by checking the queues are existing before trying to stop them. This is XSA-465 / CVE-2024-53240. Reported-by: Marek Marczykowski-Górecki Fixes: d50b7914fae0 ("xen-netfront: Fix NULL sring after live migration") Signed-off-by: Juergen Gross --- drivers/net/xen-netfront.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index 4265c1cd0ff71..63fe51d0e64db 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -867,7 +867,7 @@ static netdev_tx_t xennet_start_xmit(struct sk_buff *skb, struct net_device *dev static int xennet_close(struct net_device *dev) { struct netfront_info *np = netdev_priv(dev); - unsigned int num_queues = dev->real_num_tx_queues; + unsigned int num_queues = np->queues ? dev->real_num_tx_queues : 0; unsigned int i; struct netfront_queue *queue; netif_tx_stop_all_queues(np->netdev); @@ -882,6 +882,9 @@ static void xennet_destroy_queues(struct netfront_info *info) { unsigned int i; + if (!info->queues) + return; + for (i = 0; i < info->netdev->real_num_tx_queues; i++) { struct netfront_queue *queue = &info->queues[i]; From efbcd61d9bebb771c836a3b8bfced8165633db7c Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 17 Oct 2024 08:29:48 +0200 Subject: [PATCH 237/330] x86: make get_cpu_vendor() accessible from Xen code In order to be able to differentiate between AMD and Intel based systems for very early hypercalls without having to rely on the Xen hypercall page, make get_cpu_vendor() non-static. Refactor early_cpu_init() for the same reason by splitting out the loop initializing cpu_devs() into an externally callable function. This is part of XSA-466 / CVE-2024-53241. Reported-by: Andrew Cooper Signed-off-by: Juergen Gross --- arch/x86/include/asm/processor.h | 2 ++ arch/x86/kernel/cpu/common.c | 38 ++++++++++++++++++-------------- 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index c0975815980c8..20e6009381ed6 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -230,6 +230,8 @@ static inline unsigned long long l1tf_pfn_limit(void) return BIT_ULL(boot_cpu_data.x86_cache_bits - 1 - PAGE_SHIFT); } +void init_cpu_devs(void); +void get_cpu_vendor(struct cpuinfo_x86 *c); extern void early_cpu_init(void); extern void identify_secondary_cpu(struct cpuinfo_x86 *); extern void print_cpu_info(struct cpuinfo_x86 *); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index a5c28975c608e..3e9037690814b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -867,7 +867,7 @@ static void cpu_detect_tlb(struct cpuinfo_x86 *c) tlb_lld_4m[ENTRIES], tlb_lld_1g[ENTRIES]); } -static void get_cpu_vendor(struct cpuinfo_x86 *c) +void get_cpu_vendor(struct cpuinfo_x86 *c) { char *v = c->x86_vendor_id; int i; @@ -1649,15 +1649,11 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) detect_nopl(); } -void __init early_cpu_init(void) +void __init init_cpu_devs(void) { const struct cpu_dev *const *cdev; int count = 0; -#ifdef CONFIG_PROCESSOR_SELECT - pr_info("KERNEL supported cpus:\n"); -#endif - for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) { const struct cpu_dev *cpudev = *cdev; @@ -1665,20 +1661,30 @@ void __init early_cpu_init(void) break; cpu_devs[count] = cpudev; count++; + } +} +void __init early_cpu_init(void) +{ #ifdef CONFIG_PROCESSOR_SELECT - { - unsigned int j; - - for (j = 0; j < 2; j++) { - if (!cpudev->c_ident[j]) - continue; - pr_info(" %s %s\n", cpudev->c_vendor, - cpudev->c_ident[j]); - } - } + unsigned int i, j; + + pr_info("KERNEL supported cpus:\n"); #endif + + init_cpu_devs(); + +#ifdef CONFIG_PROCESSOR_SELECT + for (i = 0; i < X86_VENDOR_NUM && cpu_devs[i]; i++) { + for (j = 0; j < 2; j++) { + if (!cpu_devs[i]->c_ident[j]) + continue; + pr_info(" %s %s\n", cpu_devs[i]->c_vendor, + cpu_devs[i]->c_ident[j]); + } } +#endif + early_identify_cpu(&boot_cpu_data); } From dda014ba59331dee4f3b773a020e109932f4bd24 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 29 Nov 2024 15:47:49 +0100 Subject: [PATCH 238/330] objtool/x86: allow syscall instruction The syscall instruction is used in Xen PV mode for doing hypercalls. Allow syscall to be used in the kernel in case it is tagged with an unwind hint for objtool. This is part of XSA-466 / CVE-2024-53241. Reported-by: Andrew Cooper Signed-off-by: Juergen Gross Co-developed-by: Peter Zijlstra --- tools/objtool/check.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 4ce176ad411fb..76060da755b5c 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -3820,9 +3820,12 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, break; case INSN_CONTEXT_SWITCH: - if (func && (!next_insn || !next_insn->hint)) { - WARN_INSN(insn, "unsupported instruction in callable function"); - return 1; + if (func) { + if (!next_insn || !next_insn->hint) { + WARN_INSN(insn, "unsupported instruction in callable function"); + return 1; + } + break; } return 0; From 0ef8047b737d7480a5d4c46d956e97c190f13050 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 29 Nov 2024 16:15:54 +0100 Subject: [PATCH 239/330] x86/static-call: provide a way to do very early static-call updates Add static_call_update_early() for updating static-call targets in very early boot. This will be needed for support of Xen guest type specific hypercall functions. This is part of XSA-466 / CVE-2024-53241. Reported-by: Andrew Cooper Signed-off-by: Juergen Gross Co-developed-by: Peter Zijlstra Co-developed-by: Josh Poimboeuf --- arch/x86/include/asm/static_call.h | 15 ++++++++++++ arch/x86/include/asm/sync_core.h | 6 ++--- arch/x86/kernel/static_call.c | 9 ++++++++ include/linux/compiler.h | 37 +++++++++++++++++++++--------- include/linux/static_call.h | 1 + kernel/static_call_inline.c | 2 +- 6 files changed, 55 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/static_call.h b/arch/x86/include/asm/static_call.h index 125c407e2abe6..41502bd2afd64 100644 --- a/arch/x86/include/asm/static_call.h +++ b/arch/x86/include/asm/static_call.h @@ -65,4 +65,19 @@ extern bool __static_call_fixup(void *tramp, u8 op, void *dest); +extern void __static_call_update_early(void *tramp, void *func); + +#define static_call_update_early(name, _func) \ +({ \ + typeof(&STATIC_CALL_TRAMP(name)) __F = (_func); \ + if (static_call_initialized) { \ + __static_call_update(&STATIC_CALL_KEY(name), \ + STATIC_CALL_TRAMP_ADDR(name), __F);\ + } else { \ + WRITE_ONCE(STATIC_CALL_KEY(name).func, _func); \ + __static_call_update_early(STATIC_CALL_TRAMP_ADDR(name),\ + __F); \ + } \ +}) + #endif /* _ASM_STATIC_CALL_H */ diff --git a/arch/x86/include/asm/sync_core.h b/arch/x86/include/asm/sync_core.h index ab7382f92aff2..96bda43538ee7 100644 --- a/arch/x86/include/asm/sync_core.h +++ b/arch/x86/include/asm/sync_core.h @@ -8,7 +8,7 @@ #include #ifdef CONFIG_X86_32 -static inline void iret_to_self(void) +static __always_inline void iret_to_self(void) { asm volatile ( "pushfl\n\t" @@ -19,7 +19,7 @@ static inline void iret_to_self(void) : ASM_CALL_CONSTRAINT : : "memory"); } #else -static inline void iret_to_self(void) +static __always_inline void iret_to_self(void) { unsigned int tmp; @@ -55,7 +55,7 @@ static inline void iret_to_self(void) * Like all of Linux's memory ordering operations, this is a * compiler barrier as well. */ -static inline void sync_core(void) +static __always_inline void sync_core(void) { /* * The SERIALIZE instruction is the most straightforward way to diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c index 4eefaac64c6cb..9eed0c144dad5 100644 --- a/arch/x86/kernel/static_call.c +++ b/arch/x86/kernel/static_call.c @@ -172,6 +172,15 @@ void arch_static_call_transform(void *site, void *tramp, void *func, bool tail) } EXPORT_SYMBOL_GPL(arch_static_call_transform); +noinstr void __static_call_update_early(void *tramp, void *func) +{ + BUG_ON(system_state != SYSTEM_BOOTING); + BUG_ON(!early_boot_irqs_disabled); + BUG_ON(static_call_initialized); + __text_gen_insn(tramp, JMP32_INSN_OPCODE, tramp, func, JMP32_INSN_SIZE); + sync_core(); +} + #ifdef CONFIG_MITIGATION_RETHUNK /* * This is called by apply_returns() to fix up static call trampolines, diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 469a64dd6495f..240c632c5b957 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -216,28 +216,43 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, #endif /* __KERNEL__ */ +/** + * offset_to_ptr - convert a relative memory offset to an absolute pointer + * @off: the address of the 32-bit offset value + */ +static inline void *offset_to_ptr(const int *off) +{ + return (void *)((unsigned long)off + *off); +} + +#endif /* __ASSEMBLY__ */ + +#ifdef CONFIG_64BIT +#define ARCH_SEL(a,b) a +#else +#define ARCH_SEL(a,b) b +#endif + /* * Force the compiler to emit 'sym' as a symbol, so that we can reference * it from inline assembler. Necessary in case 'sym' could be inlined * otherwise, or eliminated entirely due to lack of references that are * visible to the compiler. */ -#define ___ADDRESSABLE(sym, __attrs) \ - static void * __used __attrs \ +#define ___ADDRESSABLE(sym, __attrs) \ + static void * __used __attrs \ __UNIQUE_ID(__PASTE(__addressable_,sym)) = (void *)(uintptr_t)&sym; + #define __ADDRESSABLE(sym) \ ___ADDRESSABLE(sym, __section(".discard.addressable")) -/** - * offset_to_ptr - convert a relative memory offset to an absolute pointer - * @off: the address of the 32-bit offset value - */ -static inline void *offset_to_ptr(const int *off) -{ - return (void *)((unsigned long)off + *off); -} +#define __ADDRESSABLE_ASM(sym) \ + .pushsection .discard.addressable,"aw"; \ + .align ARCH_SEL(8,4); \ + ARCH_SEL(.quad, .long) __stringify(sym); \ + .popsection; -#endif /* __ASSEMBLY__ */ +#define __ADDRESSABLE_ASM_STR(sym) __stringify(__ADDRESSABLE_ASM(sym)) #ifdef __CHECKER__ #define __BUILD_BUG_ON_ZERO_MSG(e, msg) (0) diff --git a/include/linux/static_call.h b/include/linux/static_call.h index 141e6b176a1b3..785980af89729 100644 --- a/include/linux/static_call.h +++ b/include/linux/static_call.h @@ -138,6 +138,7 @@ #ifdef CONFIG_HAVE_STATIC_CALL #include +extern int static_call_initialized; /* * Either @site or @tramp can be NULL. */ diff --git a/kernel/static_call_inline.c b/kernel/static_call_inline.c index 5259cda486d05..bb7d066a7c397 100644 --- a/kernel/static_call_inline.c +++ b/kernel/static_call_inline.c @@ -15,7 +15,7 @@ extern struct static_call_site __start_static_call_sites[], extern struct static_call_tramp_key __start_static_call_tramp_key[], __stop_static_call_tramp_key[]; -static int static_call_initialized; +int static_call_initialized; /* * Must be called before early_initcall() to be effective. From a2796dff62d6c6bfc5fbebdf2bee0d5ac0438906 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 16 Oct 2024 10:40:26 +0200 Subject: [PATCH 240/330] x86/xen: don't do PV iret hypercall through hypercall page Instead of jumping to the Xen hypercall page for doing the iret hypercall, directly code the required sequence in xen-asm.S. This is done in preparation of no longer using hypercall page at all, as it has shown to cause problems with speculation mitigations. This is part of XSA-466 / CVE-2024-53241. Reported-by: Andrew Cooper Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich --- arch/x86/xen/xen-asm.S | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index 83189cf5cdce9..ca6edfe4c14b1 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -176,7 +176,6 @@ SYM_CODE_START(xen_early_idt_handler_array) SYM_CODE_END(xen_early_idt_handler_array) __FINIT -hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32 /* * Xen64 iret frame: * @@ -186,17 +185,28 @@ hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32 * cs * rip <-- standard iret frame * - * flags + * flags <-- xen_iret must push from here on * - * rcx } - * r11 }<-- pushed by hypercall page - * rsp->rax } + * rcx + * r11 + * rsp->rax */ +.macro xen_hypercall_iret + pushq $0 /* Flags */ + push %rcx + push %r11 + push %rax + mov $__HYPERVISOR_iret, %eax + syscall /* Do the IRET. */ +#ifdef CONFIG_MITIGATION_SLS + int3 +#endif +.endm + SYM_CODE_START(xen_iret) UNWIND_HINT_UNDEFINED ANNOTATE_NOENDBR - pushq $0 - jmp hypercall_iret + xen_hypercall_iret SYM_CODE_END(xen_iret) /* @@ -301,8 +311,7 @@ SYM_CODE_START(xen_entry_SYSENTER_compat) ENDBR lea 16(%rsp), %rsp /* strip %rcx, %r11 */ mov $-ENOSYS, %rax - pushq $0 - jmp hypercall_iret + xen_hypercall_iret SYM_CODE_END(xen_entry_SYSENTER_compat) SYM_CODE_END(xen_entry_SYSCALL_compat) From b53127db1dbf7f1047cf35c10922d801dcd40324 Mon Sep 17 00:00:00 2001 From: "Vineeth Pillai (Google)" Date: Thu, 12 Dec 2024 22:22:36 -0500 Subject: [PATCH 241/330] sched/dlserver: Fix dlserver double enqueue dlserver can get dequeued during a dlserver pick_task due to the delayed deueue feature and this can lead to issues with dlserver logic as it still thinks that dlserver is on the runqueue. The dlserver throttling and replenish logic gets confused and can lead to double enqueue of dlserver. Double enqueue of dlserver could happend due to couple of reasons: Case 1 ------ Delayed dequeue feature[1] can cause dlserver being stopped during a pick initiated by dlserver: __pick_next_task pick_task_dl -> server_pick_task pick_task_fair pick_next_entity (if (sched_delayed)) dequeue_entities dl_server_stop server_pick_task goes ahead with update_curr_dl_se without knowing that dlserver is dequeued and this confuses the logic and may lead to unintended enqueue while the server is stopped. Case 2 ------ A race condition between a task dequeue on one cpu and same task's enqueue on this cpu by a remote cpu while the lock is released causing dlserver double enqueue. One cpu would be in the schedule() and releasing RQ-lock: current->state = TASK_INTERRUPTIBLE(); schedule(); deactivate_task() dl_stop_server(); pick_next_task() pick_next_task_fair() sched_balance_newidle() rq_unlock(this_rq) at which point another CPU can take our RQ-lock and do: try_to_wake_up() ttwu_queue() rq_lock() ... activate_task() dl_server_start() --> first enqueue wakeup_preempt() := check_preempt_wakeup_fair() update_curr() update_curr_task() if (current->dl_server) dl_server_update() enqueue_dl_entity() --> second enqueue This bug was not apparent as the enqueue in dl_server_start doesn't usually happen because of the defer logic. But as a side effect of the first case(dequeue during dlserver pick), dl_throttled and dl_yield will be set and this causes the time accounting of dlserver to messup and then leading to a enqueue in dl_server_start. Have an explicit flag representing the status of dlserver to avoid the confusion. This is set in dl_server_start and reset in dlserver_stop. Fixes: 63ba8422f876 ("sched/deadline: Introduce deadline servers") Suggested-by: Peter Zijlstra Signed-off-by: "Vineeth Pillai (Google)" Signed-off-by: Peter Zijlstra (Intel) Tested-by: Marcel Ziswiler # ROCK 5B Link: https://lkml.kernel.org/r/20241213032244.877029-1-vineeth@bitbyteword.org --- include/linux/sched.h | 7 +++++++ kernel/sched/deadline.c | 8 ++++++-- kernel/sched/sched.h | 5 +++++ 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index d380bffee2eff..66b311fbd5d60 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -656,6 +656,12 @@ struct sched_dl_entity { * @dl_defer_armed tells if the deferrable server is waiting * for the replenishment timer to activate it. * + * @dl_server_active tells if the dlserver is active(started). + * dlserver is started on first cfs enqueue on an idle runqueue + * and is stopped when a dequeue results in 0 cfs tasks on the + * runqueue. In other words, dlserver is active only when cpu's + * runqueue has atleast one cfs task. + * * @dl_defer_running tells if the deferrable server is actually * running, skipping the defer phase. */ @@ -664,6 +670,7 @@ struct sched_dl_entity { unsigned int dl_non_contending : 1; unsigned int dl_overrun : 1; unsigned int dl_server : 1; + unsigned int dl_server_active : 1; unsigned int dl_defer : 1; unsigned int dl_defer_armed : 1; unsigned int dl_defer_running : 1; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index db47f33cb7d2f..d94f2ed6d1f46 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1647,6 +1647,7 @@ void dl_server_start(struct sched_dl_entity *dl_se) if (!dl_se->dl_runtime) return; + dl_se->dl_server_active = 1; enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP); if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &rq->curr->dl)) resched_curr(dl_se->rq); @@ -1661,6 +1662,7 @@ void dl_server_stop(struct sched_dl_entity *dl_se) hrtimer_try_to_cancel(&dl_se->dl_timer); dl_se->dl_defer_armed = 0; dl_se->dl_throttled = 0; + dl_se->dl_server_active = 0; } void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, @@ -2421,8 +2423,10 @@ static struct task_struct *__pick_task_dl(struct rq *rq) if (dl_server(dl_se)) { p = dl_se->server_pick_task(dl_se); if (!p) { - dl_se->dl_yielded = 1; - update_curr_dl_se(rq, dl_se, 0); + if (dl_server_active(dl_se)) { + dl_se->dl_yielded = 1; + update_curr_dl_se(rq, dl_se, 0); + } goto again; } rq->dl_server = dl_se; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1e494af2cd23d..c5d67a43fe524 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -398,6 +398,11 @@ extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq extern int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 period, bool init); +static inline bool dl_server_active(struct sched_dl_entity *dl_se) +{ + return dl_se->dl_server_active; +} + #ifdef CONFIG_CGROUP_SCHED extern struct list_head task_groups; From c7f7e9c73178e0e342486fd31e7f363ef60e3f83 Mon Sep 17 00:00:00 2001 From: "Vineeth Pillai (Google)" Date: Thu, 12 Dec 2024 22:22:37 -0500 Subject: [PATCH 242/330] sched/dlserver: Fix dlserver time accounting dlserver time is accounted when: - dlserver is active and the dlserver proxies the cfs task. - dlserver is active but deferred and cfs task runs after being picked through the normal fair class pick. dl_server_update is called in two places to make sure that both the above times are accounted for. But it doesn't check if dlserver is active or not. Now that we have this dl_server_active flag, we can consolidate dl_server_update into one place and all we need to check is whether dlserver is active or not. When dlserver is active there is only two possible conditions: - dlserver is deferred. - cfs task is running on behalf of dlserver. Fixes: a110a81c52a9 ("sched/deadline: Deferrable dl server") Signed-off-by: "Vineeth Pillai (Google)" Signed-off-by: Peter Zijlstra (Intel) Tested-by: Marcel Ziswiler # ROCK 5B Link: https://lore.kernel.org/r/20241213032244.877029-2-vineeth@bitbyteword.org --- kernel/sched/fair.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 97ee48c8bf5ed..53a4f78b83407 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1159,8 +1159,6 @@ static inline void update_curr_task(struct task_struct *p, s64 delta_exec) trace_sched_stat_runtime(p, delta_exec); account_group_exec_runtime(p, delta_exec); cgroup_account_cputime(p, delta_exec); - if (p->dl_server) - dl_server_update(p->dl_server, delta_exec); } static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *curr) @@ -1237,11 +1235,16 @@ static void update_curr(struct cfs_rq *cfs_rq) update_curr_task(p, delta_exec); /* - * Any fair task that runs outside of fair_server should - * account against fair_server such that it can account for - * this time and possibly avoid running this period. + * If the fair_server is active, we need to account for the + * fair_server time whether or not the task is running on + * behalf of fair_server or not: + * - If the task is running on behalf of fair_server, we need + * to limit its time based on the assigned runtime. + * - Fair task that runs outside of fair_server should account + * against fair_server such that it can account for this time + * and possibly avoid running this period. */ - if (p->dl_server != &rq->fair_server) + if (dl_server_active(&rq->fair_server)) dl_server_update(&rq->fair_server, delta_exec); } From a3b4647e2f9ae8e8c6829ce637945b3c07a727ad Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Tue, 10 Dec 2024 16:09:40 +0000 Subject: [PATCH 243/330] arm64: signal: Ensure signal delivery failure is recoverable Commit eaf62ce1563b ("arm64/signal: Set up and restore the GCS context for signal handlers") introduced a potential failure point at the end of setup_return(). This is unfortunate as it is too late to deliver a SIGSEGV: if that SIGSEGV is handled, the subsequent sigreturn will end up returning to the original handler, which is not the intention (since we failed to deliver that signal). Make sure this does not happen by calling gcs_signal_entry() at the very beginning of setup_return(), and add a comment just after to discourage error cases being introduced from that point onwards. While at it, also take care of copy_siginfo_to_user(): since it may fail, we shouldn't be calling it after setup_return() either. Call it before setup_return() instead, and move the setting of X1/X2 inside setup_return() where it belongs (after the "point of no failure"). Background: the first part of setup_rt_frame(), including setup_sigframe(), has no impact on the execution of the interrupted thread. The signal frame is written to the stack, but the stack pointer remains unchanged. Failure at this stage can be recovered by a SIGSEGV handler, and sigreturn will restore the original context, at the point where the original signal occurred. On the other hand, once setup_return() has updated registers including SP, the thread's control flow has been modified and we must deliver the original signal. Fixes: eaf62ce1563b ("arm64/signal: Set up and restore the GCS context for signal handlers") Signed-off-by: Kevin Brodsky Reviewed-by: Dave Martin Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20241210160940.2031997-1-kevin.brodsky@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/signal.c | 48 ++++++++++++++++++++++++++------------ 1 file changed, 33 insertions(+), 15 deletions(-) diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 14ac6fdb872b9..37e24f1bd2279 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -1462,10 +1462,33 @@ static int setup_return(struct pt_regs *regs, struct ksignal *ksig, struct rt_sigframe_user_layout *user, int usig) { __sigrestore_t sigtramp; + int err; + + if (ksig->ka.sa.sa_flags & SA_RESTORER) + sigtramp = ksig->ka.sa.sa_restorer; + else + sigtramp = VDSO_SYMBOL(current->mm->context.vdso, sigtramp); + + err = gcs_signal_entry(sigtramp, ksig); + if (err) + return err; + + /* + * We must not fail from this point onwards. We are going to update + * registers, including SP, in order to invoke the signal handler. If + * we failed and attempted to deliver a nested SIGSEGV to a handler + * after that point, the subsequent sigreturn would end up restoring + * the (partial) state for the original signal handler. + */ regs->regs[0] = usig; + if (ksig->ka.sa.sa_flags & SA_SIGINFO) { + regs->regs[1] = (unsigned long)&user->sigframe->info; + regs->regs[2] = (unsigned long)&user->sigframe->uc; + } regs->sp = (unsigned long)user->sigframe; regs->regs[29] = (unsigned long)&user->next_frame->fp; + regs->regs[30] = (unsigned long)sigtramp; regs->pc = (unsigned long)ksig->ka.sa.sa_handler; /* @@ -1506,14 +1529,7 @@ static int setup_return(struct pt_regs *regs, struct ksignal *ksig, sme_smstop(); } - if (ksig->ka.sa.sa_flags & SA_RESTORER) - sigtramp = ksig->ka.sa.sa_restorer; - else - sigtramp = VDSO_SYMBOL(current->mm->context.vdso, sigtramp); - - regs->regs[30] = (unsigned long)sigtramp; - - return gcs_signal_entry(sigtramp, ksig); + return 0; } static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set, @@ -1537,14 +1553,16 @@ static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set, err |= __save_altstack(&frame->uc.uc_stack, regs->sp); err |= setup_sigframe(&user, regs, set, &ua_state); - if (err == 0) { + if (ksig->ka.sa.sa_flags & SA_SIGINFO) + err |= copy_siginfo_to_user(&frame->info, &ksig->info); + + if (err == 0) err = setup_return(regs, ksig, &user, usig); - if (ksig->ka.sa.sa_flags & SA_SIGINFO) { - err |= copy_siginfo_to_user(&frame->info, &ksig->info); - regs->regs[1] = (unsigned long)&frame->info; - regs->regs[2] = (unsigned long)&frame->uc; - } - } + + /* + * We must not fail if setup_return() succeeded - see comment at the + * beginning of setup_return(). + */ if (err == 0) set_handler_user_access_state(); From 1f2557e08a617a4b5e92a48a1a9a6f86621def18 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Fri, 13 Dec 2024 09:17:50 +0800 Subject: [PATCH 244/330] iommu/vt-d: Remove cache tags before disabling ATS The current implementation removes cache tags after disabling ATS, leading to potential memory leaks and kernel crashes. Specifically, CACHE_TAG_DEVTLB type cache tags may still remain in the list even after the domain is freed, causing a use-after-free condition. This issue really shows up when multiple VFs from different PFs passed through to a single user-space process via vfio-pci. In such cases, the kernel may crash with kernel messages like: BUG: kernel NULL pointer dereference, address: 0000000000000014 PGD 19036a067 P4D 1940a3067 PUD 136c9b067 PMD 0 Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 74 UID: 0 PID: 3183 Comm: testCli Not tainted 6.11.9 #2 RIP: 0010:cache_tag_flush_range+0x9b/0x250 Call Trace: ? __die+0x1f/0x60 ? page_fault_oops+0x163/0x590 ? exc_page_fault+0x72/0x190 ? asm_exc_page_fault+0x22/0x30 ? cache_tag_flush_range+0x9b/0x250 ? cache_tag_flush_range+0x5d/0x250 intel_iommu_tlb_sync+0x29/0x40 intel_iommu_unmap_pages+0xfe/0x160 __iommu_unmap+0xd8/0x1a0 vfio_unmap_unpin+0x182/0x340 [vfio_iommu_type1] vfio_remove_dma+0x2a/0xb0 [vfio_iommu_type1] vfio_iommu_type1_ioctl+0xafa/0x18e0 [vfio_iommu_type1] Move cache_tag_unassign_domain() before iommu_disable_pci_caps() to fix it. Fixes: 3b1d9e2b2d68 ("iommu/vt-d: Add cache tag assignment interface") Cc: stable@vger.kernel.org Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20241129020506.576413-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 7d0acb74d5a54..79e0da9eb626c 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3220,6 +3220,9 @@ void device_block_translation(struct device *dev) struct intel_iommu *iommu = info->iommu; unsigned long flags; + if (info->domain) + cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID); + iommu_disable_pci_caps(info); if (!dev_is_real_dma_subdevice(dev)) { if (sm_supported(iommu)) @@ -3236,7 +3239,6 @@ void device_block_translation(struct device *dev) list_del(&info->link); spin_unlock_irqrestore(&info->domain->lock, flags); - cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID); domain_detach_iommu(info->domain, iommu); info->domain = NULL; } From 74536f91962d5f6af0a42414773ce61e653c10ee Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 13 Dec 2024 09:17:51 +0800 Subject: [PATCH 245/330] iommu/vt-d: Fix qi_batch NULL pointer with nested parent domain The qi_batch is allocated when assigning cache tag for a domain. While for nested parent domain, it is missed. Hence, when trying to map pages to the nested parent, NULL dereference occurred. Also, there is potential memleak since there is no lock around domain->qi_batch allocation. To solve it, add a helper for qi_batch allocation, and call it in both the __cache_tag_assign_domain() and __cache_tag_assign_parent_domain(). BUG: kernel NULL pointer dereference, address: 0000000000000200 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 8104795067 P4D 0 Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 223 UID: 0 PID: 4357 Comm: qemu-system-x86 Not tainted 6.13.0-rc1-00028-g4b50c3c3b998-dirty #2632 Call Trace: ? __die+0x24/0x70 ? page_fault_oops+0x80/0x150 ? do_user_addr_fault+0x63/0x7b0 ? exc_page_fault+0x7c/0x220 ? asm_exc_page_fault+0x26/0x30 ? cache_tag_flush_range_np+0x13c/0x260 intel_iommu_iotlb_sync_map+0x1a/0x30 iommu_map+0x61/0xf0 batch_to_domain+0x188/0x250 iopt_area_fill_domains+0x125/0x320 ? rcu_is_watching+0x11/0x50 iopt_map_pages+0x63/0x100 iopt_map_common.isra.0+0xa7/0x190 iopt_map_user_pages+0x6a/0x80 iommufd_ioas_map+0xcd/0x1d0 iommufd_fops_ioctl+0x118/0x1c0 __x64_sys_ioctl+0x93/0xc0 do_syscall_64+0x71/0x140 entry_SYSCALL_64_after_hwframe+0x76/0x7e Fixes: 705c1cdf1e73 ("iommu/vt-d: Introduce batched cache invalidation") Cc: stable@vger.kernel.org Co-developed-by: Lu Baolu Signed-off-by: Lu Baolu Signed-off-by: Yi Liu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20241210130322.17175-1-yi.l.liu@intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/cache.c | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/intel/cache.c b/drivers/iommu/intel/cache.c index e5b89f728ad3b..09694cca8752d 100644 --- a/drivers/iommu/intel/cache.c +++ b/drivers/iommu/intel/cache.c @@ -105,12 +105,35 @@ static void cache_tag_unassign(struct dmar_domain *domain, u16 did, spin_unlock_irqrestore(&domain->cache_lock, flags); } +/* domain->qi_batch will be freed in iommu_free_domain() path. */ +static int domain_qi_batch_alloc(struct dmar_domain *domain) +{ + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&domain->cache_lock, flags); + if (domain->qi_batch) + goto out_unlock; + + domain->qi_batch = kzalloc(sizeof(*domain->qi_batch), GFP_ATOMIC); + if (!domain->qi_batch) + ret = -ENOMEM; +out_unlock: + spin_unlock_irqrestore(&domain->cache_lock, flags); + + return ret; +} + static int __cache_tag_assign_domain(struct dmar_domain *domain, u16 did, struct device *dev, ioasid_t pasid) { struct device_domain_info *info = dev_iommu_priv_get(dev); int ret; + ret = domain_qi_batch_alloc(domain); + if (ret) + return ret; + ret = cache_tag_assign(domain, did, dev, pasid, CACHE_TAG_IOTLB); if (ret || !info->ats_enabled) return ret; @@ -139,6 +162,10 @@ static int __cache_tag_assign_parent_domain(struct dmar_domain *domain, u16 did, struct device_domain_info *info = dev_iommu_priv_get(dev); int ret; + ret = domain_qi_batch_alloc(domain); + if (ret) + return ret; + ret = cache_tag_assign(domain, did, dev, pasid, CACHE_TAG_NESTING_IOTLB); if (ret || !info->ats_enabled) return ret; @@ -190,13 +217,6 @@ int cache_tag_assign_domain(struct dmar_domain *domain, u16 did = domain_get_id_for_dev(domain, dev); int ret; - /* domain->qi_bach will be freed in iommu_free_domain() path. */ - if (!domain->qi_batch) { - domain->qi_batch = kzalloc(sizeof(*domain->qi_batch), GFP_KERNEL); - if (!domain->qi_batch) - return -ENOMEM; - } - ret = __cache_tag_assign_domain(domain, did, dev, pasid); if (ret || domain->domain.type != IOMMU_DOMAIN_NESTED) return ret; From dda2b8c3c6ccc50deae65cc75f246577348e2ec5 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Fri, 13 Dec 2024 09:17:52 +0800 Subject: [PATCH 246/330] iommu/vt-d: Avoid draining PRQ in sva mm release path When a PASID is used for SVA by a device, it's possible that the PASID entry is cleared before the device flushes all ongoing DMA requests and removes the SVA domain. This can occur when an exception happens and the process terminates before the device driver stops DMA and calls the iommu driver to unbind the PASID. There's no need to drain the PRQ in the mm release path. Instead, the PRQ will be drained in the SVA unbind path. Unfortunately, commit c43e1ccdebf2 ("iommu/vt-d: Drain PRQs when domain removed from RID") changed this behavior by unconditionally draining the PRQ in intel_pasid_tear_down_entry(). This can lead to a potential sleeping-in-atomic-context issue. Smatch static checker warning: drivers/iommu/intel/prq.c:95 intel_iommu_drain_pasid_prq() warn: sleeping in atomic context To avoid this issue, prevent draining the PRQ in the SVA mm release path and restore the previous behavior. Fixes: c43e1ccdebf2 ("iommu/vt-d: Drain PRQs when domain removed from RID") Reported-by: Dan Carpenter Closes: https://lore.kernel.org/linux-iommu/c5187676-2fa2-4e29-94e0-4a279dc88b49@stanley.mountain/ Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20241212021529.1104745-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/pasid.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 0f2a926d3bd59..5b7d85f1e143c 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -265,7 +265,8 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); devtlb_invalidation_with_pasid(iommu, dev, pasid); - intel_iommu_drain_pasid_prq(dev, pasid); + if (!fault_ignore) + intel_iommu_drain_pasid_prq(dev, pasid); } /* From ce03573a1917532da06057da9f8e74a2ee9e2ac9 Mon Sep 17 00:00:00 2001 From: Weizhao Ouyang Date: Wed, 11 Dec 2024 19:16:39 +0800 Subject: [PATCH 247/330] kselftest/arm64: abi: fix SVCR detection When using svcr_in to check ZA and Streaming Mode, we should make sure that the value in x2 is correct, otherwise it may trigger an Illegal instruction if FEAT_SVE and !FEAT_SME. Fixes: 43e3f85523e4 ("kselftest/arm64: Add SME support to syscall ABI test") Signed-off-by: Weizhao Ouyang Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20241211111639.12344-1-o451686892@gmail.com Signed-off-by: Catalin Marinas --- .../selftests/arm64/abi/syscall-abi-asm.S | 32 +++++++++---------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/tools/testing/selftests/arm64/abi/syscall-abi-asm.S b/tools/testing/selftests/arm64/abi/syscall-abi-asm.S index df3230fdac395..66ab2e0bae5fd 100644 --- a/tools/testing/selftests/arm64/abi/syscall-abi-asm.S +++ b/tools/testing/selftests/arm64/abi/syscall-abi-asm.S @@ -81,32 +81,31 @@ do_syscall: stp x27, x28, [sp, #96] // Set SVCR if we're doing SME - cbz x1, 1f + cbz x1, load_gpr adrp x2, svcr_in ldr x2, [x2, :lo12:svcr_in] msr S3_3_C4_C2_2, x2 -1: // Load ZA and ZT0 if enabled - uses x12 as scratch due to SME LDR - tbz x2, #SVCR_ZA_SHIFT, 1f + tbz x2, #SVCR_ZA_SHIFT, load_gpr mov w12, #0 ldr x2, =za_in -2: _ldr_za 12, 2 +1: _ldr_za 12, 2 add x2, x2, x1 add x12, x12, #1 cmp x1, x12 - bne 2b + bne 1b // ZT0 mrs x2, S3_0_C0_C4_5 // ID_AA64SMFR0_EL1 ubfx x2, x2, #ID_AA64SMFR0_EL1_SMEver_SHIFT, \ #ID_AA64SMFR0_EL1_SMEver_WIDTH - cbz x2, 1f + cbz x2, load_gpr adrp x2, zt_in add x2, x2, :lo12:zt_in _ldr_zt 2 -1: +load_gpr: // Load GPRs x8-x28, and save our SP/FP for later comparison ldr x2, =gpr_in add x2, x2, #64 @@ -125,9 +124,9 @@ do_syscall: str x30, [x2], #8 // LR // Load FPRs if we're not doing neither SVE nor streaming SVE - cbnz x0, 1f + cbnz x0, check_sve_in ldr x2, =svcr_in - tbnz x2, #SVCR_SM_SHIFT, 1f + tbnz x2, #SVCR_SM_SHIFT, check_sve_in ldr x2, =fpr_in ldp q0, q1, [x2] @@ -148,8 +147,8 @@ do_syscall: ldp q30, q31, [x2, #16 * 30] b 2f -1: +check_sve_in: // Load the SVE registers if we're doing SVE/SME ldr x2, =z_in @@ -256,32 +255,31 @@ do_syscall: stp q30, q31, [x2, #16 * 30] // Save SVCR if we're doing SME - cbz x1, 1f + cbz x1, check_sve_out mrs x2, S3_3_C4_C2_2 adrp x3, svcr_out str x2, [x3, :lo12:svcr_out] -1: // Save ZA if it's enabled - uses x12 as scratch due to SME STR - tbz x2, #SVCR_ZA_SHIFT, 1f + tbz x2, #SVCR_ZA_SHIFT, check_sve_out mov w12, #0 ldr x2, =za_out -2: _str_za 12, 2 +1: _str_za 12, 2 add x2, x2, x1 add x12, x12, #1 cmp x1, x12 - bne 2b + bne 1b // ZT0 mrs x2, S3_0_C0_C4_5 // ID_AA64SMFR0_EL1 ubfx x2, x2, #ID_AA64SMFR0_EL1_SMEver_SHIFT, \ #ID_AA64SMFR0_EL1_SMEver_WIDTH - cbz x2, 1f + cbz x2, check_sve_out adrp x2, zt_out add x2, x2, :lo12:zt_out _str_zt 2 -1: +check_sve_out: // Save the SVE state if we have some cbz x0, 1f From e01424fab35d77439956ea98c915c639fbf16db0 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 12 Dec 2024 13:29:39 -0800 Subject: [PATCH 248/330] mq-deadline: Remove a local variable Since commit fde02699c242 ("block: mq-deadline: Remove support for zone write locking"), the local variable 'insert_before' is assigned once and is used once. Hence remove this local variable. Reviewed-by: Damien Le Moal Cc: Christoph Hellwig Signed-off-by: Bart Van Assche Reviewed-by: Christoph Hellwig Reviewed-by: Nitesh Shetty Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20241212212941.1268662-2-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/mq-deadline.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 91b3789f710e7..5528347b5fcfc 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -698,8 +698,6 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, list_add(&rq->queuelist, &per_prio->dispatch); rq->fifo_time = jiffies; } else { - struct list_head *insert_before; - deadline_add_rq_rb(per_prio, rq); if (rq_mergeable(rq)) { @@ -712,8 +710,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, * set expire time and add to fifo list */ rq->fifo_time = jiffies + dd->fifo_expire[data_dir]; - insert_before = &per_prio->fifo_list[data_dir]; - list_add_tail(&rq->queuelist, insert_before); + list_add_tail(&rq->queuelist, &per_prio->fifo_list[data_dir]); } } From 312ccd4b755a09dc44e8a25f9c9526a4587ab53c Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 12 Dec 2024 13:29:40 -0800 Subject: [PATCH 249/330] blk-mq: Clean up blk_mq_requeue_work() Move a statement that occurs in both branches of an if-statement in front of the if-statement. Fix a typo in a source code comment. No functionality has been changed. Reviewed-by: Damien Le Moal Cc: Christoph Hellwig Signed-off-by: Bart Van Assche Reviewed-by: Christoph Hellwig Reviewed-by: Nitesh Shetty Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20241212212941.1268662-3-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/blk-mq.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index aa340b097b6e4..92e8ddf34575d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1544,19 +1544,17 @@ static void blk_mq_requeue_work(struct work_struct *work) while (!list_empty(&rq_list)) { rq = list_entry(rq_list.next, struct request, queuelist); + list_del_init(&rq->queuelist); /* - * If RQF_DONTPREP ist set, the request has been started by the + * If RQF_DONTPREP is set, the request has been started by the * driver already and might have driver-specific data allocated * already. Insert it into the hctx dispatch list to avoid * block layer merges for the request. */ - if (rq->rq_flags & RQF_DONTPREP) { - list_del_init(&rq->queuelist); + if (rq->rq_flags & RQF_DONTPREP) blk_mq_request_bypass_insert(rq, 0); - } else { - list_del_init(&rq->queuelist); + else blk_mq_insert_request(rq, BLK_MQ_INSERT_AT_HEAD); - } } while (!list_empty(&flush_list)) { From a6fe7b70513fbf11ffa5e85f7b6ba444497a5a3d Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 12 Dec 2024 13:29:41 -0800 Subject: [PATCH 250/330] block: Fix queue_iostats_passthrough_show() Make queue_iostats_passthrough_show() report 0/1 in sysfs instead of 0/4. This patch fixes the following sparse warning: block/blk-sysfs.c:266:31: warning: incorrect type in argument 1 (different base types) block/blk-sysfs.c:266:31: expected unsigned long var block/blk-sysfs.c:266:31: got restricted blk_flags_t Cc: Keith Busch Cc: Christoph Hellwig Fixes: 110234da18ab ("block: enable passthrough command statistics") Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20241212212941.1268662-4-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 4241aea84161c..767598e719ab0 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -263,7 +263,7 @@ static ssize_t queue_nr_zones_show(struct gendisk *disk, char *page) static ssize_t queue_iostats_passthrough_show(struct gendisk *disk, char *page) { - return queue_var_show(blk_queue_passthrough_stat(disk->queue), page); + return queue_var_show(!!blk_queue_passthrough_stat(disk->queue), page); } static ssize_t queue_iostats_passthrough_store(struct gendisk *disk, From a1855f1b7c33642c9f7a01991fb763342a312e9b Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Fri, 13 Dec 2024 15:57:53 +0100 Subject: [PATCH 251/330] irqchip/gic: Correct declaration of *percpu_base pointer in union gic_base percpu_base is used in various percpu functions that expect variable in __percpu address space. Correct the declaration of percpu_base to void __iomem * __percpu *percpu_base; to declare the variable as __percpu pointer. The patch fixes several sparse warnings: irq-gic.c:1172:44: warning: incorrect type in assignment (different address spaces) irq-gic.c:1172:44: expected void [noderef] __percpu *[noderef] __iomem *percpu_base irq-gic.c:1172:44: got void [noderef] __iomem *[noderef] __percpu * ... irq-gic.c:1231:43: warning: incorrect type in argument 1 (different address spaces) irq-gic.c:1231:43: expected void [noderef] __percpu *__pdata irq-gic.c:1231:43: got void [noderef] __percpu *[noderef] __iomem *percpu_base There were no changes in the resulting object files. Signed-off-by: Uros Bizjak Signed-off-by: Thomas Gleixner Acked-by: Marc Zyngier Link: https://lore.kernel.org/all/20241213145809.2918-2-ubizjak@gmail.com --- drivers/irqchip/irq-gic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c index 8fae6dc010241..6503573557fdf 100644 --- a/drivers/irqchip/irq-gic.c +++ b/drivers/irqchip/irq-gic.c @@ -64,7 +64,7 @@ static void gic_check_cpu_features(void) union gic_base { void __iomem *common_base; - void __percpu * __iomem *percpu_base; + void __iomem * __percpu *percpu_base; }; struct gic_chip_data { From 773c05f417fa14e1ac94776619e9c978ec001f0b Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 13 Dec 2024 14:10:37 +0000 Subject: [PATCH 252/330] irqchip/gic-v3: Work around insecure GIC integrations It appears that the relatively popular RK3399 SoC has been put together using a large amount of illicit substances, as experiments reveal that its integration of GIC500 exposes the *secure* programming interface to non-secure. This has some pretty bad effects on the way priorities are handled, and results in a dead machine if booting with pseudo-NMI enabled (irqchip.gicv3_pseudo_nmi=1) if the kernel contains 18fdb6348c480 ("arm64: irqchip/gic-v3: Select priorities at boot time"), which relies on the priorities being programmed using the NS view. Let's restore some sanity by going one step further and disable security altogether in this case. This is not any worse, and puts us in a mode where priorities actually make some sense. Huge thanks to Mark Kettenis who initially identified this issue on OpenBSD, and to Chen-Yu Tsai who reported the problem in Linux. Fixes: 18fdb6348c480 ("arm64: irqchip/gic-v3: Select priorities at boot time") Reported-by: Mark Kettenis Reported-by: Chen-Yu Tsai Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Tested-by: Chen-Yu Tsai Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20241213141037.3995049-1-maz@kernel.org --- drivers/irqchip/irq-gic-v3.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index 34db379d066a5..79d8cc80693c3 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -161,7 +161,22 @@ static bool cpus_have_group0 __ro_after_init; static void __init gic_prio_init(void) { - cpus_have_security_disabled = gic_dist_security_disabled(); + bool ds; + + ds = gic_dist_security_disabled(); + if (!ds) { + u32 val; + + val = readl_relaxed(gic_data.dist_base + GICD_CTLR); + val |= GICD_CTLR_DS; + writel_relaxed(val, gic_data.dist_base + GICD_CTLR); + + ds = gic_dist_security_disabled(); + if (ds) + pr_warn("Broken GIC integration, security disabled"); + } + + cpus_have_security_disabled = ds; cpus_have_group0 = gic_has_group0(); /* From be26ba96421ab0a8fa2055ccf7db7832a13c44d2 Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Tue, 10 Dec 2024 20:11:43 +0530 Subject: [PATCH 253/330] block: Fix potential deadlock while freezing queue and acquiring sysfs_lock For storing a value to a queue attribute, the queue_attr_store function first freezes the queue (->q_usage_counter(io)) and then acquire ->sysfs_lock. This seems not correct as the usual ordering should be to acquire ->sysfs_lock before freezing the queue. This incorrect ordering causes the following lockdep splat which we are able to reproduce always simply by accessing /sys/kernel/debug file using ls command: [ 57.597146] WARNING: possible circular locking dependency detected [ 57.597154] 6.12.0-10553-gb86545e02e8c #20 Tainted: G W [ 57.597162] ------------------------------------------------------ [ 57.597168] ls/4605 is trying to acquire lock: [ 57.597176] c00000003eb56710 (&mm->mmap_lock){++++}-{4:4}, at: __might_fault+0x58/0xc0 [ 57.597200] but task is already holding lock: [ 57.597207] c0000018e27c6810 (&sb->s_type->i_mutex_key#3){++++}-{4:4}, at: iterate_dir+0x94/0x1d4 [ 57.597226] which lock already depends on the new lock. [ 57.597233] the existing dependency chain (in reverse order) is: [ 57.597241] -> #5 (&sb->s_type->i_mutex_key#3){++++}-{4:4}: [ 57.597255] down_write+0x6c/0x18c [ 57.597264] start_creating+0xb4/0x24c [ 57.597274] debugfs_create_dir+0x2c/0x1e8 [ 57.597283] blk_register_queue+0xec/0x294 [ 57.597292] add_disk_fwnode+0x2e4/0x548 [ 57.597302] brd_alloc+0x2c8/0x338 [ 57.597309] brd_init+0x100/0x178 [ 57.597317] do_one_initcall+0x88/0x3e4 [ 57.597326] kernel_init_freeable+0x3cc/0x6e0 [ 57.597334] kernel_init+0x34/0x1cc [ 57.597342] ret_from_kernel_user_thread+0x14/0x1c [ 57.597350] -> #4 (&q->debugfs_mutex){+.+.}-{4:4}: [ 57.597362] __mutex_lock+0xfc/0x12a0 [ 57.597370] blk_register_queue+0xd4/0x294 [ 57.597379] add_disk_fwnode+0x2e4/0x548 [ 57.597388] brd_alloc+0x2c8/0x338 [ 57.597395] brd_init+0x100/0x178 [ 57.597402] do_one_initcall+0x88/0x3e4 [ 57.597410] kernel_init_freeable+0x3cc/0x6e0 [ 57.597418] kernel_init+0x34/0x1cc [ 57.597426] ret_from_kernel_user_thread+0x14/0x1c [ 57.597434] -> #3 (&q->sysfs_lock){+.+.}-{4:4}: [ 57.597446] __mutex_lock+0xfc/0x12a0 [ 57.597454] queue_attr_store+0x9c/0x110 [ 57.597462] sysfs_kf_write+0x70/0xb0 [ 57.597471] kernfs_fop_write_iter+0x1b0/0x2ac [ 57.597480] vfs_write+0x3dc/0x6e8 [ 57.597488] ksys_write+0x84/0x140 [ 57.597495] system_call_exception+0x130/0x360 [ 57.597504] system_call_common+0x160/0x2c4 [ 57.597516] -> #2 (&q->q_usage_counter(io)#21){++++}-{0:0}: [ 57.597530] __submit_bio+0x5ec/0x828 [ 57.597538] submit_bio_noacct_nocheck+0x1e4/0x4f0 [ 57.597547] iomap_readahead+0x2a0/0x448 [ 57.597556] xfs_vm_readahead+0x28/0x3c [ 57.597564] read_pages+0x88/0x41c [ 57.597571] page_cache_ra_unbounded+0x1ac/0x2d8 [ 57.597580] filemap_get_pages+0x188/0x984 [ 57.597588] filemap_read+0x13c/0x4bc [ 57.597596] xfs_file_buffered_read+0x88/0x17c [ 57.597605] xfs_file_read_iter+0xac/0x158 [ 57.597614] vfs_read+0x2d4/0x3b4 [ 57.597622] ksys_read+0x84/0x144 [ 57.597629] system_call_exception+0x130/0x360 [ 57.597637] system_call_common+0x160/0x2c4 [ 57.597647] -> #1 (mapping.invalidate_lock#2){++++}-{4:4}: [ 57.597661] down_read+0x6c/0x220 [ 57.597669] filemap_fault+0x870/0x100c [ 57.597677] xfs_filemap_fault+0xc4/0x18c [ 57.597684] __do_fault+0x64/0x164 [ 57.597693] __handle_mm_fault+0x1274/0x1dac [ 57.597702] handle_mm_fault+0x248/0x484 [ 57.597711] ___do_page_fault+0x428/0xc0c [ 57.597719] hash__do_page_fault+0x30/0x68 [ 57.597727] do_hash_fault+0x90/0x35c [ 57.597736] data_access_common_virt+0x210/0x220 [ 57.597745] _copy_from_user+0xf8/0x19c [ 57.597754] sel_write_load+0x178/0xd54 [ 57.597762] vfs_write+0x108/0x6e8 [ 57.597769] ksys_write+0x84/0x140 [ 57.597777] system_call_exception+0x130/0x360 [ 57.597785] system_call_common+0x160/0x2c4 [ 57.597794] -> #0 (&mm->mmap_lock){++++}-{4:4}: [ 57.597806] __lock_acquire+0x17cc/0x2330 [ 57.597814] lock_acquire+0x138/0x400 [ 57.597822] __might_fault+0x7c/0xc0 [ 57.597830] filldir64+0xe8/0x390 [ 57.597839] dcache_readdir+0x80/0x2d4 [ 57.597846] iterate_dir+0xd8/0x1d4 [ 57.597855] sys_getdents64+0x88/0x2d4 [ 57.597864] system_call_exception+0x130/0x360 [ 57.597872] system_call_common+0x160/0x2c4 [ 57.597881] other info that might help us debug this: [ 57.597888] Chain exists of: &mm->mmap_lock --> &q->debugfs_mutex --> &sb->s_type->i_mutex_key#3 [ 57.597905] Possible unsafe locking scenario: [ 57.597911] CPU0 CPU1 [ 57.597917] ---- ---- [ 57.597922] rlock(&sb->s_type->i_mutex_key#3); [ 57.597932] lock(&q->debugfs_mutex); [ 57.597940] lock(&sb->s_type->i_mutex_key#3); [ 57.597950] rlock(&mm->mmap_lock); [ 57.597958] *** DEADLOCK *** [ 57.597965] 2 locks held by ls/4605: [ 57.597971] #0: c0000000137c12f8 (&f->f_pos_lock){+.+.}-{4:4}, at: fdget_pos+0xcc/0x154 [ 57.597989] #1: c0000018e27c6810 (&sb->s_type->i_mutex_key#3){++++}-{4:4}, at: iterate_dir+0x94/0x1d4 Prevent the above lockdep warning by acquiring ->sysfs_lock before freezing the queue while storing a queue attribute in queue_attr_store function. Later, we also found[1] another function __blk_mq_update_nr_ hw_queues where we first freeze queue and then acquire the ->sysfs_lock. So we've also updated lock ordering in __blk_mq_update_nr_hw_queues function and ensured that in all code paths we follow the correct lock ordering i.e. acquire ->sysfs_lock before freezing the queue. [1] https://lore.kernel.org/all/CAFj5m9Ke8+EHKQBs_Nk6hqd=LGXtk4mUxZUN5==ZcCjnZSBwHw@mail.gmail.com/ Reported-by: kjain@linux.ibm.com Fixes: af2814149883 ("block: freeze the queue in queue_attr_store") Tested-by: kjain@linux.ibm.com Cc: hch@lst.de Cc: axboe@kernel.dk Cc: ritesh.list@gmail.com Cc: ming.lei@redhat.com Cc: gjoyce@linux.ibm.com Signed-off-by: Nilay Shroff Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20241210144222.1066229-1-nilay@linux.ibm.com Signed-off-by: Jens Axboe --- block/blk-mq-sysfs.c | 16 ++++++---------- block/blk-mq.c | 29 ++++++++++++++++++----------- block/blk-sysfs.c | 4 ++-- 3 files changed, 26 insertions(+), 23 deletions(-) diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 156e9bb07abf1..cd5ea6eaa76b0 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -275,15 +275,13 @@ void blk_mq_sysfs_unregister_hctxs(struct request_queue *q) struct blk_mq_hw_ctx *hctx; unsigned long i; - mutex_lock(&q->sysfs_dir_lock); + lockdep_assert_held(&q->sysfs_dir_lock); + if (!q->mq_sysfs_init_done) - goto unlock; + return; queue_for_each_hw_ctx(q, hctx, i) blk_mq_unregister_hctx(hctx); - -unlock: - mutex_unlock(&q->sysfs_dir_lock); } int blk_mq_sysfs_register_hctxs(struct request_queue *q) @@ -292,9 +290,10 @@ int blk_mq_sysfs_register_hctxs(struct request_queue *q) unsigned long i; int ret = 0; - mutex_lock(&q->sysfs_dir_lock); + lockdep_assert_held(&q->sysfs_dir_lock); + if (!q->mq_sysfs_init_done) - goto unlock; + return ret; queue_for_each_hw_ctx(q, hctx, i) { ret = blk_mq_register_hctx(hctx); @@ -302,8 +301,5 @@ int blk_mq_sysfs_register_hctxs(struct request_queue *q) break; } -unlock: - mutex_unlock(&q->sysfs_dir_lock); - return ret; } diff --git a/block/blk-mq.c b/block/blk-mq.c index 92e8ddf34575d..6b6111513986f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4453,7 +4453,8 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, unsigned long i, j; /* protect against switching io scheduler */ - mutex_lock(&q->sysfs_lock); + lockdep_assert_held(&q->sysfs_lock); + for (i = 0; i < set->nr_hw_queues; i++) { int old_node; int node = blk_mq_get_hctx_node(set, i); @@ -4486,7 +4487,6 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, xa_for_each_start(&q->hctx_table, j, hctx, j) blk_mq_exit_hctx(q, set, hctx, j); - mutex_unlock(&q->sysfs_lock); /* unregister cpuhp callbacks for exited hctxs */ blk_mq_remove_hw_queues_cpuhp(q); @@ -4518,10 +4518,14 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, xa_init(&q->hctx_table); + mutex_lock(&q->sysfs_lock); + blk_mq_realloc_hw_ctxs(set, q); if (!q->nr_hw_queues) goto err_hctxs; + mutex_unlock(&q->sysfs_lock); + INIT_WORK(&q->timeout_work, blk_mq_timeout_work); blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ); @@ -4540,6 +4544,7 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, return 0; err_hctxs: + mutex_unlock(&q->sysfs_lock); blk_mq_release(q); err_exit: q->mq_ops = NULL; @@ -4920,12 +4925,12 @@ static bool blk_mq_elv_switch_none(struct list_head *head, return false; /* q->elevator needs protection from ->sysfs_lock */ - mutex_lock(&q->sysfs_lock); + lockdep_assert_held(&q->sysfs_lock); /* the check has to be done with holding sysfs_lock */ if (!q->elevator) { kfree(qe); - goto unlock; + goto out; } INIT_LIST_HEAD(&qe->node); @@ -4935,9 +4940,7 @@ static bool blk_mq_elv_switch_none(struct list_head *head, __elevator_get(qe->type); list_add(&qe->node, head); elevator_disable(q); -unlock: - mutex_unlock(&q->sysfs_lock); - +out: return true; } @@ -4966,11 +4969,9 @@ static void blk_mq_elv_switch_back(struct list_head *head, list_del(&qe->node); kfree(qe); - mutex_lock(&q->sysfs_lock); elevator_switch(q, t); /* drop the reference acquired in blk_mq_elv_switch_none */ elevator_put(t); - mutex_unlock(&q->sysfs_lock); } static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, @@ -4990,8 +4991,11 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues) return; - list_for_each_entry(q, &set->tag_list, tag_set_list) + list_for_each_entry(q, &set->tag_list, tag_set_list) { + mutex_lock(&q->sysfs_dir_lock); + mutex_lock(&q->sysfs_lock); blk_mq_freeze_queue(q); + } /* * Switch IO scheduler to 'none', cleaning up the data associated * with the previous scheduler. We will switch back once we are done @@ -5047,8 +5051,11 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, list_for_each_entry(q, &set->tag_list, tag_set_list) blk_mq_elv_switch_back(&head, q); - list_for_each_entry(q, &set->tag_list, tag_set_list) + list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_unfreeze_queue(q); + mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->sysfs_dir_lock); + } /* Free the excess tags when nr_hw_queues shrink. */ for (i = set->nr_hw_queues; i < prev_nr_hw_queues; i++) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 767598e719ab0..64f70c713d2f9 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -706,11 +706,11 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, if (entry->load_module) entry->load_module(disk, page, length); - blk_mq_freeze_queue(q); mutex_lock(&q->sysfs_lock); + blk_mq_freeze_queue(q); res = entry->store(disk, page, length); - mutex_unlock(&q->sysfs_lock); blk_mq_unfreeze_queue(q); + mutex_unlock(&q->sysfs_lock); return res; } From 1201f226c863b7da739f7420ddba818cedf372fc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 10 Dec 2024 17:32:58 -0800 Subject: [PATCH 254/330] KVM: x86: Cache CPUID.0xD XSTATE offsets+sizes during module init Snapshot the output of CPUID.0xD.[1..n] during kvm.ko initiliaization to avoid the overead of CPUID during runtime. The offset, size, and metadata for CPUID.0xD.[1..n] sub-leaves does not depend on XCR0 or XSS values, i.e. is constant for a given CPU, and thus can be cached during module load. On Intel's Emerald Rapids, CPUID is *wildly* expensive, to the point where recomputing XSAVE offsets and sizes results in a 4x increase in latency of nested VM-Enter and VM-Exit (nested transitions can trigger xstate_required_size() multiple times per transition), relative to using cached values. The issue is easily visible by running `perf top` while triggering nested transitions: kvm_update_cpuid_runtime() shows up at a whopping 50%. As measured via RDTSC from L2 (using KVM-Unit-Test's CPUID VM-Exit test and a slightly modified L1 KVM to handle CPUID in the fastpath), a nested roundtrip to emulate CPUID on Skylake (SKX), Icelake (ICX), and Emerald Rapids (EMR) takes: SKX 11650 ICX 22350 EMR 28850 Using cached values, the latency drops to: SKX 6850 ICX 9000 EMR 7900 The underlying issue is that CPUID itself is slow on ICX, and comically slow on EMR. The problem is exacerbated on CPUs which support XSAVES and/or XSAVEC, as KVM invokes xstate_required_size() twice on each runtime CPUID update, and because there are more supported XSAVE features (CPUID for supported XSAVE feature sub-leafs is significantly slower). SKX: CPUID.0xD.2 = 348 cycles CPUID.0xD.3 = 400 cycles CPUID.0xD.4 = 276 cycles CPUID.0xD.5 = 236 cycles EMR: CPUID.0xD.2 = 1138 cycles CPUID.0xD.3 = 1362 cycles CPUID.0xD.4 = 1068 cycles CPUID.0xD.5 = 910 cycles CPUID.0xD.6 = 914 cycles CPUID.0xD.7 = 1350 cycles CPUID.0xD.8 = 734 cycles CPUID.0xD.9 = 766 cycles CPUID.0xD.10 = 732 cycles CPUID.0xD.11 = 718 cycles CPUID.0xD.12 = 734 cycles CPUID.0xD.13 = 1700 cycles CPUID.0xD.14 = 1126 cycles CPUID.0xD.15 = 898 cycles CPUID.0xD.16 = 716 cycles CPUID.0xD.17 = 748 cycles CPUID.0xD.18 = 776 cycles Note, updating runtime CPUID information multiple times per nested transition is itself a flaw, especially since CPUID is a mandotory intercept on both Intel and AMD. E.g. KVM doesn't need to ensure emulated CPUID state is up-to-date while running L2. That flaw will be fixed in a future patch, as deferring runtime CPUID updates is more subtle than it appears at first glance, the benefits aren't super critical to have once the XSAVE issue is resolved, and caching CPUID output is desirable even if KVM's updates are deferred. Cc: Jim Mattson Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-ID: <20241211013302.1347853-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 31 ++++++++++++++++++++++++++----- arch/x86/kvm/cpuid.h | 1 + arch/x86/kvm/x86.c | 2 ++ 3 files changed, 29 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 097bdc022d0f4..ae0b438a2c991 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -36,6 +36,26 @@ u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; EXPORT_SYMBOL_GPL(kvm_cpu_caps); +struct cpuid_xstate_sizes { + u32 eax; + u32 ebx; + u32 ecx; +}; + +static struct cpuid_xstate_sizes xstate_sizes[XFEATURE_MAX] __ro_after_init; + +void __init kvm_init_xstate_sizes(void) +{ + u32 ign; + int i; + + for (i = XFEATURE_YMM; i < ARRAY_SIZE(xstate_sizes); i++) { + struct cpuid_xstate_sizes *xs = &xstate_sizes[i]; + + cpuid_count(0xD, i, &xs->eax, &xs->ebx, &xs->ecx, &ign); + } +} + u32 xstate_required_size(u64 xstate_bv, bool compacted) { int feature_bit = 0; @@ -44,14 +64,15 @@ u32 xstate_required_size(u64 xstate_bv, bool compacted) xstate_bv &= XFEATURE_MASK_EXTEND; while (xstate_bv) { if (xstate_bv & 0x1) { - u32 eax, ebx, ecx, edx, offset; - cpuid_count(0xD, feature_bit, &eax, &ebx, &ecx, &edx); + struct cpuid_xstate_sizes *xs = &xstate_sizes[feature_bit]; + u32 offset; + /* ECX[1]: 64B alignment in compacted form */ if (compacted) - offset = (ecx & 0x2) ? ALIGN(ret, 64) : ret; + offset = (xs->ecx & 0x2) ? ALIGN(ret, 64) : ret; else - offset = ebx; - ret = max(ret, offset + eax); + offset = xs->ebx; + ret = max(ret, offset + xs->eax); } xstate_bv >>= 1; diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index c8dc66eddefda..f16a7b2c2adcf 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -31,6 +31,7 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool exact_only); +void __init kvm_init_xstate_sizes(void); u32 xstate_required_size(u64 xstate_bv, bool compacted); int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2e713480933a1..c8160baf38385 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -13997,6 +13997,8 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_rmp_fault); static int __init kvm_x86_init(void) { + kvm_init_xstate_sizes(); + kvm_mmu_x86_module_init(); mitigate_smt_rsb &= boot_cpu_has_bug(X86_BUG_SMT_RSB) && cpu_smt_possible(); return 0; From 824927e88456331c7a999fdf5d9d27923b619590 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 3 Dec 2024 14:37:15 +0200 Subject: [PATCH 255/330] ARC: build: Try to guess GCC variant of cross compiler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ARC GCC compiler is packaged starting from Fedora 39i and the GCC variant of cross compile tools has arc-linux-gnu- prefix and not arc-linux-. This is causing that CROSS_COMPILE variable is left unset. This change allows builds without need to supply CROSS_COMPILE argument if distro package is used. Before this change: $ make -j 128 ARCH=arc W=1 drivers/infiniband/hw/mlx4/ gcc: warning: ‘-mcpu=’ is deprecated; use ‘-mtune=’ or ‘-march=’ instead gcc: error: unrecognized command-line option ‘-mmedium-calls’ gcc: error: unrecognized command-line option ‘-mlock’ gcc: error: unrecognized command-line option ‘-munaligned-access’ [1] https://packages.fedoraproject.org/pkgs/cross-gcc/gcc-arc-linux-gnu/index.html Signed-off-by: Leon Romanovsky Signed-off-by: Vineet Gupta --- arch/arc/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arc/Makefile b/arch/arc/Makefile index 2390dd042e363..fb98478ed1ab0 100644 --- a/arch/arc/Makefile +++ b/arch/arc/Makefile @@ -6,7 +6,7 @@ KBUILD_DEFCONFIG := haps_hs_smp_defconfig ifeq ($(CROSS_COMPILE),) -CROSS_COMPILE := $(call cc-cross-prefix, arc-linux- arceb-linux-) +CROSS_COMPILE := $(call cc-cross-prefix, arc-linux- arceb-linux- arc-linux-gnu-) endif cflags-y += -fno-common -pipe -fno-builtin -mmedium-calls -D__linux__ From c00d738e1673ab801e1577e4e3c780ccf88b1a5b Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 13 Dec 2024 14:19:27 -0800 Subject: [PATCH 256/330] bpf: Revert "bpf: Mark raw_tp arguments with PTR_MAYBE_NULL" This patch reverts commit cb4158ce8ec8 ("bpf: Mark raw_tp arguments with PTR_MAYBE_NULL"). The patch was well-intended and meant to be as a stop-gap fixing branch prediction when the pointer may actually be NULL at runtime. Eventually, it was supposed to be replaced by an automated script or compiler pass detecting possibly NULL arguments and marking them accordingly. However, it caused two main issues observed for production programs and failed to preserve backwards compatibility. First, programs relied on the verifier not exploring == NULL branch when pointer is not NULL, thus they started failing with a 'dereference of scalar' error. Next, allowing raw_tp arguments to be modified surfaced the warning in the verifier that warns against reg->off when PTR_MAYBE_NULL is set. More information, context, and discusson on both problems is available in [0]. Overall, this approach had several shortcomings, and the fixes would further complicate the verifier's logic, and the entire masking scheme would have to be removed eventually anyway. Hence, revert the patch in preparation of a better fix avoiding these issues to replace this commit. [0]: https://lore.kernel.org/bpf/20241206161053.809580-1-memxor@gmail.com Reported-by: Manu Bretelle Fixes: cb4158ce8ec8 ("bpf: Mark raw_tp arguments with PTR_MAYBE_NULL") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20241213221929.3495062-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 -- kernel/bpf/btf.c | 5 +- kernel/bpf/verifier.c | 79 ++----------------- .../bpf/progs/test_tp_btf_nullable.c | 6 +- 4 files changed, 9 insertions(+), 87 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 805040813f5d5..6e63dd3443b96 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3514,10 +3514,4 @@ static inline bool bpf_is_subprog(const struct bpf_prog *prog) return prog->aux->func_idx != 0; } -static inline bool bpf_prog_is_raw_tp(const struct bpf_prog *prog) -{ - return prog->type == BPF_PROG_TYPE_TRACING && - prog->expected_attach_type == BPF_TRACE_RAW_TP; -} - #endif /* _LINUX_BPF_H */ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index a63a03582f02d..c4aa304028cea 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6594,10 +6594,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, if (prog_args_trusted(prog)) info->reg_type |= PTR_TRUSTED; - /* Raw tracepoint arguments always get marked as maybe NULL */ - if (bpf_prog_is_raw_tp(prog)) - info->reg_type |= PTR_MAYBE_NULL; - else if (btf_param_match_suffix(btf, &args[arg], "__nullable")) + if (btf_param_match_suffix(btf, &args[arg], "__nullable")) info->reg_type |= PTR_MAYBE_NULL; if (tgt_prog) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5e541339b2f6d..f7f892a52a374 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -420,25 +420,6 @@ static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg) return rec; } -static bool mask_raw_tp_reg_cond(const struct bpf_verifier_env *env, struct bpf_reg_state *reg) { - return reg->type == (PTR_TO_BTF_ID | PTR_TRUSTED | PTR_MAYBE_NULL) && - bpf_prog_is_raw_tp(env->prog) && !reg->ref_obj_id; -} - -static bool mask_raw_tp_reg(const struct bpf_verifier_env *env, struct bpf_reg_state *reg) -{ - if (!mask_raw_tp_reg_cond(env, reg)) - return false; - reg->type &= ~PTR_MAYBE_NULL; - return true; -} - -static void unmask_raw_tp_reg(struct bpf_reg_state *reg, bool result) -{ - if (result) - reg->type |= PTR_MAYBE_NULL; -} - static bool subprog_is_global(const struct bpf_verifier_env *env, int subprog) { struct bpf_func_info_aux *aux = env->prog->aux->func_info_aux; @@ -6801,7 +6782,6 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, const char *field_name = NULL; enum bpf_type_flag flag = 0; u32 btf_id = 0; - bool mask; int ret; if (!env->allow_ptr_leaks) { @@ -6873,21 +6853,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, if (ret < 0) return ret; - /* For raw_tp progs, we allow dereference of PTR_MAYBE_NULL - * trusted PTR_TO_BTF_ID, these are the ones that are possibly - * arguments to the raw_tp. Since internal checks in for trusted - * reg in check_ptr_to_btf_access would consider PTR_MAYBE_NULL - * modifier as problematic, mask it out temporarily for the - * check. Don't apply this to pointers with ref_obj_id > 0, as - * those won't be raw_tp args. - * - * We may end up applying this relaxation to other trusted - * PTR_TO_BTF_ID with maybe null flag, since we cannot - * distinguish PTR_MAYBE_NULL tagged for arguments vs normal - * tagging, but that should expand allowed behavior, and not - * cause regression for existing behavior. - */ - mask = mask_raw_tp_reg(env, reg); + if (ret != PTR_TO_BTF_ID) { /* just mark; */ @@ -6948,13 +6914,8 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, clear_trusted_flags(&flag); } - if (atype == BPF_READ && value_regno >= 0) { + if (atype == BPF_READ && value_regno >= 0) mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag); - /* We've assigned a new type to regno, so don't undo masking. */ - if (regno == value_regno) - mask = false; - } - unmask_raw_tp_reg(reg, mask); return 0; } @@ -7329,7 +7290,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else if (base_type(reg->type) == PTR_TO_BTF_ID && - (mask_raw_tp_reg_cond(env, reg) || !type_may_be_null(reg->type))) { + !type_may_be_null(reg->type)) { err = check_ptr_to_btf_access(env, regs, regno, off, size, t, value_regno); } else if (reg->type == CONST_PTR_TO_MAP) { @@ -9032,7 +8993,6 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, enum bpf_reg_type type = reg->type; u32 *arg_btf_id = NULL; int err = 0; - bool mask; if (arg_type == ARG_DONTCARE) return 0; @@ -9073,11 +9033,11 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, base_type(arg_type) == ARG_PTR_TO_SPIN_LOCK) arg_btf_id = fn->arg_btf_id[arg]; - mask = mask_raw_tp_reg(env, reg); err = check_reg_type(env, regno, arg_type, arg_btf_id, meta); + if (err) + return err; - err = err ?: check_func_arg_reg_off(env, reg, regno, arg_type); - unmask_raw_tp_reg(reg, mask); + err = check_func_arg_reg_off(env, reg, regno, arg_type); if (err) return err; @@ -9872,17 +9832,14 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, return ret; } else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) { struct bpf_call_arg_meta meta; - bool mask; int err; if (register_is_null(reg) && type_may_be_null(arg->arg_type)) continue; memset(&meta, 0, sizeof(meta)); /* leave func_id as zero */ - mask = mask_raw_tp_reg(env, reg); err = check_reg_type(env, regno, arg->arg_type, &arg->btf_id, &meta); err = err ?: check_func_arg_reg_off(env, reg, regno, arg->arg_type); - unmask_raw_tp_reg(reg, mask); if (err) return err; } else { @@ -12205,7 +12162,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ enum bpf_arg_type arg_type = ARG_DONTCARE; u32 regno = i + 1, ref_id, type_size; bool is_ret_buf_sz = false; - bool mask = false; int kf_arg_type; t = btf_type_skip_modifiers(btf, args[i].type, NULL); @@ -12264,15 +12220,12 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return -EINVAL; } - mask = mask_raw_tp_reg(env, reg); if ((is_kfunc_trusted_args(meta) || is_kfunc_rcu(meta)) && (register_is_null(reg) || type_may_be_null(reg->type)) && !is_kfunc_arg_nullable(meta->btf, &args[i])) { verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i); - unmask_raw_tp_reg(reg, mask); return -EACCES; } - unmask_raw_tp_reg(reg, mask); if (reg->ref_obj_id) { if (is_kfunc_release(meta) && meta->ref_obj_id) { @@ -12330,24 +12283,16 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta)) break; - /* Allow passing maybe NULL raw_tp arguments to - * kfuncs for compatibility. Don't apply this to - * arguments with ref_obj_id > 0. - */ - mask = mask_raw_tp_reg(env, reg); if (!is_trusted_reg(reg)) { if (!is_kfunc_rcu(meta)) { verbose(env, "R%d must be referenced or trusted\n", regno); - unmask_raw_tp_reg(reg, mask); return -EINVAL; } if (!is_rcu_reg(reg)) { verbose(env, "R%d must be a rcu pointer\n", regno); - unmask_raw_tp_reg(reg, mask); return -EINVAL; } } - unmask_raw_tp_reg(reg, mask); fallthrough; case KF_ARG_PTR_TO_CTX: case KF_ARG_PTR_TO_DYNPTR: @@ -12370,9 +12315,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (is_kfunc_release(meta) && reg->ref_obj_id) arg_type |= OBJ_RELEASE; - mask = mask_raw_tp_reg(env, reg); ret = check_func_arg_reg_off(env, reg, regno, arg_type); - unmask_raw_tp_reg(reg, mask); if (ret < 0) return ret; @@ -12549,7 +12492,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ ref_tname = btf_name_by_offset(btf, ref_t->name_off); fallthrough; case KF_ARG_PTR_TO_BTF_ID: - mask = mask_raw_tp_reg(env, reg); /* Only base_type is checked, further checks are done here */ if ((base_type(reg->type) != PTR_TO_BTF_ID || (bpf_type_has_unsafe_modifiers(reg->type) && !is_rcu_reg(reg))) && @@ -12558,11 +12500,9 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verbose(env, "expected %s or socket\n", reg_type_str(env, base_type(reg->type) | (type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS))); - unmask_raw_tp_reg(reg, mask); return -EINVAL; } ret = process_kf_arg_ptr_to_btf_id(env, reg, ref_t, ref_tname, ref_id, meta, i); - unmask_raw_tp_reg(reg, mask); if (ret < 0) return ret; break; @@ -13535,7 +13475,7 @@ static int sanitize_check_bounds(struct bpf_verifier_env *env, */ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, struct bpf_insn *insn, - struct bpf_reg_state *ptr_reg, + const struct bpf_reg_state *ptr_reg, const struct bpf_reg_state *off_reg) { struct bpf_verifier_state *vstate = env->cur_state; @@ -13549,7 +13489,6 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, struct bpf_sanitize_info info = {}; u8 opcode = BPF_OP(insn->code); u32 dst = insn->dst_reg; - bool mask; int ret; dst_reg = ®s[dst]; @@ -13576,14 +13515,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, return -EACCES; } - mask = mask_raw_tp_reg(env, ptr_reg); if (ptr_reg->type & PTR_MAYBE_NULL) { verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n", dst, reg_type_str(env, ptr_reg->type)); - unmask_raw_tp_reg(ptr_reg, mask); return -EACCES; } - unmask_raw_tp_reg(ptr_reg, mask); switch (base_type(ptr_reg->type)) { case PTR_TO_CTX: @@ -20126,7 +20062,6 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) * for this case. */ case PTR_TO_BTF_ID | MEM_ALLOC | PTR_UNTRUSTED: - case PTR_TO_BTF_ID | PTR_TRUSTED | PTR_MAYBE_NULL: if (type == BPF_READ) { if (BPF_MODE(insn->code) == BPF_MEM) insn->code = BPF_LDX | BPF_PROBE_MEM | diff --git a/tools/testing/selftests/bpf/progs/test_tp_btf_nullable.c b/tools/testing/selftests/bpf/progs/test_tp_btf_nullable.c index 5aaf2b065f86c..bba3e37f749b8 100644 --- a/tools/testing/selftests/bpf/progs/test_tp_btf_nullable.c +++ b/tools/testing/selftests/bpf/progs/test_tp_btf_nullable.c @@ -7,11 +7,7 @@ #include "bpf_misc.h" SEC("tp_btf/bpf_testmod_test_nullable_bare") -/* This used to be a failure test, but raw_tp nullable arguments can now - * directly be dereferenced, whether they have nullable annotation or not, - * and don't need to be explicitly checked. - */ -__success +__failure __msg("R1 invalid mem access 'trusted_ptr_or_null_'") int BPF_PROG(handle_tp_btf_nullable_bare1, struct bpf_testmod_test_read_ctx *nullable_ctx) { return nullable_ctx->len; From 838a10bd2ebfe11a60dd67687533a7cfc220cc86 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 13 Dec 2024 14:19:28 -0800 Subject: [PATCH 257/330] bpf: Augment raw_tp arguments with PTR_MAYBE_NULL Arguments to a raw tracepoint are tagged as trusted, which carries the semantics that the pointer will be non-NULL. However, in certain cases, a raw tracepoint argument may end up being NULL. More context about this issue is available in [0]. Thus, there is a discrepancy between the reality, that raw_tp arguments can actually be NULL, and the verifier's knowledge, that they are never NULL, causing explicit NULL check branch to be dead code eliminated. A previous attempt [1], i.e. the second fixed commit, was made to simulate symbolic execution as if in most accesses, the argument is a non-NULL raw_tp, except for conditional jumps. This tried to suppress branch prediction while preserving compatibility, but surfaced issues with production programs that were difficult to solve without increasing verifier complexity. A more complete discussion of issues and fixes is available at [2]. Fix this by maintaining an explicit list of tracepoints where the arguments are known to be NULL, and mark the positional arguments as PTR_MAYBE_NULL. Additionally, capture the tracepoints where arguments are known to be ERR_PTR, and mark these arguments as scalar values to prevent potential dereference. Each hex digit is used to encode NULL-ness (0x1) or ERR_PTR-ness (0x2), shifted by the zero-indexed argument number x 4. This can be represented as follows: 1st arg: 0x1 2nd arg: 0x10 3rd arg: 0x100 ... and so on (likewise for ERR_PTR case). In the future, an automated pass will be used to produce such a list, or insert __nullable annotations automatically for tracepoints. Each compilation unit will be analyzed and results will be collated to find whether a tracepoint pointer is definitely not null, maybe null, or an unknown state where verifier conservatively marks it PTR_MAYBE_NULL. A proof of concept of this tool from Eduard is available at [3]. Note that in case we don't find a specification in the raw_tp_null_args array and the tracepoint belongs to a kernel module, we will conservatively mark the arguments as PTR_MAYBE_NULL. This is because unlike for in-tree modules, out-of-tree module tracepoints may pass NULL freely to the tracepoint. We don't protect against such tracepoints passing ERR_PTR (which is uncommon anyway), lest we mark all such arguments as SCALAR_VALUE. While we are it, let's adjust the test raw_tp_null to not perform dereference of the skb->mark, as that won't be allowed anymore, and make it more robust by using inline assembly to test the dead code elimination behavior, which should still stay the same. [0]: https://lore.kernel.org/bpf/ZrCZS6nisraEqehw@jlelli-thinkpadt14gen4.remote.csb [1]: https://lore.kernel.org/all/20241104171959.2938862-1-memxor@gmail.com [2]: https://lore.kernel.org/bpf/20241206161053.809580-1-memxor@gmail.com [3]: https://github.com/eddyz87/llvm-project/tree/nullness-for-tracepoint-params Reported-by: Juri Lelli # original bug Reported-by: Manu Bretelle # bugs in masking fix Fixes: 3f00c5239344 ("bpf: Allow trusted pointers to be passed to KF_TRUSTED_ARGS kfuncs") Fixes: cb4158ce8ec8 ("bpf: Mark raw_tp arguments with PTR_MAYBE_NULL") Reviewed-by: Eduard Zingerman Co-developed-by: Jiri Olsa Signed-off-by: Jiri Olsa Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20241213221929.3495062-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 138 ++++++++++++++++++ .../testing/selftests/bpf/progs/raw_tp_null.c | 19 ++- 2 files changed, 147 insertions(+), 10 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index c4aa304028cea..e5a5f023cedd5 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6439,6 +6439,101 @@ int btf_ctx_arg_offset(const struct btf *btf, const struct btf_type *func_proto, return off; } +struct bpf_raw_tp_null_args { + const char *func; + u64 mask; +}; + +static const struct bpf_raw_tp_null_args raw_tp_null_args[] = { + /* sched */ + { "sched_pi_setprio", 0x10 }, + /* ... from sched_numa_pair_template event class */ + { "sched_stick_numa", 0x100 }, + { "sched_swap_numa", 0x100 }, + /* afs */ + { "afs_make_fs_call", 0x10 }, + { "afs_make_fs_calli", 0x10 }, + { "afs_make_fs_call1", 0x10 }, + { "afs_make_fs_call2", 0x10 }, + { "afs_protocol_error", 0x1 }, + { "afs_flock_ev", 0x10 }, + /* cachefiles */ + { "cachefiles_lookup", 0x1 | 0x200 }, + { "cachefiles_unlink", 0x1 }, + { "cachefiles_rename", 0x1 }, + { "cachefiles_prep_read", 0x1 }, + { "cachefiles_mark_active", 0x1 }, + { "cachefiles_mark_failed", 0x1 }, + { "cachefiles_mark_inactive", 0x1 }, + { "cachefiles_vfs_error", 0x1 }, + { "cachefiles_io_error", 0x1 }, + { "cachefiles_ondemand_open", 0x1 }, + { "cachefiles_ondemand_copen", 0x1 }, + { "cachefiles_ondemand_close", 0x1 }, + { "cachefiles_ondemand_read", 0x1 }, + { "cachefiles_ondemand_cread", 0x1 }, + { "cachefiles_ondemand_fd_write", 0x1 }, + { "cachefiles_ondemand_fd_release", 0x1 }, + /* ext4, from ext4__mballoc event class */ + { "ext4_mballoc_discard", 0x10 }, + { "ext4_mballoc_free", 0x10 }, + /* fib */ + { "fib_table_lookup", 0x100 }, + /* filelock */ + /* ... from filelock_lock event class */ + { "posix_lock_inode", 0x10 }, + { "fcntl_setlk", 0x10 }, + { "locks_remove_posix", 0x10 }, + { "flock_lock_inode", 0x10 }, + /* ... from filelock_lease event class */ + { "break_lease_noblock", 0x10 }, + { "break_lease_block", 0x10 }, + { "break_lease_unblock", 0x10 }, + { "generic_delete_lease", 0x10 }, + { "time_out_leases", 0x10 }, + /* host1x */ + { "host1x_cdma_push_gather", 0x10000 }, + /* huge_memory */ + { "mm_khugepaged_scan_pmd", 0x10 }, + { "mm_collapse_huge_page_isolate", 0x1 }, + { "mm_khugepaged_scan_file", 0x10 }, + { "mm_khugepaged_collapse_file", 0x10 }, + /* kmem */ + { "mm_page_alloc", 0x1 }, + { "mm_page_pcpu_drain", 0x1 }, + /* .. from mm_page event class */ + { "mm_page_alloc_zone_locked", 0x1 }, + /* netfs */ + { "netfs_failure", 0x10 }, + /* power */ + { "device_pm_callback_start", 0x10 }, + /* qdisc */ + { "qdisc_dequeue", 0x1000 }, + /* rxrpc */ + { "rxrpc_recvdata", 0x1 }, + { "rxrpc_resend", 0x10 }, + /* sunrpc */ + { "xs_stream_read_data", 0x1 }, + /* ... from xprt_cong_event event class */ + { "xprt_reserve_cong", 0x10 }, + { "xprt_release_cong", 0x10 }, + { "xprt_get_cong", 0x10 }, + { "xprt_put_cong", 0x10 }, + /* tcp */ + { "tcp_send_reset", 0x11 }, + /* tegra_apb_dma */ + { "tegra_dma_tx_status", 0x100 }, + /* timer_migration */ + { "tmigr_update_events", 0x1 }, + /* writeback, from writeback_folio_template event class */ + { "writeback_dirty_folio", 0x10 }, + { "folio_wait_writeback", 0x10 }, + /* rdma */ + { "mr_integ_alloc", 0x2000 }, + /* bpf_testmod */ + { "bpf_testmod_test_read", 0x0 }, +}; + bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) @@ -6449,6 +6544,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, const char *tname = prog->aux->attach_func_name; struct bpf_verifier_log *log = info->log; const struct btf_param *args; + bool ptr_err_raw_tp = false; const char *tag_value; u32 nr_args, arg; int i, ret; @@ -6597,6 +6693,39 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, if (btf_param_match_suffix(btf, &args[arg], "__nullable")) info->reg_type |= PTR_MAYBE_NULL; + if (prog->expected_attach_type == BPF_TRACE_RAW_TP) { + struct btf *btf = prog->aux->attach_btf; + const struct btf_type *t; + const char *tname; + + /* BTF lookups cannot fail, return false on error */ + t = btf_type_by_id(btf, prog->aux->attach_btf_id); + if (!t) + return false; + tname = btf_name_by_offset(btf, t->name_off); + if (!tname) + return false; + /* Checked by bpf_check_attach_target */ + tname += sizeof("btf_trace_") - 1; + for (i = 0; i < ARRAY_SIZE(raw_tp_null_args); i++) { + /* Is this a func with potential NULL args? */ + if (strcmp(tname, raw_tp_null_args[i].func)) + continue; + if (raw_tp_null_args[i].mask & (0x1 << (arg * 4))) + info->reg_type |= PTR_MAYBE_NULL; + /* Is the current arg IS_ERR? */ + if (raw_tp_null_args[i].mask & (0x2 << (arg * 4))) + ptr_err_raw_tp = true; + break; + } + /* If we don't know NULL-ness specification and the tracepoint + * is coming from a loadable module, be conservative and mark + * argument as PTR_MAYBE_NULL. + */ + if (i == ARRAY_SIZE(raw_tp_null_args) && btf_is_module(btf)) + info->reg_type |= PTR_MAYBE_NULL; + } + if (tgt_prog) { enum bpf_prog_type tgt_type; @@ -6641,6 +6770,15 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, bpf_log(log, "func '%s' arg%d has btf_id %d type %s '%s'\n", tname, arg, info->btf_id, btf_type_str(t), __btf_name_by_offset(btf, t->name_off)); + + /* Perform all checks on the validity of type for this argument, but if + * we know it can be IS_ERR at runtime, scrub pointer type and mark as + * scalar. + */ + if (ptr_err_raw_tp) { + bpf_log(log, "marking pointer arg%d as scalar as it may encode error", arg); + info->reg_type = SCALAR_VALUE; + } return true; } EXPORT_SYMBOL_GPL(btf_ctx_access); diff --git a/tools/testing/selftests/bpf/progs/raw_tp_null.c b/tools/testing/selftests/bpf/progs/raw_tp_null.c index 457f34c151e32..5927054b6dd96 100644 --- a/tools/testing/selftests/bpf/progs/raw_tp_null.c +++ b/tools/testing/selftests/bpf/progs/raw_tp_null.c @@ -3,6 +3,7 @@ #include #include +#include "bpf_misc.h" char _license[] SEC("license") = "GPL"; @@ -17,16 +18,14 @@ int BPF_PROG(test_raw_tp_null, struct sk_buff *skb) if (task->pid != tid) return 0; - i = i + skb->mark + 1; - /* The compiler may move the NULL check before this deref, which causes - * the load to fail as deref of scalar. Prevent that by using a barrier. + /* If dead code elimination kicks in, the increment +=2 will be + * removed. For raw_tp programs attaching to tracepoints in kernel + * modules, we mark input arguments as PTR_MAYBE_NULL, so branch + * prediction should never kick in. */ - barrier(); - /* If dead code elimination kicks in, the increment below will - * be removed. For raw_tp programs, we mark input arguments as - * PTR_MAYBE_NULL, so branch prediction should never kick in. - */ - if (!skb) - i += 2; + asm volatile ("%[i] += 1; if %[ctx] != 0 goto +1; %[i] += 2;" + : [i]"+r"(i) + : [ctx]"r"(skb) + : "memory"); return 0; } From 0da1955b5bd2af3a1c3d13916df06e34ffa6df3d Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 13 Dec 2024 14:19:29 -0800 Subject: [PATCH 258/330] selftests/bpf: Add tests for raw_tp NULL args Add tests to ensure that arguments are correctly marked based on their specified positions, and whether they get marked correctly as maybe null. For modules, all tracepoint parameters should be marked PTR_MAYBE_NULL by default. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20241213221929.3495062-4-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/raw_tp_null.c | 3 +++ .../selftests/bpf/progs/raw_tp_null_fail.c | 24 +++++++++++++++++++ 2 files changed, 27 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/raw_tp_null_fail.c diff --git a/tools/testing/selftests/bpf/prog_tests/raw_tp_null.c b/tools/testing/selftests/bpf/prog_tests/raw_tp_null.c index 6fa19449297e9..43676a9922dce 100644 --- a/tools/testing/selftests/bpf/prog_tests/raw_tp_null.c +++ b/tools/testing/selftests/bpf/prog_tests/raw_tp_null.c @@ -3,11 +3,14 @@ #include #include "raw_tp_null.skel.h" +#include "raw_tp_null_fail.skel.h" void test_raw_tp_null(void) { struct raw_tp_null *skel; + RUN_TESTS(raw_tp_null_fail); + skel = raw_tp_null__open_and_load(); if (!ASSERT_OK_PTR(skel, "raw_tp_null__open_and_load")) return; diff --git a/tools/testing/selftests/bpf/progs/raw_tp_null_fail.c b/tools/testing/selftests/bpf/progs/raw_tp_null_fail.c new file mode 100644 index 0000000000000..38d669957bf1b --- /dev/null +++ b/tools/testing/selftests/bpf/progs/raw_tp_null_fail.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +/* Ensure module parameter has PTR_MAYBE_NULL */ +SEC("tp_btf/bpf_testmod_test_raw_tp_null") +__failure __msg("R1 invalid mem access 'trusted_ptr_or_null_'") +int test_raw_tp_null_bpf_testmod_test_raw_tp_null_arg_1(void *ctx) { + asm volatile("r1 = *(u64 *)(r1 +0); r1 = *(u64 *)(r1 +0);" ::: __clobber_all); + return 0; +} + +/* Check NULL marking */ +SEC("tp_btf/sched_pi_setprio") +__failure __msg("R1 invalid mem access 'trusted_ptr_or_null_'") +int test_raw_tp_null_sched_pi_setprio_arg_2(void *ctx) { + asm volatile("r1 = *(u64 *)(r1 +8); r1 = *(u64 *)(r1 +0);" ::: __clobber_all); + return 0; +} From 429fde2d81bcef0ebab002215358955704586457 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 12 Dec 2024 22:22:47 +0000 Subject: [PATCH 259/330] net: tun: fix tun_napi_alloc_frags() syzbot reported the following crash [1] Issue came with the blamed commit. Instead of going through all the iov components, we keep using the first one and end up with a malformed skb. [1] kernel BUG at net/core/skbuff.c:2849 ! Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI CPU: 0 UID: 0 PID: 6230 Comm: syz-executor132 Not tainted 6.13.0-rc1-syzkaller-00407-g96b6fcc0ee41 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 11/25/2024 RIP: 0010:__pskb_pull_tail+0x1568/0x1570 net/core/skbuff.c:2848 Code: 38 c1 0f 8c 32 f1 ff ff 4c 89 f7 e8 92 96 74 f8 e9 25 f1 ff ff e8 e8 ae 09 f8 48 8b 5c 24 08 e9 eb fb ff ff e8 d9 ae 09 f8 90 <0f> 0b 66 0f 1f 44 00 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 RSP: 0018:ffffc90004cbef30 EFLAGS: 00010293 RAX: ffffffff8995c347 RBX: 00000000fffffff2 RCX: ffff88802cf45a00 RDX: 0000000000000000 RSI: 00000000fffffff2 RDI: 0000000000000000 RBP: ffff88807df0c06a R08: ffffffff8995b084 R09: 1ffff1100fbe185c R10: dffffc0000000000 R11: ffffed100fbe185d R12: ffff888076e85d50 R13: ffff888076e85c80 R14: ffff888076e85cf4 R15: ffff888076e85c80 FS: 00007f0dca6ea6c0(0000) GS:ffff8880b8600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f0dca6ead58 CR3: 00000000119da000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: skb_cow_data+0x2da/0xcb0 net/core/skbuff.c:5284 tipc_aead_decrypt net/tipc/crypto.c:894 [inline] tipc_crypto_rcv+0x402/0x24e0 net/tipc/crypto.c:1844 tipc_rcv+0x57e/0x12a0 net/tipc/node.c:2109 tipc_l2_rcv_msg+0x2bd/0x450 net/tipc/bearer.c:668 __netif_receive_skb_list_ptype net/core/dev.c:5720 [inline] __netif_receive_skb_list_core+0x8b7/0x980 net/core/dev.c:5762 __netif_receive_skb_list net/core/dev.c:5814 [inline] netif_receive_skb_list_internal+0xa51/0xe30 net/core/dev.c:5905 gro_normal_list include/net/gro.h:515 [inline] napi_complete_done+0x2b5/0x870 net/core/dev.c:6256 napi_complete include/linux/netdevice.h:567 [inline] tun_get_user+0x2ea0/0x4890 drivers/net/tun.c:1982 tun_chr_write_iter+0x10d/0x1f0 drivers/net/tun.c:2057 do_iter_readv_writev+0x600/0x880 vfs_writev+0x376/0xba0 fs/read_write.c:1050 do_writev+0x1b6/0x360 fs/read_write.c:1096 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Fixes: de4f5fed3f23 ("iov_iter: add iter_iovec() helper") Reported-by: syzbot+4f66250f6663c0c1d67e@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/675b61aa.050a0220.599f4.00bb.GAE@google.com/T/#u Cc: stable@vger.kernel.org Signed-off-by: Eric Dumazet Reviewed-by: Joe Damato Reviewed-by: Jens Axboe Acked-by: Willem de Bruijn Acked-by: Michael S. Tsirkin Link: https://patch.msgid.link/20241212222247.724674-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/net/tun.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index d7a865ef370b6..e816aaba8e5f2 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1481,7 +1481,7 @@ static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile, skb->truesize += skb->data_len; for (i = 1; i < it->nr_segs; i++) { - const struct iovec *iov = iter_iov(it); + const struct iovec *iov = iter_iov(it) + i; size_t fragsz = iov->iov_len; struct page *page; void *frag; From c83508da5620ef89232cb614fb9e02dfdfef2b8f Mon Sep 17 00:00:00 2001 From: Priya Bala Govindasamy Date: Fri, 13 Dec 2024 17:58:58 -0800 Subject: [PATCH 260/330] bpf: Avoid deadlock caused by nested kprobe and fentry bpf programs BPF program types like kprobe and fentry can cause deadlocks in certain situations. If a function takes a lock and one of these bpf programs is hooked to some point in the function's critical section, and if the bpf program tries to call the same function and take the same lock it will lead to deadlock. These situations have been reported in the following bug reports. In percpu_freelist - Link: https://lore.kernel.org/bpf/CAADnVQLAHwsa+2C6j9+UC6ScrDaN9Fjqv1WjB1pP9AzJLhKuLQ@mail.gmail.com/T/ Link: https://lore.kernel.org/bpf/CAPPBnEYm+9zduStsZaDnq93q1jPLqO-PiKX9jy0MuL8LCXmCrQ@mail.gmail.com/T/ In bpf_lru_list - Link: https://lore.kernel.org/bpf/CAPPBnEajj+DMfiR_WRWU5=6A7KKULdB5Rob_NJopFLWF+i9gCA@mail.gmail.com/T/ Link: https://lore.kernel.org/bpf/CAPPBnEZQDVN6VqnQXvVqGoB+ukOtHGZ9b9U0OLJJYvRoSsMY_g@mail.gmail.com/T/ Link: https://lore.kernel.org/bpf/CAPPBnEaCB1rFAYU7Wf8UxqcqOWKmRPU1Nuzk3_oLk6qXR7LBOA@mail.gmail.com/T/ Similar bugs have been reported by syzbot. In queue_stack_maps - Link: https://lore.kernel.org/lkml/0000000000004c3fc90615f37756@google.com/ Link: https://lore.kernel.org/all/20240418230932.2689-1-hdanton@sina.com/T/ In lpm_trie - Link: https://lore.kernel.org/linux-kernel/00000000000035168a061a47fa38@google.com/T/ In ringbuf - Link: https://lore.kernel.org/bpf/20240313121345.2292-1-hdanton@sina.com/T/ Prevent kprobe and fentry bpf programs from attaching to these critical sections by removing CC_FLAGS_FTRACE for percpu_freelist.o, bpf_lru_list.o, queue_stack_maps.o, lpm_trie.o, ringbuf.o files. The bugs reported by syzbot are due to tracepoint bpf programs being called in the critical sections. This patch does not aim to fix deadlocks caused by tracepoint programs. However, it does prevent deadlocks from occurring in similar situations due to kprobe and fentry programs. Signed-off-by: Priya Bala Govindasamy Link: https://lore.kernel.org/r/CAPPBnEZpjGnsuA26Mf9kYibSaGLm=oF6=12L21X1GEQdqjLnzQ@mail.gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/Makefile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 9762bdddf1deb..410028633621c 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -53,3 +53,9 @@ obj-$(CONFIG_BPF_SYSCALL) += relo_core.o obj-$(CONFIG_BPF_SYSCALL) += btf_iter.o obj-$(CONFIG_BPF_SYSCALL) += btf_relocate.o obj-$(CONFIG_BPF_SYSCALL) += kmem_cache_iter.o + +CFLAGS_REMOVE_percpu_freelist.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_bpf_lru_list.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_queue_stack_maps.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_lpm_trie.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_ringbuf.o = $(CC_FLAGS_FTRACE) From 2b33eb8f1b3e8c2f87cfdbc8cc117f6bdfabc6ec Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Wed, 11 Dec 2024 17:21:16 +0800 Subject: [PATCH 261/330] net/smc: protect link down work from execute after lgr freed link down work may be scheduled before lgr freed but execute after lgr freed, which may result in crash. So it is need to hold a reference before shedule link down work, and put the reference after work executed or canceled. The relevant crash call stack as follows: list_del corruption. prev->next should be ffffb638c9c0fe20, but was 0000000000000000 ------------[ cut here ]------------ kernel BUG at lib/list_debug.c:51! invalid opcode: 0000 [#1] SMP NOPTI CPU: 6 PID: 978112 Comm: kworker/6:119 Kdump: loaded Tainted: G #1 Hardware name: Alibaba Cloud Alibaba Cloud ECS, BIOS 2221b89 04/01/2014 Workqueue: events smc_link_down_work [smc] RIP: 0010:__list_del_entry_valid.cold+0x31/0x47 RSP: 0018:ffffb638c9c0fdd8 EFLAGS: 00010086 RAX: 0000000000000054 RBX: ffff942fb75e5128 RCX: 0000000000000000 RDX: ffff943520930aa0 RSI: ffff94352091fc80 RDI: ffff94352091fc80 RBP: 0000000000000000 R08: 0000000000000000 R09: ffffb638c9c0fc38 R10: ffffb638c9c0fc30 R11: ffffffffa015eb28 R12: 0000000000000002 R13: ffffb638c9c0fe20 R14: 0000000000000001 R15: ffff942f9cd051c0 FS: 0000000000000000(0000) GS:ffff943520900000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f4f25214000 CR3: 000000025fbae004 CR4: 00000000007706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: rwsem_down_write_slowpath+0x17e/0x470 smc_link_down_work+0x3c/0x60 [smc] process_one_work+0x1ac/0x350 worker_thread+0x49/0x2f0 ? rescuer_thread+0x360/0x360 kthread+0x118/0x140 ? __kthread_bind_mask+0x60/0x60 ret_from_fork+0x1f/0x30 Fixes: 541afa10c126 ("net/smc: add smcr_port_err() and smcr_link_down() processing") Signed-off-by: Guangguan Wang Reviewed-by: Tony Lu Signed-off-by: David S. Miller --- net/smc/smc_core.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 500952c2e67ba..3b125d348b4ae 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1818,7 +1818,9 @@ void smcr_link_down_cond_sched(struct smc_link *lnk) { if (smc_link_downing(&lnk->state)) { trace_smcr_link_down(lnk, __builtin_return_address(0)); - schedule_work(&lnk->link_down_wrk); + smcr_link_hold(lnk); /* smcr_link_put in link_down_wrk */ + if (!schedule_work(&lnk->link_down_wrk)) + smcr_link_put(lnk); } } @@ -1850,11 +1852,14 @@ static void smc_link_down_work(struct work_struct *work) struct smc_link_group *lgr = link->lgr; if (list_empty(&lgr->list)) - return; + goto out; wake_up_all(&lgr->llc_msg_waiter); down_write(&lgr->llc_conf_mutex); smcr_link_down(link); up_write(&lgr->llc_conf_mutex); + +out: + smcr_link_put(link); /* smcr_link_hold by schedulers of link_down_work */ } static int smc_vlan_by_tcpsk_walk(struct net_device *lower_dev, From 679e9ddcf90dbdf98aaaa71a492454654b627bcb Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Wed, 11 Dec 2024 17:21:17 +0800 Subject: [PATCH 262/330] net/smc: check sndbuf_space again after NOSPACE flag is set in smc_poll When application sending data more than sndbuf_space, there have chances application will sleep in epoll_wait, and will never be wakeup again. This is caused by a race between smc_poll and smc_cdc_tx_handler. application tasklet smc_tx_sendmsg(len > sndbuf_space) | epoll_wait for EPOLL_OUT,timeout=0 | smc_poll | if (!smc->conn.sndbuf_space) | | smc_cdc_tx_handler | atomic_add sndbuf_space | smc_tx_sndbuf_nonfull | if (!test_bit SOCK_NOSPACE) | do not sk_write_space; set_bit SOCK_NOSPACE; | return mask=0; | Application will sleep in epoll_wait as smc_poll returns 0. And smc_cdc_tx_handler will not call sk_write_space because the SOCK_NOSPACE has not be set. If there is no inflight cdc msg, sk_write_space will not be called any more, and application will sleep in epoll_wait forever. So check sndbuf_space again after NOSPACE flag is set to break the race. Fixes: 8dce2786a290 ("net/smc: smc_poll improvements") Signed-off-by: Guangguan Wang Suggested-by: Paolo Abeni Signed-off-by: David S. Miller --- net/smc/af_smc.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 9e6c69d18581c..92448f2c362cb 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2881,6 +2881,13 @@ __poll_t smc_poll(struct file *file, struct socket *sock, } else { sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + + if (sk->sk_state != SMC_INIT) { + /* Race breaker the same way as tcp_poll(). */ + smp_mb__after_atomic(); + if (atomic_read(&smc->conn.sndbuf_space)) + mask |= EPOLLOUT | EPOLLWRNORM; + } } if (atomic_read(&smc->conn.bytes_to_rcv)) mask |= EPOLLIN | EPOLLRDNORM; From a29e220d3c8edbf0e1beb0f028878a4a85966556 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Wed, 11 Dec 2024 17:21:18 +0800 Subject: [PATCH 263/330] net/smc: check iparea_offset and ipv6_prefixes_cnt when receiving proposal msg When receiving proposal msg in server, the field iparea_offset and the field ipv6_prefixes_cnt in proposal msg are from the remote client and can not be fully trusted. Especially the field iparea_offset, once exceed the max value, there has the chance to access wrong address, and crash may happen. This patch checks iparea_offset and ipv6_prefixes_cnt before using them. Fixes: e7b7a64a8493 ("smc: support variable CLC proposal messages") Signed-off-by: Guangguan Wang Reviewed-by: Wen Gu Reviewed-by: D. Wythe Signed-off-by: David S. Miller --- net/smc/af_smc.c | 6 +++++- net/smc/smc_clc.c | 4 ++++ net/smc/smc_clc.h | 6 +++++- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 92448f2c362cb..9a74c9693f09e 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2032,6 +2032,8 @@ static int smc_listen_prfx_check(struct smc_sock *new_smc, if (pclc->hdr.typev1 == SMC_TYPE_N) return 0; pclc_prfx = smc_clc_proposal_get_prefix(pclc); + if (!pclc_prfx) + return -EPROTO; if (smc_clc_prfx_match(newclcsock, pclc_prfx)) return SMC_CLC_DECL_DIFFPREFIX; @@ -2221,7 +2223,9 @@ static void smc_find_ism_v1_device_serv(struct smc_sock *new_smc, int rc = 0; /* check if ISM V1 is available */ - if (!(ini->smcd_version & SMC_V1) || !smcd_indicated(ini->smc_type_v1)) + if (!(ini->smcd_version & SMC_V1) || + !smcd_indicated(ini->smc_type_v1) || + !pclc_smcd) goto not_found; ini->is_smcd = true; /* prepare ISM check */ ini->ism_peer_gid[0].gid = ntohll(pclc_smcd->ism.gid); diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 33fa787c28ebb..66a43b97eede0 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -354,6 +354,10 @@ static bool smc_clc_msg_prop_valid(struct smc_clc_msg_proposal *pclc) v2_ext = smc_get_clc_v2_ext(pclc); pclc_prfx = smc_clc_proposal_get_prefix(pclc); + if (!pclc_prfx || + pclc_prfx->ipv6_prefixes_cnt > SMC_CLC_MAX_V6_PREFIX) + return false; + if (hdr->version == SMC_V1) { if (hdr->typev1 == SMC_TYPE_N) return false; diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 5fd6f5b8ef034..ac8de6a177fac 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -336,8 +336,12 @@ struct smc_clc_msg_decline_v2 { /* clc decline message */ static inline struct smc_clc_msg_proposal_prefix * smc_clc_proposal_get_prefix(struct smc_clc_msg_proposal *pclc) { + u16 offset = ntohs(pclc->iparea_offset); + + if (offset > sizeof(struct smc_clc_msg_smcd)) + return NULL; return (struct smc_clc_msg_proposal_prefix *) - ((u8 *)pclc + sizeof(*pclc) + ntohs(pclc->iparea_offset)); + ((u8 *)pclc + sizeof(*pclc) + offset); } static inline bool smcr_indicated(int smc_type) From 7863c9f3d24ba49dbead7e03dfbe40deb5888fdf Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Wed, 11 Dec 2024 17:21:19 +0800 Subject: [PATCH 264/330] net/smc: check v2_ext_offset/eid_cnt/ism_gid_cnt when receiving proposal msg When receiving proposal msg in server, the fields v2_ext_offset/ eid_cnt/ism_gid_cnt in proposal msg are from the remote client and can not be fully trusted. Especially the field v2_ext_offset, once exceed the max value, there has the chance to access wrong address, and crash may happen. This patch checks the fields v2_ext_offset/eid_cnt/ism_gid_cnt before using them. Fixes: 8c3dca341aea ("net/smc: build and send V2 CLC proposal") Signed-off-by: Guangguan Wang Reviewed-by: Wen Gu Reviewed-by: D. Wythe Signed-off-by: David S. Miller --- net/smc/af_smc.c | 3 ++- net/smc/smc_clc.c | 8 +++++++- net/smc/smc_clc.h | 8 +++++++- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 9a74c9693f09e..5d96f9de5b5df 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2276,7 +2276,8 @@ static void smc_find_rdma_v2_device_serv(struct smc_sock *new_smc, goto not_found; smc_v2_ext = smc_get_clc_v2_ext(pclc); - if (!smc_clc_match_eid(ini->negotiated_eid, smc_v2_ext, NULL, NULL)) + if (!smc_v2_ext || + !smc_clc_match_eid(ini->negotiated_eid, smc_v2_ext, NULL, NULL)) goto not_found; /* prepare RDMA check */ diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 66a43b97eede0..f721d03efcbd7 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -352,7 +352,6 @@ static bool smc_clc_msg_prop_valid(struct smc_clc_msg_proposal *pclc) struct smc_clc_msg_hdr *hdr = &pclc->hdr; struct smc_clc_v2_extension *v2_ext; - v2_ext = smc_get_clc_v2_ext(pclc); pclc_prfx = smc_clc_proposal_get_prefix(pclc); if (!pclc_prfx || pclc_prfx->ipv6_prefixes_cnt > SMC_CLC_MAX_V6_PREFIX) @@ -369,6 +368,13 @@ static bool smc_clc_msg_prop_valid(struct smc_clc_msg_proposal *pclc) sizeof(struct smc_clc_msg_trail)) return false; } else { + v2_ext = smc_get_clc_v2_ext(pclc); + if ((hdr->typev2 != SMC_TYPE_N && + (!v2_ext || v2_ext->hdr.eid_cnt > SMC_CLC_MAX_UEID)) || + (smcd_indicated(hdr->typev2) && + v2_ext->hdr.ism_gid_cnt > SMCD_CLC_MAX_V2_GID_ENTRIES)) + return false; + if (ntohs(hdr->length) != sizeof(*pclc) + sizeof(struct smc_clc_msg_smcd) + diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index ac8de6a177fac..23afa4df862ed 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -380,8 +380,14 @@ static inline struct smc_clc_v2_extension * smc_get_clc_v2_ext(struct smc_clc_msg_proposal *prop) { struct smc_clc_msg_smcd *prop_smcd = smc_get_clc_msg_smcd(prop); + u16 max_offset; - if (!prop_smcd || !ntohs(prop_smcd->v2_ext_offset)) + max_offset = offsetof(struct smc_clc_msg_proposal_area, pclc_v2_ext) - + offsetof(struct smc_clc_msg_proposal_area, pclc_smcd) - + offsetofend(struct smc_clc_msg_smcd, v2_ext_offset); + + if (!prop_smcd || !ntohs(prop_smcd->v2_ext_offset) || + ntohs(prop_smcd->v2_ext_offset) > max_offset) return NULL; return (struct smc_clc_v2_extension *) From 9ab332deb671d8f7e66d82a2ff2b3f715bc3a4ad Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Wed, 11 Dec 2024 17:21:20 +0800 Subject: [PATCH 265/330] net/smc: check smcd_v2_ext_offset when receiving proposal msg When receiving proposal msg in server, the field smcd_v2_ext_offset in proposal msg is from the remote client and can not be fully trusted. Once the value of smcd_v2_ext_offset exceed the max value, there has the chance to access wrong address, and crash may happen. This patch checks the value of smcd_v2_ext_offset before using it. Fixes: 5c21c4ccafe8 ("net/smc: determine accepted ISM devices") Signed-off-by: Guangguan Wang Reviewed-by: Wen Gu Reviewed-by: D. Wythe Signed-off-by: David S. Miller --- net/smc/af_smc.c | 2 ++ net/smc/smc_clc.h | 8 +++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 5d96f9de5b5df..6cc7b846cff1b 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2147,6 +2147,8 @@ static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc, pclc_smcd = smc_get_clc_msg_smcd(pclc); smc_v2_ext = smc_get_clc_v2_ext(pclc); smcd_v2_ext = smc_get_clc_smcd_v2_ext(smc_v2_ext); + if (!pclc_smcd || !smc_v2_ext || !smcd_v2_ext) + goto not_found; mutex_lock(&smcd_dev_list.mutex); if (pclc_smcd->ism.chid) { diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 23afa4df862ed..7672899254108 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -400,9 +400,15 @@ smc_get_clc_v2_ext(struct smc_clc_msg_proposal *prop) static inline struct smc_clc_smcd_v2_extension * smc_get_clc_smcd_v2_ext(struct smc_clc_v2_extension *prop_v2ext) { + u16 max_offset = offsetof(struct smc_clc_msg_proposal_area, pclc_smcd_v2_ext) - + offsetof(struct smc_clc_msg_proposal_area, pclc_v2_ext) - + offsetof(struct smc_clc_v2_extension, hdr) - + offsetofend(struct smc_clnt_opts_area_hdr, smcd_v2_ext_offset); + if (!prop_v2ext) return NULL; - if (!ntohs(prop_v2ext->hdr.smcd_v2_ext_offset)) + if (!ntohs(prop_v2ext->hdr.smcd_v2_ext_offset) || + ntohs(prop_v2ext->hdr.smcd_v2_ext_offset) > max_offset) return NULL; return (struct smc_clc_smcd_v2_extension *) From c5b8ee5022a19464783058dc6042e8eefa34e8cd Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Wed, 11 Dec 2024 17:21:21 +0800 Subject: [PATCH 266/330] net/smc: check return value of sock_recvmsg when draining clc data When receiving clc msg, the field length in smc_clc_msg_hdr indicates the length of msg should be received from network and the value should not be fully trusted as it is from the network. Once the value of length exceeds the value of buflen in function smc_clc_wait_msg it may run into deadloop when trying to drain the remaining data exceeding buflen. This patch checks the return value of sock_recvmsg when draining data in case of deadloop in draining. Fixes: fb4f79264c0f ("net/smc: tolerate future SMCD versions") Signed-off-by: Guangguan Wang Reviewed-by: Wen Gu Reviewed-by: D. Wythe Signed-off-by: David S. Miller --- net/smc/smc_clc.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index f721d03efcbd7..521f5df80e10c 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -774,6 +774,11 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, SMC_CLC_RECV_BUF_LEN : datlen; iov_iter_kvec(&msg.msg_iter, ITER_DEST, &vec, 1, recvlen); len = sock_recvmsg(smc->clcsock, &msg, krflags); + if (len < recvlen) { + smc->sk.sk_err = EPROTO; + reason_code = -EPROTO; + goto out; + } datlen -= len; } if (clcm->type == SMC_CLC_DECLINE) { From 2d5df3a680ffdaf606baa10636bdb1daf757832e Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 12 Dec 2024 18:55:45 +0200 Subject: [PATCH 267/330] net: mscc: ocelot: fix incorrect IFH SRC_PORT field in ocelot_ifh_set_basic() Packets injected by the CPU should have a SRC_PORT field equal to the CPU port module index in the Analyzer block (ocelot->num_phys_ports). The blamed commit copied the ocelot_ifh_set_basic() call incorrectly from ocelot_xmit_common() in net/dsa/tag_ocelot.c. Instead of calling with "x", it calls with BIT_ULL(x), but the field is not a port mask, but rather a single port index. [ side note: this is the technical debt of code duplication :( ] The error used to be silent and doesn't appear to have other user-visible manifestations, but with new changes in the packing library, it now fails loudly as follows: ------------[ cut here ]------------ Cannot store 0x40 inside bits 46-43 - will truncate sja1105 spi2.0: xmit timed out WARNING: CPU: 1 PID: 102 at lib/packing.c:98 __pack+0x90/0x198 sja1105 spi2.0: timed out polling for tstamp CPU: 1 UID: 0 PID: 102 Comm: felix_xmit Tainted: G W N 6.13.0-rc1-00372-gf706b85d972d-dirty #2605 Call trace: __pack+0x90/0x198 (P) __pack+0x90/0x198 (L) packing+0x78/0x98 ocelot_ifh_set_basic+0x260/0x368 ocelot_port_inject_frame+0xa8/0x250 felix_port_deferred_xmit+0x14c/0x258 kthread_worker_fn+0x134/0x350 kthread+0x114/0x138 The code path pertains to the ocelot switchdev driver and to the felix secondary DSA tag protocol, ocelot-8021q. Here seen with ocelot-8021q. The messenger (packing) is not really to blame, so fix the original commit instead. Fixes: e1b9e80236c5 ("net: mscc: ocelot: fix QoS class for injected packets with "ocelot-8021q"") Signed-off-by: Vladimir Oltean Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241212165546.879567-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mscc/ocelot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index 3d72aa7b13050..ef93df5208871 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -1432,7 +1432,7 @@ void ocelot_ifh_set_basic(void *ifh, struct ocelot *ocelot, int port, memset(ifh, 0, OCELOT_TAG_LEN); ocelot_ifh_set_bypass(ifh, 1); - ocelot_ifh_set_src(ifh, BIT_ULL(ocelot->num_phys_ports)); + ocelot_ifh_set_src(ifh, ocelot->num_phys_ports); ocelot_ifh_set_dest(ifh, BIT_ULL(port)); ocelot_ifh_set_qos_class(ifh, qos_class); ocelot_ifh_set_tag_type(ifh, tag_type); From ee76746387f6233bdfa93d7406990f923641568f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 13 Dec 2024 17:25:18 +0000 Subject: [PATCH 268/330] netdevsim: prevent bad user input in nsim_dev_health_break_write() If either a zero count or a large one is provided, kernel can crash. Fixes: 82c93a87bf8b ("netdevsim: implement couple of testing devlink health reporters") Reported-by: syzbot+ea40e4294e58b0292f74@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/675c6862.050a0220.37aaf.00b1.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Cc: Jiri Pirko Reviewed-by: Joe Damato Link: https://patch.msgid.link/20241213172518.2415666-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/net/netdevsim/health.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/netdevsim/health.c b/drivers/net/netdevsim/health.c index 70e8bdf34be90..688f05316b5e1 100644 --- a/drivers/net/netdevsim/health.c +++ b/drivers/net/netdevsim/health.c @@ -149,6 +149,8 @@ static ssize_t nsim_dev_health_break_write(struct file *file, char *break_msg; int err; + if (count == 0 || count > PAGE_SIZE) + return -EINVAL; break_msg = memdup_user_nul(data, count); if (IS_ERR(break_msg)) return PTR_ERR(break_msg); From 663ad7481f068057f6f692c5368c47150e855370 Mon Sep 17 00:00:00 2001 From: Donald Hunter Date: Fri, 13 Dec 2024 13:07:11 +0000 Subject: [PATCH 269/330] tools/net/ynl: fix sub-message key lookup for nested attributes Use the correct attribute space for sub-message key lookup in nested attributes when adding attributes. This fixes rt_link where the "kind" key and "data" sub-message are nested attributes in "linkinfo". For example: ./tools/net/ynl/cli.py \ --create \ --spec Documentation/netlink/specs/rt_link.yaml \ --do newlink \ --json '{"link": 99, "linkinfo": { "kind": "vlan", "data": {"id": 4 } } }' Signed-off-by: Donald Hunter Fixes: ab463c4342d1 ("tools/net/ynl: Add support for encoding sub-messages") Link: https://patch.msgid.link/20241213130711.40267-1-donald.hunter@gmail.com Signed-off-by: Jakub Kicinski --- tools/net/ynl/lib/ynl.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/net/ynl/lib/ynl.py b/tools/net/ynl/lib/ynl.py index 01ec01a90e763..eea29359a899a 100644 --- a/tools/net/ynl/lib/ynl.py +++ b/tools/net/ynl/lib/ynl.py @@ -556,10 +556,10 @@ def _add_attr(self, space, name, value, search_attrs): if attr["type"] == 'nest': nl_type |= Netlink.NLA_F_NESTED attr_payload = b'' - sub_attrs = SpaceAttrs(self.attr_sets[space], value, search_attrs) + sub_space = attr['nested-attributes'] + sub_attrs = SpaceAttrs(self.attr_sets[sub_space], value, search_attrs) for subname, subvalue in value.items(): - attr_payload += self._add_attr(attr['nested-attributes'], - subname, subvalue, sub_attrs) + attr_payload += self._add_attr(sub_space, subname, subvalue, sub_attrs) elif attr["type"] == 'flag': if not value: # If value is absent or false then skip attribute creation. From 9590d32e090ea2751e131ae5273859ca22f5ac14 Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Thu, 12 Dec 2024 13:31:55 -0800 Subject: [PATCH 270/330] ionic: Fix netdev notifier unregister on failure If register_netdev() fails, then the driver leaks the netdev notifier. Fix this by calling ionic_lif_unregister() on register_netdev() failure. This will also call ionic_lif_unregister_phc() if it has already been registered. Fixes: 30b87ab4c0b3 ("ionic: remove lif list concept") Signed-off-by: Brett Creeley Signed-off-by: Shannon Nelson Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20241212213157.12212-2-shannon.nelson@amd.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/pensando/ionic/ionic_lif.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.c b/drivers/net/ethernet/pensando/ionic/ionic_lif.c index 40496587b2b31..3d3f936779f7d 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_lif.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.c @@ -3869,8 +3869,8 @@ int ionic_lif_register(struct ionic_lif *lif) /* only register LIF0 for now */ err = register_netdev(lif->netdev); if (err) { - dev_err(lif->ionic->dev, "Cannot register net device, aborting\n"); - ionic_lif_unregister_phc(lif); + dev_err(lif->ionic->dev, "Cannot register net device: %d, aborting\n", err); + ionic_lif_unregister(lif); return err; } From 746e6ae2e202b062b9deee7bd86d94937997ecd7 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Thu, 12 Dec 2024 13:31:56 -0800 Subject: [PATCH 271/330] ionic: no double destroy workqueue There are some FW error handling paths that can cause us to try to destroy the workqueue more than once, so let's be sure we're checking for that. The case where this popped up was in an AER event where the handlers got called in such a way that ionic_reset_prepare() and thus ionic_dev_teardown() got called twice in a row. The second time through the workqueue was already destroyed, and destroy_workqueue() choked on the bad wq pointer. We didn't hit this in AER handler testing before because at that time we weren't using a private workqueue. Later we replaced the use of the system workqueue with our own private workqueue but hadn't rerun the AER handler testing since then. Fixes: 9e25450da700 ("ionic: add private workqueue per-device") Signed-off-by: Shannon Nelson Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20241212213157.12212-3-shannon.nelson@amd.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/pensando/ionic/ionic_dev.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_dev.c b/drivers/net/ethernet/pensando/ionic/ionic_dev.c index 9e42d599840de..57edcde9e6f8c 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_dev.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_dev.c @@ -277,7 +277,10 @@ void ionic_dev_teardown(struct ionic *ionic) idev->phy_cmb_pages = 0; idev->cmb_npages = 0; - destroy_workqueue(ionic->wq); + if (ionic->wq) { + destroy_workqueue(ionic->wq); + ionic->wq = NULL; + } mutex_destroy(&idev->cmb_inuse_lock); } From b096d62ba1323391b2db98b7704e2468cf3b1588 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Thu, 12 Dec 2024 13:31:57 -0800 Subject: [PATCH 272/330] ionic: use ee->offset when returning sprom data Some calls into ionic_get_module_eeprom() don't use a single full buffer size, but instead multiple calls with an offset. Teach our driver to use the offset correctly so we can respond appropriately to the caller. Fixes: 4d03e00a2140 ("ionic: Add initial ethtool support") Signed-off-by: Shannon Nelson Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20241212213157.12212-4-shannon.nelson@amd.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/pensando/ionic/ionic_ethtool.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c b/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c index dda22fa4448cf..9b7f78b6cdb1e 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c @@ -961,8 +961,8 @@ static int ionic_get_module_eeprom(struct net_device *netdev, len = min_t(u32, sizeof(xcvr->sprom), ee->len); do { - memcpy(data, xcvr->sprom, len); - memcpy(tbuf, xcvr->sprom, len); + memcpy(data, &xcvr->sprom[ee->offset], len); + memcpy(tbuf, &xcvr->sprom[ee->offset], len); /* Let's make sure we got a consistent copy */ if (!memcmp(data, tbuf, len)) From 282da38b465395c930687974627c24f47ddce5ff Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Tue, 10 Dec 2024 12:35:34 +0100 Subject: [PATCH 273/330] s390/mm: Consider KMSAN modules metadata for paging levels The calculation determining whether to use three- or four-level paging didn't account for KMSAN modules metadata. Include this metadata in the virtual memory size calculation to ensure correct paging mode selection and avoiding potentially unnecessary physical memory size limitations. Fixes: 65ca73f9fb36 ("s390/mm: define KMSAN metadata for vmalloc and modules") Acked-by: Heiko Carstens Reviewed-by: Alexander Gordeev Reviewed-by: Ilya Leoshkevich Signed-off-by: Vasily Gorbik Signed-off-by: Alexander Gordeev --- arch/s390/boot/startup.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index abe6e6c0ab986..6087d38c72351 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -234,6 +234,8 @@ static unsigned long get_vmem_size(unsigned long identity_size, vsize = round_up(SZ_2G + max_mappable, rte_size) + round_up(vmemmap_size, rte_size) + FIXMAP_SIZE + MODULES_LEN + KASLR_LEN; + if (IS_ENABLED(CONFIG_KMSAN)) + vsize += MODULES_LEN * 2; return size_add(vsize, vmalloc_size); } From 922b4b955a03d19fea98938f33ef0e62d01f5159 Mon Sep 17 00:00:00 2001 From: Nikita Yushchenko Date: Thu, 12 Dec 2024 11:25:58 +0500 Subject: [PATCH 274/330] net: renesas: rswitch: rework ts tags management The existing linked list based implementation of how ts tags are assigned and managed is unsafe against concurrency and corner cases: - element addition in tx processing can race against element removal in ts queue completion, - element removal in ts queue completion can race against element removal in device close, - if a large number of frames gets added to tx queue without ts queue completions in between, elements with duplicate tag values can get added. Use a different implementation, based on per-port used tags bitmaps and saved skb arrays. Safety for addition in tx processing vs removal in ts completion is provided by: tag = find_first_zero_bit(...); smp_mb(); ts_skb[tag]> set_bit(...); vs ts_skb[tag]> smp_mb(); clear_bit(...); Safety for removal in ts completion vs removal in device close is provided by using atomic read-and-clear for rdev->ts_skb[tag]: ts_skb = xchg(&rdev->ts_skb[tag], NULL); if (ts_skb) Fixes: 33f5d733b589 ("net: renesas: rswitch: Improve TX timestamp accuracy") Signed-off-by: Nikita Yushchenko Link: https://patch.msgid.link/20241212062558.436455-1-nikita.yoush@cogentembedded.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/renesas/rswitch.c | 74 ++++++++++++++------------ drivers/net/ethernet/renesas/rswitch.h | 13 ++--- 2 files changed, 42 insertions(+), 45 deletions(-) diff --git a/drivers/net/ethernet/renesas/rswitch.c b/drivers/net/ethernet/renesas/rswitch.c index dbbbf024e7abc..9ac6e2aad18f6 100644 --- a/drivers/net/ethernet/renesas/rswitch.c +++ b/drivers/net/ethernet/renesas/rswitch.c @@ -547,7 +547,6 @@ static int rswitch_gwca_ts_queue_alloc(struct rswitch_private *priv) desc = &gq->ts_ring[gq->ring_size]; desc->desc.die_dt = DT_LINKFIX; rswitch_desc_set_dptr(&desc->desc, gq->ring_dma); - INIT_LIST_HEAD(&priv->gwca.ts_info_list); return 0; } @@ -1003,9 +1002,10 @@ static int rswitch_gwca_request_irqs(struct rswitch_private *priv) static void rswitch_ts(struct rswitch_private *priv) { struct rswitch_gwca_queue *gq = &priv->gwca.ts_queue; - struct rswitch_gwca_ts_info *ts_info, *ts_info2; struct skb_shared_hwtstamps shhwtstamps; struct rswitch_ts_desc *desc; + struct rswitch_device *rdev; + struct sk_buff *ts_skb; struct timespec64 ts; unsigned int num; u32 tag, port; @@ -1015,23 +1015,28 @@ static void rswitch_ts(struct rswitch_private *priv) dma_rmb(); port = TS_DESC_DPN(__le32_to_cpu(desc->desc.dptrl)); - tag = TS_DESC_TSUN(__le32_to_cpu(desc->desc.dptrl)); - - list_for_each_entry_safe(ts_info, ts_info2, &priv->gwca.ts_info_list, list) { - if (!(ts_info->port == port && ts_info->tag == tag)) - continue; - - memset(&shhwtstamps, 0, sizeof(shhwtstamps)); - ts.tv_sec = __le32_to_cpu(desc->ts_sec); - ts.tv_nsec = __le32_to_cpu(desc->ts_nsec & cpu_to_le32(0x3fffffff)); - shhwtstamps.hwtstamp = timespec64_to_ktime(ts); - skb_tstamp_tx(ts_info->skb, &shhwtstamps); - dev_consume_skb_irq(ts_info->skb); - list_del(&ts_info->list); - kfree(ts_info); - break; - } + if (unlikely(port >= RSWITCH_NUM_PORTS)) + goto next; + rdev = priv->rdev[port]; + tag = TS_DESC_TSUN(__le32_to_cpu(desc->desc.dptrl)); + if (unlikely(tag >= TS_TAGS_PER_PORT)) + goto next; + ts_skb = xchg(&rdev->ts_skb[tag], NULL); + smp_mb(); /* order rdev->ts_skb[] read before bitmap update */ + clear_bit(tag, rdev->ts_skb_used); + + if (unlikely(!ts_skb)) + goto next; + + memset(&shhwtstamps, 0, sizeof(shhwtstamps)); + ts.tv_sec = __le32_to_cpu(desc->ts_sec); + ts.tv_nsec = __le32_to_cpu(desc->ts_nsec & cpu_to_le32(0x3fffffff)); + shhwtstamps.hwtstamp = timespec64_to_ktime(ts); + skb_tstamp_tx(ts_skb, &shhwtstamps); + dev_consume_skb_irq(ts_skb); + +next: gq->cur = rswitch_next_queue_index(gq, true, 1); desc = &gq->ts_ring[gq->cur]; } @@ -1576,8 +1581,9 @@ static int rswitch_open(struct net_device *ndev) static int rswitch_stop(struct net_device *ndev) { struct rswitch_device *rdev = netdev_priv(ndev); - struct rswitch_gwca_ts_info *ts_info, *ts_info2; + struct sk_buff *ts_skb; unsigned long flags; + unsigned int tag; netif_tx_stop_all_queues(ndev); @@ -1594,12 +1600,13 @@ static int rswitch_stop(struct net_device *ndev) if (bitmap_empty(rdev->priv->opened_ports, RSWITCH_NUM_PORTS)) iowrite32(GWCA_TS_IRQ_BIT, rdev->priv->addr + GWTSDID); - list_for_each_entry_safe(ts_info, ts_info2, &rdev->priv->gwca.ts_info_list, list) { - if (ts_info->port != rdev->port) - continue; - dev_kfree_skb_irq(ts_info->skb); - list_del(&ts_info->list); - kfree(ts_info); + for (tag = find_first_bit(rdev->ts_skb_used, TS_TAGS_PER_PORT); + tag < TS_TAGS_PER_PORT; + tag = find_next_bit(rdev->ts_skb_used, TS_TAGS_PER_PORT, tag + 1)) { + ts_skb = xchg(&rdev->ts_skb[tag], NULL); + clear_bit(tag, rdev->ts_skb_used); + if (ts_skb) + dev_kfree_skb(ts_skb); } return 0; @@ -1612,20 +1619,17 @@ static bool rswitch_ext_desc_set_info1(struct rswitch_device *rdev, desc->info1 = cpu_to_le64(INFO1_DV(BIT(rdev->etha->index)) | INFO1_IPV(GWCA_IPV_NUM) | INFO1_FMT); if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) { - struct rswitch_gwca_ts_info *ts_info; + unsigned int tag; - ts_info = kzalloc(sizeof(*ts_info), GFP_ATOMIC); - if (!ts_info) + tag = find_first_zero_bit(rdev->ts_skb_used, TS_TAGS_PER_PORT); + if (tag == TS_TAGS_PER_PORT) return false; + smp_mb(); /* order bitmap read before rdev->ts_skb[] write */ + rdev->ts_skb[tag] = skb_get(skb); + set_bit(tag, rdev->ts_skb_used); skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; - rdev->ts_tag++; - desc->info1 |= cpu_to_le64(INFO1_TSUN(rdev->ts_tag) | INFO1_TXC); - - ts_info->skb = skb_get(skb); - ts_info->port = rdev->port; - ts_info->tag = rdev->ts_tag; - list_add_tail(&ts_info->list, &rdev->priv->gwca.ts_info_list); + desc->info1 |= cpu_to_le64(INFO1_TSUN(tag) | INFO1_TXC); skb_tx_timestamp(skb); } diff --git a/drivers/net/ethernet/renesas/rswitch.h b/drivers/net/ethernet/renesas/rswitch.h index e020800dcc570..d8d4ed7d7f8b6 100644 --- a/drivers/net/ethernet/renesas/rswitch.h +++ b/drivers/net/ethernet/renesas/rswitch.h @@ -972,14 +972,6 @@ struct rswitch_gwca_queue { }; }; -struct rswitch_gwca_ts_info { - struct sk_buff *skb; - struct list_head list; - - int port; - u8 tag; -}; - #define RSWITCH_NUM_IRQ_REGS (RSWITCH_MAX_NUM_QUEUES / BITS_PER_TYPE(u32)) struct rswitch_gwca { unsigned int index; @@ -989,7 +981,6 @@ struct rswitch_gwca { struct rswitch_gwca_queue *queues; int num_queues; struct rswitch_gwca_queue ts_queue; - struct list_head ts_info_list; DECLARE_BITMAP(used, RSWITCH_MAX_NUM_QUEUES); u32 tx_irq_bits[RSWITCH_NUM_IRQ_REGS]; u32 rx_irq_bits[RSWITCH_NUM_IRQ_REGS]; @@ -997,6 +988,7 @@ struct rswitch_gwca { }; #define NUM_QUEUES_PER_NDEV 2 +#define TS_TAGS_PER_PORT 256 struct rswitch_device { struct rswitch_private *priv; struct net_device *ndev; @@ -1004,7 +996,8 @@ struct rswitch_device { void __iomem *addr; struct rswitch_gwca_queue *tx_queue; struct rswitch_gwca_queue *rx_queue; - u8 ts_tag; + struct sk_buff *ts_skb[TS_TAGS_PER_PORT]; + DECLARE_BITMAP(ts_skb_used, TS_TAGS_PER_PORT); bool disabled; int port; From 78d4f34e2115b517bcbfe7ec0d018bbbb6f9b0b8 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 15 Dec 2024 15:58:23 -0800 Subject: [PATCH 275/330] Linux 6.13-rc3 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 64c594bd7ad03..e5b8a8832c0ca 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 13 SUBLEVEL = 0 -EXTRAVERSION = -rc2 +EXTRAVERSION = -rc3 NAME = Baby Opossum Posse # *DOCUMENTATION* From 900f83cf376bdaf798b6f5dcb2eae0c822e908b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thi=C3=A9baud=20Weksteen?= Date: Thu, 5 Dec 2024 12:09:19 +1100 Subject: [PATCH 276/330] selinux: ignore unknown extended permissions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When evaluating extended permissions, ignore unknown permissions instead of calling BUG(). This commit ensures that future permissions can be added without interfering with older kernels. Cc: stable@vger.kernel.org Fixes: fa1aa143ac4a ("selinux: extended permissions for ioctls") Signed-off-by: Thiébaud Weksteen Signed-off-by: Paul Moore --- security/selinux/ss/services.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index 971c45d576ba1..3d5c563cfc4c8 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -979,7 +979,10 @@ void services_compute_xperms_decision(struct extended_perms_decision *xpermd, return; break; default: - BUG(); + pr_warn_once( + "SELinux: unknown extended permission (%u) will be ignored\n", + node->datum.u.xperms->specified); + return; } if (node->key.specified == AVTAB_XPERMS_ALLOWED) { @@ -998,7 +1001,8 @@ void services_compute_xperms_decision(struct extended_perms_decision *xpermd, &node->datum.u.xperms->perms, xpermd->dontaudit); } else { - BUG(); + pr_warn_once("SELinux: unknown specified key (%u)\n", + node->key.specified); } } From 83c47d9e0ce79b5d7c0b21b9f35402dbde0fa15c Mon Sep 17 00:00:00 2001 From: Marios Makassikis Date: Sat, 14 Dec 2024 12:16:45 +0900 Subject: [PATCH 277/330] ksmbd: count all requests in req_running counter This changes the semantics of req_running to count all in-flight requests on a given connection, rather than the number of elements in the conn->request list. The latter is used only in smb2_cancel, and the counter is not used Signed-off-by: Marios Makassikis Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/connection.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c index c14dd72e1b301..be9656d524a4c 100644 --- a/fs/smb/server/connection.c +++ b/fs/smb/server/connection.c @@ -120,8 +120,8 @@ void ksmbd_conn_enqueue_request(struct ksmbd_work *work) if (conn->ops->get_cmd_val(work) != SMB2_CANCEL_HE) requests_queue = &conn->requests; + atomic_inc(&conn->req_running); if (requests_queue) { - atomic_inc(&conn->req_running); spin_lock(&conn->request_lock); list_add_tail(&work->request_entry, requests_queue); spin_unlock(&conn->request_lock); @@ -132,11 +132,12 @@ void ksmbd_conn_try_dequeue_request(struct ksmbd_work *work) { struct ksmbd_conn *conn = work->conn; + atomic_dec(&conn->req_running); + if (list_empty(&work->request_entry) && list_empty(&work->async_request_entry)) return; - atomic_dec(&conn->req_running); spin_lock(&conn->request_lock); list_del_init(&work->request_entry); spin_unlock(&conn->request_lock); From 43fb7bce8866e793275c4f9f25af6a37745f3416 Mon Sep 17 00:00:00 2001 From: Marios Makassikis Date: Sat, 14 Dec 2024 12:17:23 +0900 Subject: [PATCH 278/330] ksmbd: fix broken transfers when exceeding max simultaneous operations Since commit 0a77d947f599 ("ksmbd: check outstanding simultaneous SMB operations"), ksmbd enforces a maximum number of simultaneous operations for a connection. The problem is that reaching the limit causes ksmbd to close the socket, and the client has no indication that it should have slowed down. This behaviour can be reproduced by setting "smb2 max credits = 128" (or lower), and transferring a large file (25GB). smbclient fails as below: $ smbclient //192.168.1.254/testshare -U user%pass smb: \> put file.bin cli_push returned NT_STATUS_USER_SESSION_DELETED putting file file.bin as \file.bin smb2cli_req_compound_submit: Insufficient credits. 0 available, 1 needed NT_STATUS_INTERNAL_ERROR closing remote file \file.bin smb: \> smb2cli_req_compound_submit: Insufficient credits. 0 available, 1 needed Windows clients fail with 0x8007003b (with smaller files even). Fix this by delaying reading from the socket until there's room to allocate a request. This effectively applies backpressure on the client, so the transfer completes, albeit at a slower rate. Fixes: 0a77d947f599 ("ksmbd: check outstanding simultaneous SMB operations") Signed-off-by: Marios Makassikis Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/connection.c | 13 +++++++++++-- fs/smb/server/connection.h | 1 - fs/smb/server/server.c | 7 +------ fs/smb/server/server.h | 1 + fs/smb/server/transport_ipc.c | 5 ++++- 5 files changed, 17 insertions(+), 10 deletions(-) diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c index be9656d524a4c..f8a40f65db6ae 100644 --- a/fs/smb/server/connection.c +++ b/fs/smb/server/connection.c @@ -70,7 +70,6 @@ struct ksmbd_conn *ksmbd_conn_alloc(void) atomic_set(&conn->req_running, 0); atomic_set(&conn->r_count, 0); atomic_set(&conn->refcnt, 1); - atomic_set(&conn->mux_smb_requests, 0); conn->total_credits = 1; conn->outstanding_credits = 0; @@ -133,6 +132,8 @@ void ksmbd_conn_try_dequeue_request(struct ksmbd_work *work) struct ksmbd_conn *conn = work->conn; atomic_dec(&conn->req_running); + if (waitqueue_active(&conn->req_running_q)) + wake_up(&conn->req_running_q); if (list_empty(&work->request_entry) && list_empty(&work->async_request_entry)) @@ -309,7 +310,7 @@ int ksmbd_conn_handler_loop(void *p) { struct ksmbd_conn *conn = (struct ksmbd_conn *)p; struct ksmbd_transport *t = conn->transport; - unsigned int pdu_size, max_allowed_pdu_size; + unsigned int pdu_size, max_allowed_pdu_size, max_req; char hdr_buf[4] = {0,}; int size; @@ -319,6 +320,7 @@ int ksmbd_conn_handler_loop(void *p) if (t->ops->prepare && t->ops->prepare(t)) goto out; + max_req = server_conf.max_inflight_req; conn->last_active = jiffies; set_freezable(); while (ksmbd_conn_alive(conn)) { @@ -328,6 +330,13 @@ int ksmbd_conn_handler_loop(void *p) kvfree(conn->request_buf); conn->request_buf = NULL; +recheck: + if (atomic_read(&conn->req_running) + 1 > max_req) { + wait_event_interruptible(conn->req_running_q, + atomic_read(&conn->req_running) < max_req); + goto recheck; + } + size = t->ops->read(t, hdr_buf, sizeof(hdr_buf), -1); if (size != sizeof(hdr_buf)) break; diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h index 8ddd5a3c7bafb..b379ae4fdcdff 100644 --- a/fs/smb/server/connection.h +++ b/fs/smb/server/connection.h @@ -107,7 +107,6 @@ struct ksmbd_conn { __le16 signing_algorithm; bool binding; atomic_t refcnt; - atomic_t mux_smb_requests; }; struct ksmbd_conn_ops { diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c index 3ba95bd8edeb2..601e7fcbcf1e6 100644 --- a/fs/smb/server/server.c +++ b/fs/smb/server/server.c @@ -270,7 +270,6 @@ static void handle_ksmbd_work(struct work_struct *wk) ksmbd_conn_try_dequeue_request(work); ksmbd_free_work_struct(work); - atomic_dec(&conn->mux_smb_requests); /* * Checking waitqueue to dropping pending requests on * disconnection. waitqueue_active is safe because it @@ -300,11 +299,6 @@ static int queue_ksmbd_work(struct ksmbd_conn *conn) if (err) return 0; - if (atomic_inc_return(&conn->mux_smb_requests) >= conn->vals->max_credits) { - atomic_dec_return(&conn->mux_smb_requests); - return -ENOSPC; - } - work = ksmbd_alloc_work_struct(); if (!work) { pr_err("allocation for work failed\n"); @@ -367,6 +361,7 @@ static int server_conf_init(void) server_conf.auth_mechs |= KSMBD_AUTH_KRB5 | KSMBD_AUTH_MSKRB5; #endif + server_conf.max_inflight_req = SMB2_MAX_CREDITS; return 0; } diff --git a/fs/smb/server/server.h b/fs/smb/server/server.h index 4fc529335271f..94187628ff089 100644 --- a/fs/smb/server/server.h +++ b/fs/smb/server/server.h @@ -42,6 +42,7 @@ struct ksmbd_server_config { struct smb_sid domain_sid; unsigned int auth_mechs; unsigned int max_connections; + unsigned int max_inflight_req; char *conf[SERVER_CONF_WORK_GROUP + 1]; struct task_struct *dh_task; diff --git a/fs/smb/server/transport_ipc.c b/fs/smb/server/transport_ipc.c index 48cda3350e5a2..befaf42b84cc3 100644 --- a/fs/smb/server/transport_ipc.c +++ b/fs/smb/server/transport_ipc.c @@ -319,8 +319,11 @@ static int ipc_server_config_on_startup(struct ksmbd_startup_request *req) init_smb2_max_write_size(req->smb2_max_write); if (req->smb2_max_trans) init_smb2_max_trans_size(req->smb2_max_trans); - if (req->smb2_max_credits) + if (req->smb2_max_credits) { init_smb2_max_credits(req->smb2_max_credits); + server_conf.max_inflight_req = + req->smb2_max_credits; + } if (req->smbd_max_io_size) init_smbd_max_io_size(req->smbd_max_io_size); From fe4ed2f09b492e3507615a053814daa8fafdecb1 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sat, 14 Dec 2024 12:19:03 +0900 Subject: [PATCH 279/330] ksmbd: conn lock to serialize smb2 negotiate If client send parallel smb2 negotiate request on same connection, ksmbd_conn can be racy. smb2 negotiate handling that are not performance-related can be serialized with conn lock. Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 803b35b89513e..23e21845f9286 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -1097,6 +1097,7 @@ int smb2_handle_negotiate(struct ksmbd_work *work) return rc; } + ksmbd_conn_lock(conn); smb2_buf_len = get_rfc1002_len(work->request_buf); smb2_neg_size = offsetof(struct smb2_negotiate_req, Dialects); if (smb2_neg_size > smb2_buf_len) { @@ -1247,6 +1248,7 @@ int smb2_handle_negotiate(struct ksmbd_work *work) ksmbd_conn_set_need_negotiate(conn); err_out: + ksmbd_conn_unlock(conn); if (rc) rsp->hdr.Status = STATUS_INSUFFICIENT_RESOURCES; From 7b00af2c5414dc01e0718deef7ead81102867636 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Mon, 16 Dec 2024 20:53:08 +0800 Subject: [PATCH 280/330] erofs: use `struct erofs_device_info` for the primary device Instead of just listing each one directly in `struct erofs_sb_info` except that we still use `sb->s_bdev` for the primary block device. Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20241216125310.930933-2-hsiangkao@linux.alibaba.com --- fs/erofs/data.c | 12 ++++-------- fs/erofs/fscache.c | 6 +++--- fs/erofs/internal.h | 8 ++------ fs/erofs/super.c | 27 +++++++++++++-------------- 4 files changed, 22 insertions(+), 31 deletions(-) diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 1c49f8962021f..622017c659587 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -56,10 +56,10 @@ void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb) buf->file = NULL; if (erofs_is_fileio_mode(sbi)) { - buf->file = sbi->fdev; /* some fs like FUSE needs it */ + buf->file = sbi->dif0.file; /* some fs like FUSE needs it */ buf->mapping = buf->file->f_mapping; } else if (erofs_is_fscache_mode(sb)) - buf->mapping = sbi->s_fscache->inode->i_mapping; + buf->mapping = sbi->dif0.fscache->inode->i_mapping; else buf->mapping = sb->s_bdev->bd_mapping; } @@ -201,12 +201,8 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) erofs_off_t startoff, length; int id; - map->m_bdev = sb->s_bdev; - map->m_daxdev = EROFS_SB(sb)->dax_dev; - map->m_dax_part_off = EROFS_SB(sb)->dax_part_off; - map->m_fscache = EROFS_SB(sb)->s_fscache; - map->m_fp = EROFS_SB(sb)->fdev; - + erofs_fill_from_devinfo(map, &EROFS_SB(sb)->dif0); + map->m_bdev = sb->s_bdev; /* use s_bdev for the primary device */ if (map->m_deviceid) { down_read(&devs->rwsem); dif = idr_find(&devs->tree, map->m_deviceid - 1); diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index fda16eedafb57..ce7e38c827198 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -657,7 +657,7 @@ int erofs_fscache_register_fs(struct super_block *sb) if (IS_ERR(fscache)) return PTR_ERR(fscache); - sbi->s_fscache = fscache; + sbi->dif0.fscache = fscache; return 0; } @@ -665,14 +665,14 @@ void erofs_fscache_unregister_fs(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); - erofs_fscache_unregister_cookie(sbi->s_fscache); + erofs_fscache_unregister_cookie(sbi->dif0.fscache); if (sbi->domain) erofs_fscache_domain_put(sbi->domain); else fscache_relinquish_volume(sbi->volume, NULL, false); - sbi->s_fscache = NULL; + sbi->dif0.fscache = NULL; sbi->volume = NULL; sbi->domain = NULL; } diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 1c847c30a918f..3e8d71d516f42 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -107,6 +107,7 @@ struct erofs_xattr_prefix_item { }; struct erofs_sb_info { + struct erofs_device_info dif0; struct erofs_mount_opts opt; /* options */ #ifdef CONFIG_EROFS_FS_ZIP /* list for all registered superblocks, mainly for shrinker */ @@ -124,13 +125,9 @@ struct erofs_sb_info { struct erofs_sb_lz4_info lz4; #endif /* CONFIG_EROFS_FS_ZIP */ - struct file *fdev; struct inode *packed_inode; struct erofs_dev_context *devs; - struct dax_device *dax_dev; - u64 dax_part_off; u64 total_blocks; - u32 primarydevice_blocks; u32 meta_blkaddr; #ifdef CONFIG_EROFS_FS_XATTR @@ -166,7 +163,6 @@ struct erofs_sb_info { /* fscache support */ struct fscache_volume *volume; - struct erofs_fscache *s_fscache; struct erofs_domain *domain; char *fsid; char *domain_id; @@ -187,7 +183,7 @@ struct erofs_sb_info { static inline bool erofs_is_fileio_mode(struct erofs_sb_info *sbi) { - return IS_ENABLED(CONFIG_EROFS_FS_BACKED_BY_FILE) && sbi->fdev; + return IS_ENABLED(CONFIG_EROFS_FS_BACKED_BY_FILE) && sbi->dif0.file; } static inline bool erofs_is_fscache_mode(struct super_block *sb) diff --git a/fs/erofs/super.c b/fs/erofs/super.c index de8e3ecc6381d..9044907354e12 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -203,7 +203,7 @@ static int erofs_scan_devices(struct super_block *sb, struct erofs_device_info *dif; int id, err = 0; - sbi->total_blocks = sbi->primarydevice_blocks; + sbi->total_blocks = sbi->dif0.blocks; if (!erofs_sb_has_device_table(sbi)) ondisk_extradevs = 0; else @@ -307,7 +307,7 @@ static int erofs_read_superblock(struct super_block *sb) sbi->sb_size); goto out; } - sbi->primarydevice_blocks = le32_to_cpu(dsb->blocks); + sbi->dif0.blocks = le32_to_cpu(dsb->blocks); sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr); #ifdef CONFIG_EROFS_FS_XATTR sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr); @@ -602,9 +602,8 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) return -EINVAL; } - sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev, - &sbi->dax_part_off, - NULL, NULL); + sbi->dif0.dax_dev = fs_dax_get_by_bdev(sb->s_bdev, + &sbi->dif0.dax_part_off, NULL, NULL); } err = erofs_read_superblock(sb); @@ -627,7 +626,7 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) } if (test_opt(&sbi->opt, DAX_ALWAYS)) { - if (!sbi->dax_dev) { + if (!sbi->dif0.dax_dev) { errorfc(fc, "DAX unsupported by block device. Turning off DAX."); clear_opt(&sbi->opt, DAX_ALWAYS); } else if (sbi->blkszbits != PAGE_SHIFT) { @@ -707,14 +706,13 @@ static int erofs_fc_get_tree(struct fs_context *fc) if (!fc->source) return invalf(fc, "No source specified"); - file = filp_open(fc->source, O_RDONLY | O_LARGEFILE, 0); if (IS_ERR(file)) return PTR_ERR(file); - sbi->fdev = file; + sbi->dif0.file = file; - if (S_ISREG(file_inode(sbi->fdev)->i_mode) && - sbi->fdev->f_mapping->a_ops->read_folio) + if (S_ISREG(file_inode(sbi->dif0.file)->i_mode) && + sbi->dif0.file->f_mapping->a_ops->read_folio) return get_tree_nodev(fc, erofs_fc_fill_super); } #endif @@ -771,8 +769,8 @@ static void erofs_sb_free(struct erofs_sb_info *sbi) erofs_free_dev_context(sbi->devs); kfree(sbi->fsid); kfree(sbi->domain_id); - if (sbi->fdev) - fput(sbi->fdev); + if (sbi->dif0.file) + fput(sbi->dif0.file); kfree(sbi); } @@ -817,11 +815,12 @@ static void erofs_kill_sb(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); - if ((IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid) || sbi->fdev) + if ((IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid) || + sbi->dif0.file) kill_anon_super(sb); else kill_block_super(sb); - fs_put_dax(sbi->dax_dev, NULL); + fs_put_dax(sbi->dif0.dax_dev, NULL); erofs_fscache_unregister_fs(sb); erofs_sb_free(sbi); sb->s_fs_info = NULL; From f8d920a402aec3482931cb5f1539ed438740fc49 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Fri, 13 Dec 2024 07:54:01 +0800 Subject: [PATCH 281/330] erofs: reference `struct erofs_device_info` for erofs_map_dev Record `m_sb` and `m_dif` to replace `m_fscache`, `m_daxdev`, `m_fp` and `m_dax_part_off` in order to simplify the codebase. Note that `m_bdev` is still left since it can be assigned from `sb->s_bdev` directly. Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20241212235401.2857246-1-hsiangkao@linux.alibaba.com --- fs/erofs/data.c | 26 ++++++++++---------------- fs/erofs/fileio.c | 2 +- fs/erofs/fscache.c | 4 ++-- fs/erofs/internal.h | 6 ++---- 4 files changed, 15 insertions(+), 23 deletions(-) diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 622017c659587..0cd6b5c4df985 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -179,19 +179,13 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map) } static void erofs_fill_from_devinfo(struct erofs_map_dev *map, - struct erofs_device_info *dif) + struct super_block *sb, struct erofs_device_info *dif) { + map->m_sb = sb; + map->m_dif = dif; map->m_bdev = NULL; - map->m_fp = NULL; - if (dif->file) { - if (S_ISBLK(file_inode(dif->file)->i_mode)) - map->m_bdev = file_bdev(dif->file); - else - map->m_fp = dif->file; - } - map->m_daxdev = dif->dax_dev; - map->m_dax_part_off = dif->dax_part_off; - map->m_fscache = dif->fscache; + if (dif->file && S_ISBLK(file_inode(dif->file)->i_mode)) + map->m_bdev = file_bdev(dif->file); } int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) @@ -201,7 +195,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) erofs_off_t startoff, length; int id; - erofs_fill_from_devinfo(map, &EROFS_SB(sb)->dif0); + erofs_fill_from_devinfo(map, sb, &EROFS_SB(sb)->dif0); map->m_bdev = sb->s_bdev; /* use s_bdev for the primary device */ if (map->m_deviceid) { down_read(&devs->rwsem); @@ -215,7 +209,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) up_read(&devs->rwsem); return 0; } - erofs_fill_from_devinfo(map, dif); + erofs_fill_from_devinfo(map, sb, dif); up_read(&devs->rwsem); } else if (devs->extra_devices && !devs->flatdev) { down_read(&devs->rwsem); @@ -228,7 +222,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) if (map->m_pa >= startoff && map->m_pa < startoff + length) { map->m_pa -= startoff; - erofs_fill_from_devinfo(map, dif); + erofs_fill_from_devinfo(map, sb, dif); break; } } @@ -298,7 +292,7 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, iomap->offset = map.m_la; if (flags & IOMAP_DAX) - iomap->dax_dev = mdev.m_daxdev; + iomap->dax_dev = mdev.m_dif->dax_dev; else iomap->bdev = mdev.m_bdev; iomap->length = map.m_llen; @@ -327,7 +321,7 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, iomap->type = IOMAP_MAPPED; iomap->addr = mdev.m_pa; if (flags & IOMAP_DAX) - iomap->addr += mdev.m_dax_part_off; + iomap->addr += mdev.m_dif->dax_part_off; } return 0; } diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c index 3af96b1e2c2aa..a61b8faec6518 100644 --- a/fs/erofs/fileio.c +++ b/fs/erofs/fileio.c @@ -67,7 +67,7 @@ static struct erofs_fileio_rq *erofs_fileio_rq_alloc(struct erofs_map_dev *mdev) GFP_KERNEL | __GFP_NOFAIL); bio_init(&rq->bio, NULL, rq->bvecs, BIO_MAX_VECS, REQ_OP_READ); - rq->iocb.ki_filp = mdev->m_fp; + rq->iocb.ki_filp = mdev->m_dif->file; return rq; } diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index ce7e38c827198..ce3d8737df85d 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -198,7 +198,7 @@ struct bio *erofs_fscache_bio_alloc(struct erofs_map_dev *mdev) io = kmalloc(sizeof(*io), GFP_KERNEL | __GFP_NOFAIL); bio_init(&io->bio, NULL, io->bvecs, BIO_MAX_VECS, REQ_OP_READ); - io->io.private = mdev->m_fscache->cookie; + io->io.private = mdev->m_dif->fscache->cookie; io->io.end_io = erofs_fscache_bio_endio; refcount_set(&io->io.ref, 1); return &io->bio; @@ -316,7 +316,7 @@ static int erofs_fscache_data_read_slice(struct erofs_fscache_rq *req) if (!io) return -ENOMEM; iov_iter_xarray(&io->iter, ITER_DEST, &mapping->i_pages, pos, count); - ret = erofs_fscache_read_io_async(mdev.m_fscache->cookie, + ret = erofs_fscache_read_io_async(mdev.m_dif->fscache->cookie, mdev.m_pa + (pos - map.m_la), io); erofs_fscache_req_io_put(io); diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 3e8d71d516f42..7cc8e1be04e8f 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -353,11 +353,9 @@ enum { }; struct erofs_map_dev { - struct erofs_fscache *m_fscache; + struct super_block *m_sb; + struct erofs_device_info *m_dif; struct block_device *m_bdev; - struct dax_device *m_daxdev; - struct file *m_fp; - u64 m_dax_part_off; erofs_off_t m_pa; unsigned int m_deviceid; From 6422cde1b0d5a31b206b263417c1c2b3c80fe82c Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 12 Dec 2024 21:43:36 +0800 Subject: [PATCH 282/330] erofs: use buffered I/O for file-backed mounts by default For many use cases (e.g. container images are just fetched from remote), performance will be impacted if underlay page cache is up-to-date but direct i/o flushes dirty pages first. Instead, let's use buffered I/O by default to keep in sync with loop devices and add a (re)mount option to explicitly give a try to use direct I/O if supported by the underlying files. The container startup time is improved as below: [workload] docker.io/library/workpress:latest unpack 1st run non-1st runs EROFS snapshotter buffered I/O file 4.586404265s 0.308s 0.198s EROFS snapshotter direct I/O file 4.581742849s 2.238s 0.222s EROFS snapshotter loop 4.596023152s 0.346s 0.201s Overlayfs snapshotter 5.382851037s 0.206s 0.214s Fixes: fb176750266a ("erofs: add file-backed mount support") Cc: Derek McGowan Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20241212134336.2059899-1-hsiangkao@linux.alibaba.com --- fs/erofs/fileio.c | 7 +++++-- fs/erofs/internal.h | 1 + fs/erofs/super.c | 23 +++++++++++++++-------- 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c index a61b8faec6518..33f8539dda4ae 100644 --- a/fs/erofs/fileio.c +++ b/fs/erofs/fileio.c @@ -9,6 +9,7 @@ struct erofs_fileio_rq { struct bio_vec bvecs[BIO_MAX_VECS]; struct bio bio; struct kiocb iocb; + struct super_block *sb; }; struct erofs_fileio { @@ -52,8 +53,9 @@ static void erofs_fileio_rq_submit(struct erofs_fileio_rq *rq) rq->iocb.ki_pos = rq->bio.bi_iter.bi_sector << SECTOR_SHIFT; rq->iocb.ki_ioprio = get_current_ioprio(); rq->iocb.ki_complete = erofs_fileio_ki_complete; - rq->iocb.ki_flags = (rq->iocb.ki_filp->f_mode & FMODE_CAN_ODIRECT) ? - IOCB_DIRECT : 0; + if (test_opt(&EROFS_SB(rq->sb)->opt, DIRECT_IO) && + rq->iocb.ki_filp->f_mode & FMODE_CAN_ODIRECT) + rq->iocb.ki_flags = IOCB_DIRECT; iov_iter_bvec(&iter, ITER_DEST, rq->bvecs, rq->bio.bi_vcnt, rq->bio.bi_iter.bi_size); ret = vfs_iocb_iter_read(rq->iocb.ki_filp, &rq->iocb, &iter); @@ -68,6 +70,7 @@ static struct erofs_fileio_rq *erofs_fileio_rq_alloc(struct erofs_map_dev *mdev) bio_init(&rq->bio, NULL, rq->bvecs, BIO_MAX_VECS, REQ_OP_READ); rq->iocb.ki_filp = mdev->m_dif->file; + rq->sb = mdev->m_sb; return rq; } diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 7cc8e1be04e8f..686d835eb533a 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -176,6 +176,7 @@ struct erofs_sb_info { #define EROFS_MOUNT_POSIX_ACL 0x00000020 #define EROFS_MOUNT_DAX_ALWAYS 0x00000040 #define EROFS_MOUNT_DAX_NEVER 0x00000080 +#define EROFS_MOUNT_DIRECT_IO 0x00000100 #define clear_opt(opt, option) ((opt)->mount_opt &= ~EROFS_MOUNT_##option) #define set_opt(opt, option) ((opt)->mount_opt |= EROFS_MOUNT_##option) diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 9044907354e12..f5956474bfdea 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -364,14 +364,8 @@ static void erofs_default_options(struct erofs_sb_info *sbi) } enum { - Opt_user_xattr, - Opt_acl, - Opt_cache_strategy, - Opt_dax, - Opt_dax_enum, - Opt_device, - Opt_fsid, - Opt_domain_id, + Opt_user_xattr, Opt_acl, Opt_cache_strategy, Opt_dax, Opt_dax_enum, + Opt_device, Opt_fsid, Opt_domain_id, Opt_directio, Opt_err }; @@ -398,6 +392,7 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = { fsparam_string("device", Opt_device), fsparam_string("fsid", Opt_fsid), fsparam_string("domain_id", Opt_domain_id), + fsparam_flag_no("directio", Opt_directio), {} }; @@ -511,6 +506,16 @@ static int erofs_fc_parse_param(struct fs_context *fc, errorfc(fc, "%s option not supported", erofs_fs_parameters[opt].name); break; #endif + case Opt_directio: +#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE + if (result.boolean) + set_opt(&sbi->opt, DIRECT_IO); + else + clear_opt(&sbi->opt, DIRECT_IO); +#else + errorfc(fc, "%s option not supported", erofs_fs_parameters[opt].name); +#endif + break; default: return -ENOPARAM; } @@ -948,6 +953,8 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",dax=always"); if (test_opt(opt, DAX_NEVER)) seq_puts(seq, ",dax=never"); + if (erofs_is_fileio_mode(sbi) && test_opt(opt, DIRECT_IO)) + seq_puts(seq, ",directio"); #ifdef CONFIG_EROFS_FS_ONDEMAND if (sbi->fsid) seq_printf(seq, ",fsid=%s", sbi->fsid); From cc252bb592638e0f7aea40d580186c36d89526b8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 11 Dec 2024 13:53:35 -0500 Subject: [PATCH 283/330] fgraph: Still initialize idle shadow stacks when starting A bug was discovered where the idle shadow stacks were not initialized for offline CPUs when starting function graph tracer, and when they came online they were not traced due to the missing shadow stack. To fix this, the idle task shadow stack initialization was moved to using the CPU hotplug callbacks. But it removed the initialization when the function graph was enabled. The problem here is that the hotplug callbacks are called when the CPUs come online, but the idle shadow stack initialization only happens if function graph is currently active. This caused the online CPUs to not get their shadow stack initialized. The idle shadow stack initialization still needs to be done when the function graph is registered, as they will not be allocated if function graph is not registered. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20241211135335.094ba282@batman.local.home Fixes: 2c02f7375e65 ("fgraph: Use CPU hotplug mechanism to initialize idle shadow stacks") Reported-by: Linus Walleij Tested-by: Linus Walleij Closes: https://lore.kernel.org/all/CACRpkdaTBrHwRbbrphVy-=SeDz6MSsXhTKypOtLrTQ+DgGAOcQ@mail.gmail.com/ Signed-off-by: Steven Rostedt (Google) --- kernel/trace/fgraph.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 0bf78517b5d4c..ddedcb50917f4 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -1215,7 +1215,7 @@ void fgraph_update_pid_func(void) static int start_graph_tracing(void) { unsigned long **ret_stack_list; - int ret; + int ret, cpu; ret_stack_list = kcalloc(FTRACE_RETSTACK_ALLOC_SIZE, sizeof(*ret_stack_list), GFP_KERNEL); @@ -1223,6 +1223,12 @@ static int start_graph_tracing(void) if (!ret_stack_list) return -ENOMEM; + /* The cpu_boot init_task->ret_stack will never be freed */ + for_each_online_cpu(cpu) { + if (!idle_task(cpu)->ret_stack) + ftrace_graph_init_idle_task(idle_task(cpu), cpu); + } + do { ret = alloc_retstack_tasklist(ret_stack_list); } while (ret == -EAGAIN); From 166438a432d76c68d3f0da60667248f3c2303d6c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 16 Dec 2024 16:46:33 -0500 Subject: [PATCH 284/330] ftrace: Do not find "true_parent" if HAVE_DYNAMIC_FTRACE_WITH_ARGS is not set When function tracing and function graph tracing are both enabled (in different instances) the "parent" of some of the function tracing events is "return_to_handler" which is the trampoline used by function graph tracing. To fix this, ftrace_get_true_parent_ip() was introduced that returns the "true" parent ip instead of the trampoline. To do this, the ftrace_regs_get_stack_pointer() is used, which uses kernel_stack_pointer(). The problem is that microblaze does not implement kerenl_stack_pointer() so when function graph tracing is enabled, the build fails. But microblaze also does not enabled HAVE_DYNAMIC_FTRACE_WITH_ARGS. That option has to be enabled by the architecture to reliably get the values from the fregs parameter passed in. When that config is not set, the architecture can also pass in NULL, which is not tested for in that function and could cause the kernel to crash. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Mark Rutland Cc: Al Viro Cc: Michal Simek Cc: Jeff Xie Link: https://lore.kernel.org/20241216164633.6df18e87@gandalf.local.home Fixes: 60b1f578b578 ("ftrace: Get the true parent ip for function tracer") Reported-by: Al Viro Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_functions.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 74c353164ca12..d358c9935164d 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -176,7 +176,8 @@ static void function_trace_start(struct trace_array *tr) tracing_reset_online_cpus(&tr->array_buffer); } -#ifdef CONFIG_FUNCTION_GRAPH_TRACER +/* fregs are guaranteed not to be NULL if HAVE_DYNAMIC_FTRACE_WITH_ARGS is set */ +#if defined(CONFIG_FUNCTION_GRAPH_TRACER) && defined(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS) static __always_inline unsigned long function_get_true_parent_ip(unsigned long parent_ip, struct ftrace_regs *fregs) { From 239d87327dcd361b0098038995f8908f3296864f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 12 Dec 2024 17:28:06 -0800 Subject: [PATCH 285/330] fortify: Hide run-time copy size from value range tracking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GCC performs value range tracking for variables as a way to provide better diagnostics. One place this is regularly seen is with warnings associated with bounds-checking, e.g. -Wstringop-overflow, -Wstringop-overread, -Warray-bounds, etc. In order to keep the signal-to-noise ratio high, warnings aren't emitted when a value range spans the entire value range representable by a given variable. For example: unsigned int len; char dst[8]; ... memcpy(dst, src, len); If len's value is unknown, it has the full "unsigned int" range of [0, UINT_MAX], and GCC's compile-time bounds checks against memcpy() will be ignored. However, when a code path has been able to narrow the range: if (len > 16) return; memcpy(dst, src, len); Then the range will be updated for the execution path. Above, len is now [0, 16] when reading memcpy(), so depending on other optimizations, we might see a -Wstringop-overflow warning like: error: '__builtin_memcpy' writing between 9 and 16 bytes into region of size 8 [-Werror=stringop-overflow] When building with CONFIG_FORTIFY_SOURCE, the fortified run-time bounds checking can appear to narrow value ranges of lengths for memcpy(), depending on how the compiler constructs the execution paths during optimization passes, due to the checks against the field sizes. For example: if (p_size_field != SIZE_MAX && p_size != p_size_field && p_size_field < size) As intentionally designed, these checks only affect the kernel warnings emitted at run-time and do not block the potentially overflowing memcpy(), so GCC thinks it needs to produce a warning about the resulting value range that might be reaching the memcpy(). We have seen this manifest a few times now, with the most recent being with cpumasks: In function ‘bitmap_copy’, inlined from ‘cpumask_copy’ at ./include/linux/cpumask.h:839:2, inlined from ‘__padata_set_cpumasks’ at kernel/padata.c:730:2: ./include/linux/fortify-string.h:114:33: error: ‘__builtin_memcpy’ reading between 257 and 536870904 bytes from a region of size 256 [-Werror=stringop-overread] 114 | #define __underlying_memcpy __builtin_memcpy | ^ ./include/linux/fortify-string.h:633:9: note: in expansion of macro ‘__underlying_memcpy’ 633 | __underlying_##op(p, q, __fortify_size); \ | ^~~~~~~~~~~~~ ./include/linux/fortify-string.h:678:26: note: in expansion of macro ‘__fortify_memcpy_chk’ 678 | #define memcpy(p, q, s) __fortify_memcpy_chk(p, q, s, \ | ^~~~~~~~~~~~~~~~~~~~ ./include/linux/bitmap.h:259:17: note: in expansion of macro ‘memcpy’ 259 | memcpy(dst, src, len); | ^~~~~~ kernel/padata.c: In function ‘__padata_set_cpumasks’: kernel/padata.c:713:48: note: source object ‘pcpumask’ of size [0, 256] 713 | cpumask_var_t pcpumask, | ~~~~~~~~~~~~~~^~~~~~~~ This warning is _not_ emitted when CONFIG_FORTIFY_SOURCE is disabled, and with the recent -fdiagnostics-details we can confirm the origin of the warning is due to FORTIFY's bounds checking: ../include/linux/bitmap.h:259:17: note: in expansion of macro 'memcpy' 259 | memcpy(dst, src, len); | ^~~~~~ '__padata_set_cpumasks': events 1-2 ../include/linux/fortify-string.h:613:36: 612 | if (p_size_field != SIZE_MAX && | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 613 | p_size != p_size_field && p_size_field < size) | ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~ | | | (1) when the condition is evaluated to false | (2) when the condition is evaluated to true '__padata_set_cpumasks': event 3 114 | #define __underlying_memcpy __builtin_memcpy | ^ | | | (3) out of array bounds here Note that the cpumask warning started appearing since bitmap functions were recently marked __always_inline in commit ed8cd2b3bd9f ("bitmap: Switch from inline to __always_inline"), which allowed GCC to gain visibility into the variables as they passed through the FORTIFY implementation. In order to silence these false positives but keep otherwise deterministic compile-time warnings intact, hide the length variable from GCC with OPTIMIZE_HIDE_VAR() before calling the builtin memcpy. Additionally add a comment about why all the macro args have copies with const storage. Reported-by: "Thomas Weißschuh" Closes: https://lore.kernel.org/all/db7190c8-d17f-4a0d-bc2f-5903c79f36c2@t-8ch.de/ Reported-by: Nilay Shroff Closes: https://lore.kernel.org/all/20241112124127.1666300-1-nilay@linux.ibm.com/ Tested-by: Nilay Shroff Acked-by: Yury Norov Acked-by: Greg Kroah-Hartman Signed-off-by: Kees Cook --- include/linux/fortify-string.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index 0d99bf11d260a..e4ce1cae03bf7 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -616,6 +616,12 @@ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size, return false; } +/* + * To work around what seems to be an optimizer bug, the macro arguments + * need to have const copies or the values end up changed by the time they + * reach fortify_warn_once(). See commit 6f7630b1b5bc ("fortify: Capture + * __bos() results in const temp vars") for more details. + */ #define __fortify_memcpy_chk(p, q, size, p_size, q_size, \ p_size_field, q_size_field, op) ({ \ const size_t __fortify_size = (size_t)(size); \ @@ -623,6 +629,8 @@ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size, const size_t __q_size = (q_size); \ const size_t __p_size_field = (p_size_field); \ const size_t __q_size_field = (q_size_field); \ + /* Keep a mutable version of the size for the final copy. */ \ + size_t __copy_size = __fortify_size; \ fortify_warn_once(fortify_memcpy_chk(__fortify_size, __p_size, \ __q_size, __p_size_field, \ __q_size_field, FORTIFY_FUNC_ ##op), \ @@ -630,7 +638,11 @@ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size, __fortify_size, \ "field \"" #p "\" at " FILE_LINE, \ __p_size_field); \ - __underlying_##op(p, q, __fortify_size); \ + /* Hide only the run-time size from value range tracking to */ \ + /* silence compile-time false positive bounds warnings. */ \ + if (!__builtin_constant_p(__copy_size)) \ + OPTIMIZER_HIDE_VAR(__copy_size); \ + __underlying_##op(p, q, __copy_size); \ }) /* From b1f3a2f5a742c1e939a73031bd31b9e557a2d77d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 13 Dec 2024 07:22:40 -0800 Subject: [PATCH 286/330] netdev: fix repeated netlink messages in queue dump The context is supposed to record the next queue to dump, not last dumped. If the dump doesn't fit we will restart from the already-dumped queue, duplicating the message. Before this fix and with the selftest improvements later in this series we see: # ./run_kselftest.sh -t drivers/net:queues.py timeout set to 45 selftests: drivers/net: queues.py KTAP version 1 1..2 # Check| At /root/ksft-net-drv/drivers/net/./queues.py, line 32, in get_queues: # Check| ksft_eq(queues, expected) # Check failed 102 != 100 # Check| At /root/ksft-net-drv/drivers/net/./queues.py, line 32, in get_queues: # Check| ksft_eq(queues, expected) # Check failed 101 != 100 not ok 1 queues.get_queues ok 2 queues.addremove_queues # Totals: pass:1 fail:1 xfail:0 xpass:0 skip:0 error:0 not ok 1 selftests: drivers/net: queues.py # exit=1 With the fix: # ./ksft-net-drv/run_kselftest.sh -t drivers/net:queues.py timeout set to 45 selftests: drivers/net: queues.py KTAP version 1 1..2 ok 1 queues.get_queues ok 2 queues.addremove_queues # Totals: pass:2 fail:0 xfail:0 xpass:0 skip:0 error:0 Fixes: 6b6171db7fc8 ("netdev-genl: Add netlink framework functions for queue") Reviewed-by: Joe Damato Link: https://patch.msgid.link/20241213152244.3080955-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/netdev-genl.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 9527dd46e4dc3..9f086b1906194 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -488,24 +488,21 @@ netdev_nl_queue_dump_one(struct net_device *netdev, struct sk_buff *rsp, struct netdev_nl_dump_ctx *ctx) { int err = 0; - int i; if (!(netdev->flags & IFF_UP)) return err; - for (i = ctx->rxq_idx; i < netdev->real_num_rx_queues;) { - err = netdev_nl_queue_fill_one(rsp, netdev, i, + for (; ctx->rxq_idx < netdev->real_num_rx_queues; ctx->rxq_idx++) { + err = netdev_nl_queue_fill_one(rsp, netdev, ctx->rxq_idx, NETDEV_QUEUE_TYPE_RX, info); if (err) return err; - ctx->rxq_idx = i++; } - for (i = ctx->txq_idx; i < netdev->real_num_tx_queues;) { - err = netdev_nl_queue_fill_one(rsp, netdev, i, + for (; ctx->txq_idx < netdev->real_num_tx_queues; ctx->txq_idx++) { + err = netdev_nl_queue_fill_one(rsp, netdev, ctx->txq_idx, NETDEV_QUEUE_TYPE_TX, info); if (err) return err; - ctx->txq_idx = i++; } return err; From ecc391a541573da46b7ccc188105efedd40aef1b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 13 Dec 2024 07:22:41 -0800 Subject: [PATCH 287/330] netdev: fix repeated netlink messages in queue stats The context is supposed to record the next queue to dump, not last dumped. If the dump doesn't fit we will restart from the already-dumped queue, duplicating the message. Before this fix and with the selftest improvements later in this series we see: # ./run_kselftest.sh -t drivers/net:stats.py timeout set to 45 selftests: drivers/net: stats.py KTAP version 1 1..5 ok 1 stats.check_pause ok 2 stats.check_fec ok 3 stats.pkt_byte_sum # Check| At /root/ksft-net-drv/drivers/net/./stats.py, line 125, in qstat_by_ifindex: # Check| ksft_eq(len(queues[qtype]), len(set(queues[qtype])), # Check failed 45 != 44 repeated queue keys # Check| At /root/ksft-net-drv/drivers/net/./stats.py, line 127, in qstat_by_ifindex: # Check| ksft_eq(len(queues[qtype]), max(queues[qtype]) + 1, # Check failed 45 != 44 missing queue keys # Check| At /root/ksft-net-drv/drivers/net/./stats.py, line 125, in qstat_by_ifindex: # Check| ksft_eq(len(queues[qtype]), len(set(queues[qtype])), # Check failed 45 != 44 repeated queue keys # Check| At /root/ksft-net-drv/drivers/net/./stats.py, line 127, in qstat_by_ifindex: # Check| ksft_eq(len(queues[qtype]), max(queues[qtype]) + 1, # Check failed 45 != 44 missing queue keys # Check| At /root/ksft-net-drv/drivers/net/./stats.py, line 125, in qstat_by_ifindex: # Check| ksft_eq(len(queues[qtype]), len(set(queues[qtype])), # Check failed 103 != 100 repeated queue keys # Check| At /root/ksft-net-drv/drivers/net/./stats.py, line 127, in qstat_by_ifindex: # Check| ksft_eq(len(queues[qtype]), max(queues[qtype]) + 1, # Check failed 103 != 100 missing queue keys # Check| At /root/ksft-net-drv/drivers/net/./stats.py, line 125, in qstat_by_ifindex: # Check| ksft_eq(len(queues[qtype]), len(set(queues[qtype])), # Check failed 102 != 100 repeated queue keys # Check| At /root/ksft-net-drv/drivers/net/./stats.py, line 127, in qstat_by_ifindex: # Check| ksft_eq(len(queues[qtype]), max(queues[qtype]) + 1, # Check failed 102 != 100 missing queue keys not ok 4 stats.qstat_by_ifindex ok 5 stats.check_down # Totals: pass:4 fail:1 xfail:0 xpass:0 skip:0 error:0 With the fix: # ./ksft-net-drv/run_kselftest.sh -t drivers/net:stats.py timeout set to 45 selftests: drivers/net: stats.py KTAP version 1 1..5 ok 1 stats.check_pause ok 2 stats.check_fec ok 3 stats.pkt_byte_sum ok 4 stats.qstat_by_ifindex ok 5 stats.check_down # Totals: pass:5 fail:0 xfail:0 xpass:0 skip:0 error:0 Fixes: ab63a2387cb9 ("netdev: add per-queue statistics") Reviewed-by: Joe Damato Link: https://patch.msgid.link/20241213152244.3080955-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/netdev-genl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 9f086b1906194..1be8c7c21d19d 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -668,7 +668,7 @@ netdev_nl_stats_by_queue(struct net_device *netdev, struct sk_buff *rsp, i, info); if (err) return err; - ctx->rxq_idx = i++; + ctx->rxq_idx = ++i; } i = ctx->txq_idx; while (ops->get_queue_stats_tx && i < netdev->real_num_tx_queues) { @@ -676,7 +676,7 @@ netdev_nl_stats_by_queue(struct net_device *netdev, struct sk_buff *rsp, i, info); if (err) return err; - ctx->txq_idx = i++; + ctx->txq_idx = ++i; } ctx->rxq_idx = 0; From 0518863407b8dcc7070fdbc1c015046d66777e78 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 13 Dec 2024 07:22:42 -0800 Subject: [PATCH 288/330] selftests: net: support setting recv_size in YNL recv_size parameter allows constraining the buffer size for dumps. It's useful in testing kernel handling of dump continuation, IOW testing dumps which span multiple skbs. Let the tests set this parameter when initializing the YNL family. Keep the normal default, we don't want tests to unintentionally behave very differently than normal code. Reviewed-by: Joe Damato Reviewed-by: Petr Machata Link: https://patch.msgid.link/20241213152244.3080955-4-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/lib/py/ynl.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/net/lib/py/ynl.py b/tools/testing/selftests/net/lib/py/ynl.py index a0d689d58c573..076a7e8dc3ebe 100644 --- a/tools/testing/selftests/net/lib/py/ynl.py +++ b/tools/testing/selftests/net/lib/py/ynl.py @@ -32,23 +32,23 @@ # Set schema='' to avoid jsonschema validation, it's slow # class EthtoolFamily(YnlFamily): - def __init__(self): + def __init__(self, recv_size=0): super().__init__((SPEC_PATH / Path('ethtool.yaml')).as_posix(), - schema='') + schema='', recv_size=recv_size) class RtnlFamily(YnlFamily): - def __init__(self): + def __init__(self, recv_size=0): super().__init__((SPEC_PATH / Path('rt_link.yaml')).as_posix(), - schema='') + schema='', recv_size=recv_size) class NetdevFamily(YnlFamily): - def __init__(self): + def __init__(self, recv_size=0): super().__init__((SPEC_PATH / Path('netdev.yaml')).as_posix(), - schema='') + schema='', recv_size=recv_size) class NetshaperFamily(YnlFamily): - def __init__(self): + def __init__(self, recv_size=0): super().__init__((SPEC_PATH / Path('net_shaper.yaml')).as_posix(), - schema='') + schema='', recv_size=recv_size) From 1234810b1649e9d781aeafd4b23fb1fcfbf95d8f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 13 Dec 2024 07:22:43 -0800 Subject: [PATCH 289/330] selftests: net-drv: queues: sanity check netlink dumps This test already catches a netlink bug fixed by this series, but only when running on HW with many queues. Make sure the netdevsim instance created has a lot of queues, and constrain the size of the recv_buffer used by netlink. While at it test both rx and tx queues. Reviewed-by: Joe Damato Reviewed-by: Petr Machata Link: https://patch.msgid.link/20241213152244.3080955-5-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/queues.py | 23 +++++++++++-------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/drivers/net/queues.py b/tools/testing/selftests/drivers/net/queues.py index 30f29096e27c2..9c5473abbd784 100755 --- a/tools/testing/selftests/drivers/net/queues.py +++ b/tools/testing/selftests/drivers/net/queues.py @@ -8,25 +8,28 @@ import glob -def sys_get_queues(ifname) -> int: - folders = glob.glob(f'/sys/class/net/{ifname}/queues/rx-*') +def sys_get_queues(ifname, qtype='rx') -> int: + folders = glob.glob(f'/sys/class/net/{ifname}/queues/{qtype}-*') return len(folders) -def nl_get_queues(cfg, nl): +def nl_get_queues(cfg, nl, qtype='rx'): queues = nl.queue_get({'ifindex': cfg.ifindex}, dump=True) if queues: - return len([q for q in queues if q['type'] == 'rx']) + return len([q for q in queues if q['type'] == qtype]) return None def get_queues(cfg, nl) -> None: - queues = nl_get_queues(cfg, nl) - if not queues: - raise KsftSkipEx('queue-get not supported by device') + snl = NetdevFamily(recv_size=4096) - expected = sys_get_queues(cfg.dev['ifname']) - ksft_eq(queues, expected) + for qtype in ['rx', 'tx']: + queues = nl_get_queues(cfg, snl, qtype) + if not queues: + raise KsftSkipEx('queue-get not supported by device') + + expected = sys_get_queues(cfg.dev['ifname'], qtype) + ksft_eq(queues, expected) def addremove_queues(cfg, nl) -> None: @@ -57,7 +60,7 @@ def addremove_queues(cfg, nl) -> None: def main() -> None: - with NetDrvEnv(__file__, queue_count=3) as cfg: + with NetDrvEnv(__file__, queue_count=100) as cfg: ksft_run([get_queues, addremove_queues], args=(cfg, NetdevFamily())) ksft_exit() From 5712e323d4c3ad03bba4d28f83e80593171ac3f1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 13 Dec 2024 07:22:44 -0800 Subject: [PATCH 290/330] selftests: net-drv: stats: sanity check netlink dumps Sanity check netlink dumps, to make sure dumps don't have repeated entries or gaps in IDs. Reviewed-by: Petr Machata Link: https://patch.msgid.link/20241213152244.3080955-6-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/stats.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/drivers/net/stats.py b/tools/testing/selftests/drivers/net/stats.py index 63e3c045a3b25..031ac9def6c01 100755 --- a/tools/testing/selftests/drivers/net/stats.py +++ b/tools/testing/selftests/drivers/net/stats.py @@ -110,6 +110,23 @@ def qstat_by_ifindex(cfg) -> None: ksft_ge(triple[1][key], triple[0][key], comment="bad key: " + key) ksft_ge(triple[2][key], triple[1][key], comment="bad key: " + key) + # Sanity check the dumps + queues = NetdevFamily(recv_size=4096).qstats_get({"scope": "queue"}, dump=True) + # Reformat the output into {ifindex: {rx: [id, id, ...], tx: [id, id, ...]}} + parsed = {} + for entry in queues: + ifindex = entry["ifindex"] + if ifindex not in parsed: + parsed[ifindex] = {"rx":[], "tx": []} + parsed[ifindex][entry["queue-type"]].append(entry['queue-id']) + # Now, validate + for ifindex, queues in parsed.items(): + for qtype in ['rx', 'tx']: + ksft_eq(len(queues[qtype]), len(set(queues[qtype])), + comment="repeated queue keys") + ksft_eq(len(queues[qtype]), max(queues[qtype]) + 1, + comment="missing queue keys") + # Test invalid dumps # 0 is invalid with ksft_raises(NlError) as cm: @@ -158,7 +175,7 @@ def check_down(cfg) -> None: def main() -> None: - with NetDrvEnv(__file__) as cfg: + with NetDrvEnv(__file__, queue_count=100) as cfg: ksft_run([check_pause, check_fec, pkt_byte_sum, qstat_by_ifindex, check_down], args=(cfg, )) From fbbd84af6ba70334335bdeba3ae536cf751c14c6 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 13 Dec 2024 12:47:27 +0300 Subject: [PATCH 291/330] chelsio/chtls: prevent potential integer overflow on 32bit The "gl->tot_len" variable is controlled by the user. It comes from process_responses(). On 32bit systems, the "gl->tot_len + sizeof(struct cpl_pass_accept_req) + sizeof(struct rss_header)" addition could have an integer wrapping bug. Use size_add() to prevent this. Fixes: a08943947873 ("crypto: chtls - Register chtls with net tls") Cc: stable@vger.kernel.org Signed-off-by: Dan Carpenter Reviewed-by: Simon Horman Link: https://patch.msgid.link/c6bfb23c-2db2-4e1b-b8ab-ba3925c82ef5@stanley.mountain Signed-off-by: Jakub Kicinski --- .../net/ethernet/chelsio/inline_crypto/chtls/chtls_main.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_main.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_main.c index 96fd31d75dfd9..daa1ebaef5112 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_main.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_main.c @@ -346,8 +346,9 @@ static struct sk_buff *copy_gl_to_skb_pkt(const struct pkt_gl *gl, * driver. Once driver synthesizes cpl_pass_accept_req the skb will go * through the regular cpl_pass_accept_req processing in TOM. */ - skb = alloc_skb(gl->tot_len + sizeof(struct cpl_pass_accept_req) - - pktshift, GFP_ATOMIC); + skb = alloc_skb(size_add(gl->tot_len, + sizeof(struct cpl_pass_accept_req)) - + pktshift, GFP_ATOMIC); if (unlikely(!skb)) return NULL; __skb_put(skb, gl->tot_len + sizeof(struct cpl_pass_accept_req) From e78c20f327bd94dabac68b98218dff069a8780f0 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 13 Dec 2024 13:36:57 +0100 Subject: [PATCH 292/330] team: Fix feature exposure when no ports are present Small follow-up to align this to an equivalent behavior as the bond driver. The change in 3625920b62c3 ("teaming: fix vlan_features computing") removed the netdevice vlan_features when there is no team port attached, yet it leaves the full set of enc_features intact. Instead, leave the default features as pre 3625920b62c3, and recompute once we do have ports attached. Also, similarly as in bonding case, call the netdev_base_features() helper on the enc_features. Fixes: 3625920b62c3 ("teaming: fix vlan_features computing") Signed-off-by: Daniel Borkmann Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20241213123657.401868-1-daniel@iogearbox.net Signed-off-by: Jakub Kicinski --- drivers/net/team/team_core.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/team/team_core.c b/drivers/net/team/team_core.c index 69ea2c3c76bfc..c7690adec8db7 100644 --- a/drivers/net/team/team_core.c +++ b/drivers/net/team/team_core.c @@ -998,9 +998,13 @@ static void __team_compute_features(struct team *team) unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; + rcu_read_lock(); + if (list_empty(&team->port_list)) + goto done; + vlan_features = netdev_base_features(vlan_features); + enc_features = netdev_base_features(enc_features); - rcu_read_lock(); list_for_each_entry_rcu(port, &team->port_list, list) { vlan_features = netdev_increment_features(vlan_features, port->dev->vlan_features, @@ -1010,11 +1014,11 @@ static void __team_compute_features(struct team *team) port->dev->hw_enc_features, TEAM_ENC_FEATURES); - dst_release_flag &= port->dev->priv_flags; if (port->dev->hard_header_len > max_hard_header_len) max_hard_header_len = port->dev->hard_header_len; } +done: rcu_read_unlock(); team->dev->vlan_features = vlan_features; From 7203d10e93b6e6e1d19481ef7907de6a9133a467 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 13 Dec 2024 17:28:11 +0300 Subject: [PATCH 293/330] net: hinic: Fix cleanup in create_rxqs/txqs() There is a check for NULL at the start of create_txqs() and create_rxqs() which tess if "nic_dev->txqs" is non-NULL. The intention is that if the device is already open and the queues are already created then we don't create them a second time. However, the bug is that if we have an error in the create_txqs() then the pointer doesn't get set back to NULL. The NULL check at the start of the function will say that it's already open when it's not and the device can't be used. Set ->txqs back to NULL on cleanup on error. Fixes: c3e79baf1b03 ("net-next/hinic: Add logical Txq and Rxq") Signed-off-by: Dan Carpenter Reviewed-by: Simon Horman Link: https://patch.msgid.link/0cc98faf-a0ed-4565-a55b-0fa2734bc205@stanley.mountain Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/huawei/hinic/hinic_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/huawei/hinic/hinic_main.c b/drivers/net/ethernet/huawei/hinic/hinic_main.c index 890f213da8d18..ae1f523d6841b 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_main.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_main.c @@ -172,6 +172,7 @@ static int create_txqs(struct hinic_dev *nic_dev) hinic_sq_dbgfs_uninit(nic_dev); devm_kfree(&netdev->dev, nic_dev->txqs); + nic_dev->txqs = NULL; return err; } @@ -268,6 +269,7 @@ static int create_rxqs(struct hinic_dev *nic_dev) hinic_rq_dbgfs_uninit(nic_dev); devm_kfree(&netdev->dev, nic_dev->rxqs); + nic_dev->rxqs = NULL; return err; } From b4845bb6383821a9516ce30af3a27dc873e37fd4 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 17 Oct 2024 11:00:52 +0200 Subject: [PATCH 294/330] x86/xen: add central hypercall functions Add generic hypercall functions usable for all normal (i.e. not iret) hypercalls. Depending on the guest type and the processor vendor different functions need to be used due to the to be used instruction for entering the hypervisor: - PV guests need to use syscall - HVM/PVH guests on Intel need to use vmcall - HVM/PVH guests on AMD and Hygon need to use vmmcall As PVH guests need to issue hypercalls very early during boot, there is a 4th hypercall function needed for HVM/PVH which can be used on Intel and AMD processors. It will check the vendor type and then set the Intel or AMD specific function to use via static_call(). This is part of XSA-466 / CVE-2024-53241. Reported-by: Andrew Cooper Signed-off-by: Juergen Gross Co-developed-by: Peter Zijlstra --- arch/x86/include/asm/xen/hypercall.h | 3 + arch/x86/xen/enlighten.c | 65 ++++++++++++++++++++++ arch/x86/xen/enlighten_hvm.c | 4 ++ arch/x86/xen/enlighten_pv.c | 4 +- arch/x86/xen/xen-asm.S | 23 ++++++++ arch/x86/xen/xen-head.S | 83 ++++++++++++++++++++++++++++ arch/x86/xen/xen-ops.h | 9 +++ 7 files changed, 190 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index a2dd24947eb85..6b4dd4de08a6d 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -88,6 +88,9 @@ struct xen_dm_op_buf; extern struct { char _entry[32]; } hypercall_page[]; +void xen_hypercall_func(void); +DECLARE_STATIC_CALL(xen_hypercall, xen_hypercall_func); + #define __HYPERCALL "call hypercall_page+%c[offset]" #define __HYPERCALL_ENTRY(x) \ [offset] "i" (__HYPERVISOR_##x * sizeof(hypercall_page[0])) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 84e5adbd0925c..1887435af2fbf 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -23,6 +24,9 @@ EXPORT_SYMBOL_GPL(hypercall_page); +DEFINE_STATIC_CALL(xen_hypercall, xen_hypercall_hvm); +EXPORT_STATIC_CALL_TRAMP(xen_hypercall); + /* * Pointer to the xen_vcpu_info structure or * &HYPERVISOR_shared_info->vcpu_info[cpu]. See xen_hvm_init_shared_info @@ -68,6 +72,67 @@ EXPORT_SYMBOL(xen_start_flags); */ struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info; +static __ref void xen_get_vendor(void) +{ + init_cpu_devs(); + cpu_detect(&boot_cpu_data); + get_cpu_vendor(&boot_cpu_data); +} + +void xen_hypercall_setfunc(void) +{ + if (static_call_query(xen_hypercall) != xen_hypercall_hvm) + return; + + if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)) + static_call_update(xen_hypercall, xen_hypercall_amd); + else + static_call_update(xen_hypercall, xen_hypercall_intel); +} + +/* + * Evaluate processor vendor in order to select the correct hypercall + * function for HVM/PVH guests. + * Might be called very early in boot before vendor has been set by + * early_cpu_init(). + */ +noinstr void *__xen_hypercall_setfunc(void) +{ + void (*func)(void); + + /* + * Xen is supported only on CPUs with CPUID, so testing for + * X86_FEATURE_CPUID is a test for early_cpu_init() having been + * run. + * + * Note that __xen_hypercall_setfunc() is noinstr only due to a nasty + * dependency chain: it is being called via the xen_hypercall static + * call when running as a PVH or HVM guest. Hypercalls need to be + * noinstr due to PV guests using hypercalls in noinstr code. So we + * can safely tag the function body as "instrumentation ok", since + * the PV guest requirement is not of interest here (xen_get_vendor() + * calls noinstr functions, and static_call_update_early() might do + * so, too). + */ + instrumentation_begin(); + + if (!boot_cpu_has(X86_FEATURE_CPUID)) + xen_get_vendor(); + + if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)) + func = xen_hypercall_amd; + else + func = xen_hypercall_intel; + + static_call_update_early(xen_hypercall, func); + + instrumentation_end(); + + return func; +} + static int xen_cpu_up_online(unsigned int cpu) { xen_init_lock_cpu(cpu); diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index 24d2957a4726d..973a74fc966a4 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -300,6 +300,10 @@ static uint32_t __init xen_platform_hvm(void) if (xen_pv_domain()) return 0; + /* Set correct hypercall function. */ + if (xen_domain) + xen_hypercall_setfunc(); + if (xen_pvh_domain() && nopv) { /* Guest booting via the Xen-PVH boot entry goes here */ pr_info("\"nopv\" parameter is ignored in PVH guest\n"); diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index d6818c6cafda1..a8eb7e0c473cf 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -1341,6 +1341,9 @@ asmlinkage __visible void __init xen_start_kernel(struct start_info *si) xen_domain_type = XEN_PV_DOMAIN; xen_start_flags = xen_start_info->flags; + /* Interrupts are guaranteed to be off initially. */ + early_boot_irqs_disabled = true; + static_call_update_early(xen_hypercall, xen_hypercall_pv); xen_setup_features(); @@ -1431,7 +1434,6 @@ asmlinkage __visible void __init xen_start_kernel(struct start_info *si) WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_pv, xen_cpu_dead_pv)); local_irq_disable(); - early_boot_irqs_disabled = true; xen_raw_console_write("mapping kernel into physical memory\n"); xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index ca6edfe4c14b1..b518f36d1ca2e 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -20,9 +20,32 @@ #include #include +#include #include <../entry/calling.h> .pushsection .noinstr.text, "ax" +/* + * PV hypercall interface to the hypervisor. + * + * Called via inline asm(), so better preserve %rcx and %r11. + * + * Input: + * %eax: hypercall number + * %rdi, %rsi, %rdx, %r10, %r8: args 1..5 for the hypercall + * Output: %rax + */ +SYM_FUNC_START(xen_hypercall_pv) + ANNOTATE_NOENDBR + push %rcx + push %r11 + UNWIND_HINT_SAVE + syscall + UNWIND_HINT_RESTORE + pop %r11 + pop %rcx + RET +SYM_FUNC_END(xen_hypercall_pv) + /* * Disabling events is simply a matter of making the event mask * non-zero. diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 7f6c69dbb8165..c173ba6740e93 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -6,9 +6,11 @@ #include #include +#include #include #include +#include #include #include #include @@ -87,6 +89,87 @@ SYM_CODE_END(xen_cpu_bringup_again) #endif #endif + .pushsection .noinstr.text, "ax" +/* + * Xen hypercall interface to the hypervisor. + * + * Input: + * %eax: hypercall number + * 32-bit: + * %ebx, %ecx, %edx, %esi, %edi: args 1..5 for the hypercall + * 64-bit: + * %rdi, %rsi, %rdx, %r10, %r8: args 1..5 for the hypercall + * Output: %[er]ax + */ +SYM_FUNC_START(xen_hypercall_hvm) + ENDBR + FRAME_BEGIN + /* Save all relevant registers (caller save and arguments). */ +#ifdef CONFIG_X86_32 + push %eax + push %ebx + push %ecx + push %edx + push %esi + push %edi +#else + push %rax + push %rcx + push %rdx + push %rdi + push %rsi + push %r11 + push %r10 + push %r9 + push %r8 +#ifdef CONFIG_FRAME_POINTER + pushq $0 /* Dummy push for stack alignment. */ +#endif +#endif + /* Set the vendor specific function. */ + call __xen_hypercall_setfunc + /* Set ZF = 1 if AMD, Restore saved registers. */ +#ifdef CONFIG_X86_32 + lea xen_hypercall_amd, %ebx + cmp %eax, %ebx + pop %edi + pop %esi + pop %edx + pop %ecx + pop %ebx + pop %eax +#else + lea xen_hypercall_amd(%rip), %rbx + cmp %rax, %rbx +#ifdef CONFIG_FRAME_POINTER + pop %rax /* Dummy pop. */ +#endif + pop %r8 + pop %r9 + pop %r10 + pop %r11 + pop %rsi + pop %rdi + pop %rdx + pop %rcx + pop %rax +#endif + /* Use correct hypercall function. */ + jz xen_hypercall_amd + jmp xen_hypercall_intel +SYM_FUNC_END(xen_hypercall_hvm) + +SYM_FUNC_START(xen_hypercall_amd) + vmmcall + RET +SYM_FUNC_END(xen_hypercall_amd) + +SYM_FUNC_START(xen_hypercall_intel) + vmcall + RET +SYM_FUNC_END(xen_hypercall_intel) + .popsection + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6") ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0") diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index e1b782e823e6b..63c13a2ccf556 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -326,4 +326,13 @@ static inline void xen_smp_intr_free_pv(unsigned int cpu) {} static inline void xen_smp_count_cpus(void) { } #endif /* CONFIG_SMP */ +#ifdef CONFIG_XEN_PV +void xen_hypercall_pv(void); +#endif +void xen_hypercall_hvm(void); +void xen_hypercall_amd(void); +void xen_hypercall_intel(void); +void xen_hypercall_setfunc(void); +void *__xen_hypercall_setfunc(void); + #endif /* XEN_OPS_H */ From b1c2cb86f4a7861480ad54bb9a58df3cbebf8e92 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 17 Oct 2024 14:47:13 +0200 Subject: [PATCH 295/330] x86/xen: use new hypercall functions instead of hypercall page Call the Xen hypervisor via the new xen_hypercall_func static-call instead of the hypercall page. This is part of XSA-466 / CVE-2024-53241. Reported-by: Andrew Cooper Signed-off-by: Juergen Gross Co-developed-by: Peter Zijlstra Co-developed-by: Josh Poimboeuf --- arch/x86/include/asm/xen/hypercall.h | 33 +++++++++++++++++----------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 6b4dd4de08a6d..7d5f8ad667741 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -39,9 +39,11 @@ #include #include #include +#include #include +#include #include #include #include @@ -91,9 +93,17 @@ extern struct { char _entry[32]; } hypercall_page[]; void xen_hypercall_func(void); DECLARE_STATIC_CALL(xen_hypercall, xen_hypercall_func); -#define __HYPERCALL "call hypercall_page+%c[offset]" -#define __HYPERCALL_ENTRY(x) \ - [offset] "i" (__HYPERVISOR_##x * sizeof(hypercall_page[0])) +#ifdef MODULE +#define __ADDRESSABLE_xen_hypercall +#else +#define __ADDRESSABLE_xen_hypercall __ADDRESSABLE_ASM_STR(__SCK__xen_hypercall) +#endif + +#define __HYPERCALL \ + __ADDRESSABLE_xen_hypercall \ + "call __SCT__xen_hypercall" + +#define __HYPERCALL_ENTRY(x) "a" (x) #ifdef CONFIG_X86_32 #define __HYPERCALL_RETREG "eax" @@ -151,7 +161,7 @@ DECLARE_STATIC_CALL(xen_hypercall, xen_hypercall_func); __HYPERCALL_0ARG(); \ asm volatile (__HYPERCALL \ : __HYPERCALL_0PARAM \ - : __HYPERCALL_ENTRY(name) \ + : __HYPERCALL_ENTRY(__HYPERVISOR_ ## name) \ : __HYPERCALL_CLOBBER0); \ (type)__res; \ }) @@ -162,7 +172,7 @@ DECLARE_STATIC_CALL(xen_hypercall, xen_hypercall_func); __HYPERCALL_1ARG(a1); \ asm volatile (__HYPERCALL \ : __HYPERCALL_1PARAM \ - : __HYPERCALL_ENTRY(name) \ + : __HYPERCALL_ENTRY(__HYPERVISOR_ ## name) \ : __HYPERCALL_CLOBBER1); \ (type)__res; \ }) @@ -173,7 +183,7 @@ DECLARE_STATIC_CALL(xen_hypercall, xen_hypercall_func); __HYPERCALL_2ARG(a1, a2); \ asm volatile (__HYPERCALL \ : __HYPERCALL_2PARAM \ - : __HYPERCALL_ENTRY(name) \ + : __HYPERCALL_ENTRY(__HYPERVISOR_ ## name) \ : __HYPERCALL_CLOBBER2); \ (type)__res; \ }) @@ -184,7 +194,7 @@ DECLARE_STATIC_CALL(xen_hypercall, xen_hypercall_func); __HYPERCALL_3ARG(a1, a2, a3); \ asm volatile (__HYPERCALL \ : __HYPERCALL_3PARAM \ - : __HYPERCALL_ENTRY(name) \ + : __HYPERCALL_ENTRY(__HYPERVISOR_ ## name) \ : __HYPERCALL_CLOBBER3); \ (type)__res; \ }) @@ -195,7 +205,7 @@ DECLARE_STATIC_CALL(xen_hypercall, xen_hypercall_func); __HYPERCALL_4ARG(a1, a2, a3, a4); \ asm volatile (__HYPERCALL \ : __HYPERCALL_4PARAM \ - : __HYPERCALL_ENTRY(name) \ + : __HYPERCALL_ENTRY(__HYPERVISOR_ ## name) \ : __HYPERCALL_CLOBBER4); \ (type)__res; \ }) @@ -209,12 +219,9 @@ xen_single_call(unsigned int call, __HYPERCALL_DECLS; __HYPERCALL_5ARG(a1, a2, a3, a4, a5); - if (call >= PAGE_SIZE / sizeof(hypercall_page[0])) - return -EINVAL; - - asm volatile(CALL_NOSPEC + asm volatile(__HYPERCALL : __HYPERCALL_5PARAM - : [thunk_target] "a" (&hypercall_page[call]) + : __HYPERCALL_ENTRY(call) : __HYPERCALL_CLOBBER5); return (long)__res; From 7fa0da5373685e7ed249af3fa317ab1e1ba8b0a6 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 17 Oct 2024 15:27:31 +0200 Subject: [PATCH 296/330] x86/xen: remove hypercall page The hypercall page is no longer needed. It can be removed, as from the Xen perspective it is optional. But, from Linux's perspective, it removes naked RET instructions that escape the speculative protections that Call Depth Tracking and/or Untrain Ret are trying to achieve. This is part of XSA-466 / CVE-2024-53241. Reported-by: Andrew Cooper Signed-off-by: Juergen Gross Reviewed-by: Andrew Cooper Reviewed-by: Jan Beulich --- arch/x86/include/asm/xen/hypercall.h | 2 -- arch/x86/kernel/callthunks.c | 5 ----- arch/x86/kernel/vmlinux.lds.S | 4 ---- arch/x86/xen/enlighten.c | 2 -- arch/x86/xen/enlighten_hvm.c | 9 +-------- arch/x86/xen/enlighten_pvh.c | 7 ------- arch/x86/xen/xen-head.S | 24 ------------------------ 7 files changed, 1 insertion(+), 52 deletions(-) diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 7d5f8ad667741..97771b9d33af3 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -88,8 +88,6 @@ struct xen_dm_op_buf; * there aren't more than 5 arguments...) */ -extern struct { char _entry[32]; } hypercall_page[]; - void xen_hypercall_func(void); DECLARE_STATIC_CALL(xen_hypercall, xen_hypercall_func); diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index 4656474567533..f17d166078823 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -142,11 +142,6 @@ static bool skip_addr(void *dest) if (dest >= (void *)relocate_kernel && dest < (void*)relocate_kernel + KEXEC_CONTROL_CODE_MAX_SIZE) return true; -#endif -#ifdef CONFIG_XEN - if (dest >= (void *)hypercall_page && - dest < (void*)hypercall_page + PAGE_SIZE) - return true; #endif return false; } diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index fab3ac9a4574a..6a17396c8174e 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -519,14 +519,10 @@ INIT_PER_CPU(irq_stack_backing_store); * linker will never mark as relocatable. (Using just ABSOLUTE() is not * sufficient for that). */ -#ifdef CONFIG_XEN #ifdef CONFIG_XEN_PV xen_elfnote_entry_value = ABSOLUTE(xen_elfnote_entry) + ABSOLUTE(startup_xen); #endif -xen_elfnote_hypercall_page_value = - ABSOLUTE(xen_elfnote_hypercall_page) + ABSOLUTE(hypercall_page); -#endif #ifdef CONFIG_PVH xen_elfnote_phys32_entry_value = ABSOLUTE(xen_elfnote_phys32_entry) + ABSOLUTE(pvh_start_xen - LOAD_OFFSET); diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 1887435af2fbf..43dcd8c7badc0 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -22,8 +22,6 @@ #include "xen-ops.h" -EXPORT_SYMBOL_GPL(hypercall_page); - DEFINE_STATIC_CALL(xen_hypercall, xen_hypercall_hvm); EXPORT_STATIC_CALL_TRAMP(xen_hypercall); diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index 973a74fc966a4..fe57ff85d004b 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -106,15 +106,8 @@ static void __init init_hvm_pv_info(void) /* PVH set up hypercall page in xen_prepare_pvh(). */ if (xen_pvh_domain()) pv_info.name = "Xen PVH"; - else { - u64 pfn; - uint32_t msr; - + else pv_info.name = "Xen HVM"; - msr = cpuid_ebx(base + 2); - pfn = __pa(hypercall_page); - wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); - } xen_setup_features(); diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c index bf68c329fc013..0e3d930bcb89e 100644 --- a/arch/x86/xen/enlighten_pvh.c +++ b/arch/x86/xen/enlighten_pvh.c @@ -129,17 +129,10 @@ static void __init pvh_arch_setup(void) void __init xen_pvh_init(struct boot_params *boot_params) { - u32 msr; - u64 pfn; - xen_pvh = 1; xen_domain_type = XEN_HVM_DOMAIN; xen_start_flags = pvh_start_info.flags; - msr = cpuid_ebx(xen_cpuid_base() + 2); - pfn = __pa(hypercall_page); - wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); - x86_init.oem.arch_setup = pvh_arch_setup; x86_init.oem.banner = xen_banner; diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index c173ba6740e93..9252652afe596 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -22,28 +22,6 @@ #include #include -.pushsection .noinstr.text, "ax" - .balign PAGE_SIZE -SYM_CODE_START(hypercall_page) - .rept (PAGE_SIZE / 32) - UNWIND_HINT_FUNC - ANNOTATE_NOENDBR - ANNOTATE_UNRET_SAFE - ret - /* - * Xen will write the hypercall page, and sort out ENDBR. - */ - .skip 31, 0xcc - .endr - -#define HYPERCALL(n) \ - .equ xen_hypercall_##n, hypercall_page + __HYPERVISOR_##n * 32; \ - .type xen_hypercall_##n, @function; .size xen_hypercall_##n, 32 -#include -#undef HYPERCALL -SYM_CODE_END(hypercall_page) -.popsection - #ifdef CONFIG_XEN_PV __INIT SYM_CODE_START(startup_xen) @@ -199,8 +177,6 @@ SYM_FUNC_END(xen_hypercall_intel) #else # define FEATURES_DOM0 0 #endif - ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .globl xen_elfnote_hypercall_page; - xen_elfnote_hypercall_page: _ASM_PTR xen_elfnote_hypercall_page_value - .) ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES, .long FEATURES_PV | FEATURES_PVH | FEATURES_DOM0) ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") From 94901b7a74d82bfd30420f1d9d00898278fdc8bf Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Thu, 12 Dec 2024 22:00:15 +0900 Subject: [PATCH 297/330] rust: net::phy fix module autoloading The alias symbol name was renamed. Adjust module_phy_driver macro to create the proper symbol name to fix module autoloading. Fixes: 054a9cd395a7 ("modpost: rename alias symbol for MODULE_DEVICE_TABLE()") Signed-off-by: FUJITA Tomonori Link: https://patch.msgid.link/20241212130015.238863-1-fujita.tomonori@gmail.com Signed-off-by: Paolo Abeni --- rust/kernel/net/phy.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rust/kernel/net/phy.rs b/rust/kernel/net/phy.rs index b89c681d97c01..2fbfb6a94c11e 100644 --- a/rust/kernel/net/phy.rs +++ b/rust/kernel/net/phy.rs @@ -860,7 +860,7 @@ impl DeviceMask { /// ]; /// #[cfg(MODULE)] /// #[no_mangle] -/// static __mod_mdio__phydev_device_table: [::kernel::bindings::mdio_device_id; 2] = _DEVICE_TABLE; +/// static __mod_device_table__mdio__phydev: [::kernel::bindings::mdio_device_id; 2] = _DEVICE_TABLE; /// ``` #[macro_export] macro_rules! module_phy_driver { @@ -883,7 +883,7 @@ macro_rules! module_phy_driver { #[cfg(MODULE)] #[no_mangle] - static __mod_mdio__phydev_device_table: [$crate::bindings::mdio_device_id; + static __mod_device_table__mdio__phydev: [$crate::bindings::mdio_device_id; $crate::module_phy_driver!(@count_devices $($dev),+) + 1] = _DEVICE_TABLE; }; From 7d2f320e12744e5906a4fab40381060a81d22c12 Mon Sep 17 00:00:00 2001 From: Parthiban Veerasooran Date: Fri, 13 Dec 2024 18:01:58 +0530 Subject: [PATCH 298/330] net: ethernet: oa_tc6: fix infinite loop error when tx credits becomes 0 SPI thread wakes up to perform SPI transfer whenever there is an TX skb from n/w stack or interrupt from MAC-PHY. Ethernet frame from TX skb is transferred based on the availability tx credits in the MAC-PHY which is reported from the previous SPI transfer. Sometimes there is a possibility that TX skb is available to transmit but there is no tx credits from MAC-PHY. In this case, there will not be any SPI transfer but the thread will be running in an endless loop until tx credits available again. So checking the availability of tx credits along with TX skb will prevent the above infinite loop. When the tx credits available again that will be notified through interrupt which will trigger the SPI transfer to get the available tx credits. Fixes: 53fbde8ab21e ("net: ethernet: oa_tc6: implement transmit path to transfer tx ethernet frames") Reviewed-by: Jacob Keller Signed-off-by: Parthiban Veerasooran Signed-off-by: Paolo Abeni --- drivers/net/ethernet/oa_tc6.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/oa_tc6.c b/drivers/net/ethernet/oa_tc6.c index f9c0dcd965c2e..4c8b0ca922b72 100644 --- a/drivers/net/ethernet/oa_tc6.c +++ b/drivers/net/ethernet/oa_tc6.c @@ -1111,8 +1111,9 @@ static int oa_tc6_spi_thread_handler(void *data) /* This kthread will be waken up if there is a tx skb or mac-phy * interrupt to perform spi transfer with tx chunks. */ - wait_event_interruptible(tc6->spi_wq, tc6->waiting_tx_skb || - tc6->int_flag || + wait_event_interruptible(tc6->spi_wq, tc6->int_flag || + (tc6->waiting_tx_skb && + tc6->tx_credits) || kthread_should_stop()); if (kthread_should_stop()) From e592b5110b3e9393881b0a019d86832bbf71a47f Mon Sep 17 00:00:00 2001 From: Parthiban Veerasooran Date: Fri, 13 Dec 2024 18:01:59 +0530 Subject: [PATCH 299/330] net: ethernet: oa_tc6: fix tx skb race condition between reference pointers There are two skb pointers to manage tx skb's enqueued from n/w stack. waiting_tx_skb pointer points to the tx skb which needs to be processed and ongoing_tx_skb pointer points to the tx skb which is being processed. SPI thread prepares the tx data chunks from the tx skb pointed by the ongoing_tx_skb pointer. When the tx skb pointed by the ongoing_tx_skb is processed, the tx skb pointed by the waiting_tx_skb is assigned to ongoing_tx_skb and the waiting_tx_skb pointer is assigned with NULL. Whenever there is a new tx skb from n/w stack, it will be assigned to waiting_tx_skb pointer if it is NULL. Enqueuing and processing of a tx skb handled in two different threads. Consider a scenario where the SPI thread processed an ongoing_tx_skb and it moves next tx skb from waiting_tx_skb pointer to ongoing_tx_skb pointer without doing any NULL check. At this time, if the waiting_tx_skb pointer is NULL then ongoing_tx_skb pointer is also assigned with NULL. After that, if a new tx skb is assigned to waiting_tx_skb pointer by the n/w stack and there is a chance to overwrite the tx skb pointer with NULL in the SPI thread. Finally one of the tx skb will be left as unhandled, resulting packet missing and memory leak. - Consider the below scenario where the TXC reported from the previous transfer is 10 and ongoing_tx_skb holds an tx ethernet frame which can be transported in 20 TXCs and waiting_tx_skb is still NULL. tx_credits = 10; /* 21 are filled in the previous transfer */ ongoing_tx_skb = 20; waiting_tx_skb = NULL; /* Still NULL */ - So, (tc6->ongoing_tx_skb || tc6->waiting_tx_skb) becomes true. - After oa_tc6_prepare_spi_tx_buf_for_tx_skbs() ongoing_tx_skb = 10; waiting_tx_skb = NULL; /* Still NULL */ - Perform SPI transfer. - Process SPI rx buffer to get the TXC from footers. - Now let's assume previously filled 21 TXCs are freed so we are good to transport the next remaining 10 tx chunks from ongoing_tx_skb. tx_credits = 21; ongoing_tx_skb = 10; waiting_tx_skb = NULL; - So, (tc6->ongoing_tx_skb || tc6->waiting_tx_skb) becomes true again. - In the oa_tc6_prepare_spi_tx_buf_for_tx_skbs() ongoing_tx_skb = NULL; waiting_tx_skb = NULL; - Now the below bad case might happen, Thread1 (oa_tc6_start_xmit) Thread2 (oa_tc6_spi_thread_handler) --------------------------- ----------------------------------- - if waiting_tx_skb is NULL - if ongoing_tx_skb is NULL - ongoing_tx_skb = waiting_tx_skb - waiting_tx_skb = skb - waiting_tx_skb = NULL ... - ongoing_tx_skb = NULL - if waiting_tx_skb is NULL - waiting_tx_skb = skb To overcome the above issue, protect the moving of tx skb reference from waiting_tx_skb pointer to ongoing_tx_skb pointer and assigning new tx skb to waiting_tx_skb pointer, so that the other thread can't access the waiting_tx_skb pointer until the current thread completes moving the tx skb reference safely. Fixes: 53fbde8ab21e ("net: ethernet: oa_tc6: implement transmit path to transfer tx ethernet frames") Signed-off-by: Parthiban Veerasooran Reviewed-by: Larysa Zaremba Signed-off-by: Paolo Abeni --- drivers/net/ethernet/oa_tc6.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/oa_tc6.c b/drivers/net/ethernet/oa_tc6.c index 4c8b0ca922b72..db200e4ec284d 100644 --- a/drivers/net/ethernet/oa_tc6.c +++ b/drivers/net/ethernet/oa_tc6.c @@ -113,6 +113,7 @@ struct oa_tc6 { struct mii_bus *mdiobus; struct spi_device *spi; struct mutex spi_ctrl_lock; /* Protects spi control transfer */ + spinlock_t tx_skb_lock; /* Protects tx skb handling */ void *spi_ctrl_tx_buf; void *spi_ctrl_rx_buf; void *spi_data_tx_buf; @@ -1004,8 +1005,10 @@ static u16 oa_tc6_prepare_spi_tx_buf_for_tx_skbs(struct oa_tc6 *tc6) for (used_tx_credits = 0; used_tx_credits < tc6->tx_credits; used_tx_credits++) { if (!tc6->ongoing_tx_skb) { + spin_lock_bh(&tc6->tx_skb_lock); tc6->ongoing_tx_skb = tc6->waiting_tx_skb; tc6->waiting_tx_skb = NULL; + spin_unlock_bh(&tc6->tx_skb_lock); } if (!tc6->ongoing_tx_skb) break; @@ -1210,7 +1213,9 @@ netdev_tx_t oa_tc6_start_xmit(struct oa_tc6 *tc6, struct sk_buff *skb) return NETDEV_TX_OK; } + spin_lock_bh(&tc6->tx_skb_lock); tc6->waiting_tx_skb = skb; + spin_unlock_bh(&tc6->tx_skb_lock); /* Wake spi kthread to perform spi transfer */ wake_up_interruptible(&tc6->spi_wq); @@ -1240,6 +1245,7 @@ struct oa_tc6 *oa_tc6_init(struct spi_device *spi, struct net_device *netdev) tc6->netdev = netdev; SET_NETDEV_DEV(netdev, &spi->dev); mutex_init(&tc6->spi_ctrl_lock); + spin_lock_init(&tc6->tx_skb_lock); /* Set the SPI controller to pump at realtime priority */ tc6->spi->rt = true; From 0cb2c504d79e7caa3abade3f466750c82ad26f01 Mon Sep 17 00:00:00 2001 From: Joe Hattori Date: Sat, 14 Dec 2024 10:49:12 +0900 Subject: [PATCH 300/330] net: ethernet: bgmac-platform: fix an OF node reference leak The OF node obtained by of_parse_phandle() is not freed. Call of_node_put() to balance the refcount. This bug was found by an experimental static analysis tool that I am developing. Fixes: 1676aba5ef7e ("net: ethernet: bgmac: device tree phy enablement") Signed-off-by: Joe Hattori Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241214014912.2810315-1-joe@pf.is.s.u-tokyo.ac.jp Signed-off-by: Paolo Abeni --- drivers/net/ethernet/broadcom/bgmac-platform.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bgmac-platform.c b/drivers/net/ethernet/broadcom/bgmac-platform.c index ecce23cecbea4..4e266ce41180b 100644 --- a/drivers/net/ethernet/broadcom/bgmac-platform.c +++ b/drivers/net/ethernet/broadcom/bgmac-platform.c @@ -171,6 +171,7 @@ static int platform_phy_connect(struct bgmac *bgmac) static int bgmac_probe(struct platform_device *pdev) { struct device_node *np = pdev->dev.of_node; + struct device_node *phy_node; struct bgmac *bgmac; struct resource *regs; int ret; @@ -236,7 +237,9 @@ static int bgmac_probe(struct platform_device *pdev) bgmac->cco_ctl_maskset = platform_bgmac_cco_ctl_maskset; bgmac->get_bus_clock = platform_bgmac_get_bus_clock; bgmac->cmn_maskset32 = platform_bgmac_cmn_maskset32; - if (of_parse_phandle(np, "phy-handle", 0)) { + phy_node = of_parse_phandle(np, "phy-handle", 0); + if (phy_node) { + of_node_put(phy_node); bgmac->phy_connect = platform_phy_connect; } else { bgmac->phy_connect = bgmac_phy_connect_direct; From 7ed2d91588779f0a2b27fd502ce2aaf1fab9b3ca Mon Sep 17 00:00:00 2001 From: Gianfranco Trad Date: Sun, 15 Dec 2024 02:17:34 +0100 Subject: [PATCH 301/330] qed: fix possible uninit pointer read in qed_mcp_nvm_info_populate() Coverity reports an uninit pointer read in qed_mcp_nvm_info_populate(). If EOPNOTSUPP is returned from qed_mcp_bist_nvm_get_num_images() ensure nvm_info.num_images is set to 0 to avoid possible uninit assignment to p_hwfn->nvm_info.image_att later on in out label. Closes: https://scan5.scan.coverity.com/#/project-view/63204/10063?selectedIssue=1636666 Suggested-by: Simon Horman Signed-off-by: Gianfranco Trad Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241215011733.351325-2-gianf.trad@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/qlogic/qed/qed_mcp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c index b45efc272fdbb..c7f497c36f66c 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c +++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c @@ -3358,6 +3358,7 @@ int qed_mcp_nvm_info_populate(struct qed_hwfn *p_hwfn) p_ptt, &nvm_info.num_images); if (rc == -EOPNOTSUPP) { DP_INFO(p_hwfn, "DRV_MSG_CODE_BIST_TEST is not supported\n"); + nvm_info.num_images = 0; goto out; } else if (rc || !nvm_info.num_images) { DP_ERR(p_hwfn, "Failed getting number of images\n"); From a6629626c584200daf495cc9a740048b455addcd Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 16 Dec 2024 21:41:19 -0500 Subject: [PATCH 302/330] tracing: Fix test_event_printk() to process entire print argument The test_event_printk() analyzes print formats of trace events looking for cases where it may dereference a pointer that is not in the ring buffer which can possibly be a bug when the trace event is read from the ring buffer and the content of that pointer no longer exists. The function needs to accurately go from one print format argument to the next. It handles quotes and parenthesis that may be included in an argument. When it finds the start of the next argument, it uses a simple "c = strstr(fmt + i, ',')" to find the end of that argument! In order to include "%s" dereferencing, it needs to process the entire content of the print format argument and not just the content of the first ',' it finds. As there may be content like: ({ const char *saved_ptr = trace_seq_buffer_ptr(p); static const char *access_str[] = { "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux" }; union kvm_mmu_page_role role; role.word = REC->role; trace_seq_printf(p, "sp gen %u gfn %llx l%u %u-byte q%u%s %s%s" " %snxe %sad root %u %s%c", REC->mmu_valid_gen, REC->gfn, role.level, role.has_4_byte_gpte ? 4 : 8, role.quadrant, role.direct ? " direct" : "", access_str[role.access], role.invalid ? " invalid" : "", role.efer_nx ? "" : "!", role.ad_disabled ? "!" : "", REC->root_count, REC->unsync ? "unsync" : "sync", 0); saved_ptr; }) Which is an example of a full argument of an existing event. As the code already handles finding the next print format argument, process the argument at the end of it and not the start of it. This way it has both the start of the argument as well as the end of it. Add a helper function "process_pointer()" that will do the processing during the loop as well as at the end. It also makes the code cleaner and easier to read. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Al Viro Cc: Linus Torvalds Link: https://lore.kernel.org/20241217024720.362271189@goodmis.org Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 82 ++++++++++++++++++++++++------------- 1 file changed, 53 insertions(+), 29 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 77e68efbd43e2..14e160a5b9054 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -265,8 +265,7 @@ static bool test_field(const char *fmt, struct trace_event_call *call) len = p - fmt; for (; field->type; field++) { - if (strncmp(field->name, fmt, len) || - field->name[len]) + if (strncmp(field->name, fmt, len) || field->name[len]) continue; array_descriptor = strchr(field->type, '['); /* This is an array and is OK to dereference. */ @@ -275,6 +274,32 @@ static bool test_field(const char *fmt, struct trace_event_call *call) return false; } +/* Return true if the argument pointer is safe */ +static bool process_pointer(const char *fmt, int len, struct trace_event_call *call) +{ + const char *r, *e, *a; + + e = fmt + len; + + /* Find the REC-> in the argument */ + r = strstr(fmt, "REC->"); + if (r && r < e) { + /* + * Addresses of events on the buffer, or an array on the buffer is + * OK to dereference. There's ways to fool this, but + * this is to catch common mistakes, not malicious code. + */ + a = strchr(fmt, '&'); + if ((a && (a < r)) || test_field(r, call)) + return true; + } else if ((r = strstr(fmt, "__get_dynamic_array(")) && r < e) { + return true; + } else if ((r = strstr(fmt, "__get_sockaddr(")) && r < e) { + return true; + } + return false; +} + /* * Examine the print fmt of the event looking for unsafe dereference * pointers using %p* that could be recorded in the trace event and @@ -285,12 +310,12 @@ static void test_event_printk(struct trace_event_call *call) { u64 dereference_flags = 0; bool first = true; - const char *fmt, *c, *r, *a; + const char *fmt; int parens = 0; char in_quote = 0; int start_arg = 0; int arg = 0; - int i; + int i, e; fmt = call->print_fmt; @@ -403,42 +428,41 @@ static void test_event_printk(struct trace_event_call *call) case ',': if (in_quote || parens) continue; + e = i; i++; while (isspace(fmt[i])) i++; - start_arg = i; - if (!(dereference_flags & (1ULL << arg))) - goto next_arg; - /* Find the REC-> in the argument */ - c = strchr(fmt + i, ','); - r = strstr(fmt + i, "REC->"); - if (r && (!c || r < c)) { - /* - * Addresses of events on the buffer, - * or an array on the buffer is - * OK to dereference. - * There's ways to fool this, but - * this is to catch common mistakes, - * not malicious code. - */ - a = strchr(fmt + i, '&'); - if ((a && (a < r)) || test_field(r, call)) + /* + * If start_arg is zero, then this is the start of the + * first argument. The processing of the argument happens + * when the end of the argument is found, as it needs to + * handle paranthesis and such. + */ + if (!start_arg) { + start_arg = i; + /* Balance out the i++ in the for loop */ + i--; + continue; + } + + if (dereference_flags & (1ULL << arg)) { + if (process_pointer(fmt + start_arg, e - start_arg, call)) dereference_flags &= ~(1ULL << arg); - } else if ((r = strstr(fmt + i, "__get_dynamic_array(")) && - (!c || r < c)) { - dereference_flags &= ~(1ULL << arg); - } else if ((r = strstr(fmt + i, "__get_sockaddr(")) && - (!c || r < c)) { - dereference_flags &= ~(1ULL << arg); } - next_arg: - i--; + start_arg = i; arg++; + /* Balance out the i++ in the for loop */ + i--; } } + if (dereference_flags & (1ULL << arg)) { + if (process_pointer(fmt + start_arg, i - start_arg, call)) + dereference_flags &= ~(1ULL << arg); + } + /* * If you triggered the below warning, the trace event reported * uses an unsafe dereference pointer %p*. As the data stored From 917110481f6bc1c96b1e54b62bb114137fbc6d17 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 16 Dec 2024 21:41:20 -0500 Subject: [PATCH 303/330] tracing: Add missing helper functions in event pointer dereference check The process_pointer() helper function looks to see if various trace event macros are used. These macros are for storing data in the event. This makes it safe to dereference as the dereference will then point into the event on the ring buffer where the content of the data stays with the event itself. A few helper functions were missing. Those were: __get_rel_dynamic_array() __get_dynamic_array_len() __get_rel_dynamic_array_len() __get_rel_sockaddr() Also add a helper function find_print_string() to not need to use a middle man variable to test if the string exists. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Al Viro Cc: Linus Torvalds Link: https://lore.kernel.org/20241217024720.521836792@goodmis.org Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 14e160a5b9054..df75c06bb23fc 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -274,6 +274,15 @@ static bool test_field(const char *fmt, struct trace_event_call *call) return false; } +/* Look for a string within an argument */ +static bool find_print_string(const char *arg, const char *str, const char *end) +{ + const char *r; + + r = strstr(arg, str); + return r && r < end; +} + /* Return true if the argument pointer is safe */ static bool process_pointer(const char *fmt, int len, struct trace_event_call *call) { @@ -292,9 +301,17 @@ static bool process_pointer(const char *fmt, int len, struct trace_event_call *c a = strchr(fmt, '&'); if ((a && (a < r)) || test_field(r, call)) return true; - } else if ((r = strstr(fmt, "__get_dynamic_array(")) && r < e) { + } else if (find_print_string(fmt, "__get_dynamic_array(", e)) { + return true; + } else if (find_print_string(fmt, "__get_rel_dynamic_array(", e)) { + return true; + } else if (find_print_string(fmt, "__get_dynamic_array_len(", e)) { + return true; + } else if (find_print_string(fmt, "__get_rel_dynamic_array_len(", e)) { + return true; + } else if (find_print_string(fmt, "__get_sockaddr(", e)) { return true; - } else if ((r = strstr(fmt, "__get_sockaddr(")) && r < e) { + } else if (find_print_string(fmt, "__get_rel_sockaddr(", e)) { return true; } return false; From 65a25d9f7ac02e0cf361356e834d1c71d36acca9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 16 Dec 2024 21:41:21 -0500 Subject: [PATCH 304/330] tracing: Add "%s" check in test_event_printk() The test_event_printk() code makes sure that when a trace event is registered, any dereferenced pointers in from the event's TP_printk() are pointing to content in the ring buffer. But currently it does not handle "%s", as there's cases where the string pointer saved in the ring buffer points to a static string in the kernel that will never be freed. As that is a valid case, the pointer needs to be checked at runtime. Currently the runtime check is done via trace_check_vprintf(), but to not have to replicate everything in vsnprintf() it does some logic with the va_list that may not be reliable across architectures. In order to get rid of that logic, more work in the test_event_printk() needs to be done. Some of the strings can be validated at this time when it is obvious the string is valid because the string will be saved in the ring buffer content. Do all the validation of strings in the ring buffer at boot in test_event_printk(), and make sure that the field of the strings that point into the kernel are accessible. This will allow adding checks at runtime that will validate the fields themselves and not rely on paring the TP_printk() format at runtime. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Al Viro Cc: Linus Torvalds Link: https://lore.kernel.org/20241217024720.685917008@goodmis.org Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 104 ++++++++++++++++++++++++++++++------ 1 file changed, 89 insertions(+), 15 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index df75c06bb23fc..521ad2fd1fe70 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -244,19 +244,16 @@ int trace_event_get_offsets(struct trace_event_call *call) return tail->offset + tail->size; } -/* - * Check if the referenced field is an array and return true, - * as arrays are OK to dereference. - */ -static bool test_field(const char *fmt, struct trace_event_call *call) + +static struct trace_event_fields *find_event_field(const char *fmt, + struct trace_event_call *call) { struct trace_event_fields *field = call->class->fields_array; - const char *array_descriptor; const char *p = fmt; int len; if (!(len = str_has_prefix(fmt, "REC->"))) - return false; + return NULL; fmt += len; for (p = fmt; *p; p++) { if (!isalnum(*p) && *p != '_') @@ -267,11 +264,26 @@ static bool test_field(const char *fmt, struct trace_event_call *call) for (; field->type; field++) { if (strncmp(field->name, fmt, len) || field->name[len]) continue; - array_descriptor = strchr(field->type, '['); - /* This is an array and is OK to dereference. */ - return array_descriptor != NULL; + + return field; } - return false; + return NULL; +} + +/* + * Check if the referenced field is an array and return true, + * as arrays are OK to dereference. + */ +static bool test_field(const char *fmt, struct trace_event_call *call) +{ + struct trace_event_fields *field; + + field = find_event_field(fmt, call); + if (!field) + return false; + + /* This is an array and is OK to dereference. */ + return strchr(field->type, '[') != NULL; } /* Look for a string within an argument */ @@ -317,6 +329,53 @@ static bool process_pointer(const char *fmt, int len, struct trace_event_call *c return false; } +/* Return true if the string is safe */ +static bool process_string(const char *fmt, int len, struct trace_event_call *call) +{ + const char *r, *e, *s; + + e = fmt + len; + + /* + * There are several helper functions that return strings. + * If the argument contains a function, then assume its field is valid. + * It is considered that the argument has a function if it has: + * alphanumeric or '_' before a parenthesis. + */ + s = fmt; + do { + r = strstr(s, "("); + if (!r || r >= e) + break; + for (int i = 1; r - i >= s; i++) { + char ch = *(r - i); + if (isspace(ch)) + continue; + if (isalnum(ch) || ch == '_') + return true; + /* Anything else, this isn't a function */ + break; + } + /* A function could be wrapped in parethesis, try the next one */ + s = r + 1; + } while (s < e); + + /* + * If there's any strings in the argument consider this arg OK as it + * could be: REC->field ? "foo" : "bar" and we don't want to get into + * verifying that logic here. + */ + if (find_print_string(fmt, "\"", e)) + return true; + + /* Dereferenced strings are also valid like any other pointer */ + if (process_pointer(fmt, len, call)) + return true; + + /* Make sure the field is found, and consider it OK for now if it is */ + return find_event_field(fmt, call) != NULL; +} + /* * Examine the print fmt of the event looking for unsafe dereference * pointers using %p* that could be recorded in the trace event and @@ -326,6 +385,7 @@ static bool process_pointer(const char *fmt, int len, struct trace_event_call *c static void test_event_printk(struct trace_event_call *call) { u64 dereference_flags = 0; + u64 string_flags = 0; bool first = true; const char *fmt; int parens = 0; @@ -416,8 +476,16 @@ static void test_event_printk(struct trace_event_call *call) star = true; continue; } - if ((fmt[i + j] == 's') && star) - arg++; + if ((fmt[i + j] == 's')) { + if (star) + arg++; + if (WARN_ONCE(arg == 63, + "Too many args for event: %s", + trace_event_name(call))) + return; + dereference_flags |= 1ULL << arg; + string_flags |= 1ULL << arg; + } break; } break; @@ -464,7 +532,10 @@ static void test_event_printk(struct trace_event_call *call) } if (dereference_flags & (1ULL << arg)) { - if (process_pointer(fmt + start_arg, e - start_arg, call)) + if (string_flags & (1ULL << arg)) { + if (process_string(fmt + start_arg, e - start_arg, call)) + dereference_flags &= ~(1ULL << arg); + } else if (process_pointer(fmt + start_arg, e - start_arg, call)) dereference_flags &= ~(1ULL << arg); } @@ -476,7 +547,10 @@ static void test_event_printk(struct trace_event_call *call) } if (dereference_flags & (1ULL << arg)) { - if (process_pointer(fmt + start_arg, i - start_arg, call)) + if (string_flags & (1ULL << arg)) { + if (process_string(fmt + start_arg, i - start_arg, call)) + dereference_flags &= ~(1ULL << arg); + } else if (process_pointer(fmt + start_arg, i - start_arg, call)) dereference_flags &= ~(1ULL << arg); } From afd2627f727b89496d79a6b934a025fc916d4ded Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 16 Dec 2024 21:41:22 -0500 Subject: [PATCH 305/330] tracing: Check "%s" dereference via the field and not the TP_printk format The TP_printk() portion of a trace event is executed at the time a event is read from the trace. This can happen seconds, minutes, hours, days, months, years possibly later since the event was recorded. If the print format contains a dereference to a string via "%s", and that string was allocated, there's a chance that string could be freed before it is read by the trace file. To protect against such bugs, there are two functions that verify the event. The first one is test_event_printk(), which is called when the event is created. It reads the TP_printk() format as well as its arguments to make sure nothing may be dereferencing a pointer that was not copied into the ring buffer along with the event. If it is, it will trigger a WARN_ON(). For strings that use "%s", it is not so easy. The string may not reside in the ring buffer but may still be valid. Strings that are static and part of the kernel proper which will not be freed for the life of the running system, are safe to dereference. But to know if it is a pointer to a static string or to something on the heap can not be determined until the event is triggered. This brings us to the second function that tests for the bad dereferencing of strings, trace_check_vprintf(). It would walk through the printf format looking for "%s", and when it finds it, it would validate that the pointer is safe to read. If not, it would produces a WARN_ON() as well and write into the ring buffer "[UNSAFE-MEMORY]". The problem with this is how it used va_list to have vsnprintf() handle all the cases that it didn't need to check. Instead of re-implementing vsnprintf(), it would make a copy of the format up to the %s part, and call vsnprintf() with the current va_list ap variable, where the ap would then be ready to point at the string in question. For architectures that passed va_list by reference this was possible. For architectures that passed it by copy it was not. A test_can_verify() function was used to differentiate between the two, and if it wasn't possible, it would disable it. Even for architectures where this was feasible, it was a stretch to rely on such a method that is undocumented, and could cause issues later on with new optimizations of the compiler. Instead, the first function test_event_printk() was updated to look at "%s" as well. If the "%s" argument is a pointer outside the event in the ring buffer, it would find the field type of the event that is the problem and mark the structure with a new flag called "needs_test". The event itself will be marked by TRACE_EVENT_FL_TEST_STR to let it be known that this event has a field that needs to be verified before the event can be printed using the printf format. When the event fields are created from the field type structure, the fields would copy the field type's "needs_test" value. Finally, before being printed, a new function ignore_event() is called which will check if the event has the TEST_STR flag set (if not, it returns false). If the flag is set, it then iterates through the events fields looking for the ones that have the "needs_test" flag set. Then it uses the offset field from the field structure to find the pointer in the ring buffer event. It runs the tests to make sure that pointer is safe to print and if not, it triggers the WARN_ON() and also adds to the trace output that the event in question has an unsafe memory access. The ignore_event() makes the trace_check_vprintf() obsolete so it is removed. Link: https://lore.kernel.org/all/CAHk-=wh3uOnqnZPpR0PeLZZtyWbZLboZ7cHLCKRWsocvs9Y7hQ@mail.gmail.com/ Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Al Viro Cc: Linus Torvalds Link: https://lore.kernel.org/20241217024720.848621576@goodmis.org Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers") Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 6 +- kernel/trace/trace.c | 255 ++++++++--------------------------- kernel/trace/trace.h | 6 +- kernel/trace/trace_events.c | 32 +++-- kernel/trace/trace_output.c | 6 +- 5 files changed, 88 insertions(+), 217 deletions(-) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 2a5df5b62cfc7..91b8ffbdfa8c4 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -273,7 +273,8 @@ struct trace_event_fields { const char *name; const int size; const int align; - const int is_signed; + const unsigned int is_signed:1; + unsigned int needs_test:1; const int filter_type; const int len; }; @@ -324,6 +325,7 @@ enum { TRACE_EVENT_FL_EPROBE_BIT, TRACE_EVENT_FL_FPROBE_BIT, TRACE_EVENT_FL_CUSTOM_BIT, + TRACE_EVENT_FL_TEST_STR_BIT, }; /* @@ -340,6 +342,7 @@ enum { * CUSTOM - Event is a custom event (to be attached to an exsiting tracepoint) * This is set when the custom event has not been attached * to a tracepoint yet, then it is cleared when it is. + * TEST_STR - The event has a "%s" that points to a string outside the event */ enum { TRACE_EVENT_FL_CAP_ANY = (1 << TRACE_EVENT_FL_CAP_ANY_BIT), @@ -352,6 +355,7 @@ enum { TRACE_EVENT_FL_EPROBE = (1 << TRACE_EVENT_FL_EPROBE_BIT), TRACE_EVENT_FL_FPROBE = (1 << TRACE_EVENT_FL_FPROBE_BIT), TRACE_EVENT_FL_CUSTOM = (1 << TRACE_EVENT_FL_CUSTOM_BIT), + TRACE_EVENT_FL_TEST_STR = (1 << TRACE_EVENT_FL_TEST_STR_BIT), }; #define TRACE_EVENT_FL_UKPROBE (TRACE_EVENT_FL_KPROBE | TRACE_EVENT_FL_UPROBE) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index be62f0ea1814d..7cc18b9bce271 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3611,17 +3611,12 @@ char *trace_iter_expand_format(struct trace_iterator *iter) } /* Returns true if the string is safe to dereference from an event */ -static bool trace_safe_str(struct trace_iterator *iter, const char *str, - bool star, int len) +static bool trace_safe_str(struct trace_iterator *iter, const char *str) { unsigned long addr = (unsigned long)str; struct trace_event *trace_event; struct trace_event_call *event; - /* Ignore strings with no length */ - if (star && !len) - return true; - /* OK if part of the event data */ if ((addr >= (unsigned long)iter->ent) && (addr < (unsigned long)iter->ent + iter->ent_size)) @@ -3661,181 +3656,69 @@ static bool trace_safe_str(struct trace_iterator *iter, const char *str, return false; } -static DEFINE_STATIC_KEY_FALSE(trace_no_verify); - -static int test_can_verify_check(const char *fmt, ...) -{ - char buf[16]; - va_list ap; - int ret; - - /* - * The verifier is dependent on vsnprintf() modifies the va_list - * passed to it, where it is sent as a reference. Some architectures - * (like x86_32) passes it by value, which means that vsnprintf() - * does not modify the va_list passed to it, and the verifier - * would then need to be able to understand all the values that - * vsnprintf can use. If it is passed by value, then the verifier - * is disabled. - */ - va_start(ap, fmt); - vsnprintf(buf, 16, "%d", ap); - ret = va_arg(ap, int); - va_end(ap); - - return ret; -} - -static void test_can_verify(void) -{ - if (!test_can_verify_check("%d %d", 0, 1)) { - pr_info("trace event string verifier disabled\n"); - static_branch_inc(&trace_no_verify); - } -} - /** - * trace_check_vprintf - Check dereferenced strings while writing to the seq buffer + * ignore_event - Check dereferenced fields while writing to the seq buffer * @iter: The iterator that holds the seq buffer and the event being printed - * @fmt: The format used to print the event - * @ap: The va_list holding the data to print from @fmt. * - * This writes the data into the @iter->seq buffer using the data from - * @fmt and @ap. If the format has a %s, then the source of the string - * is examined to make sure it is safe to print, otherwise it will - * warn and print "[UNSAFE MEMORY]" in place of the dereferenced string - * pointer. + * At boot up, test_event_printk() will flag any event that dereferences + * a string with "%s" that does exist in the ring buffer. It may still + * be valid, as the string may point to a static string in the kernel + * rodata that never gets freed. But if the string pointer is pointing + * to something that was allocated, there's a chance that it can be freed + * by the time the user reads the trace. This would cause a bad memory + * access by the kernel and possibly crash the system. + * + * This function will check if the event has any fields flagged as needing + * to be checked at runtime and perform those checks. + * + * If it is found that a field is unsafe, it will write into the @iter->seq + * a message stating what was found to be unsafe. + * + * @return: true if the event is unsafe and should be ignored, + * false otherwise. */ -void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, - va_list ap) +bool ignore_event(struct trace_iterator *iter) { - long text_delta = 0; - long data_delta = 0; - const char *p = fmt; - const char *str; - bool good; - int i, j; + struct ftrace_event_field *field; + struct trace_event *trace_event; + struct trace_event_call *event; + struct list_head *head; + struct trace_seq *seq; + const void *ptr; - if (WARN_ON_ONCE(!fmt)) - return; + trace_event = ftrace_find_event(iter->ent->type); - if (static_branch_unlikely(&trace_no_verify)) - goto print; + seq = &iter->seq; - /* - * When the kernel is booted with the tp_printk command line - * parameter, trace events go directly through to printk(). - * It also is checked by this function, but it does not - * have an associated trace_array (tr) for it. - */ - if (iter->tr) { - text_delta = iter->tr->text_delta; - data_delta = iter->tr->data_delta; + if (!trace_event) { + trace_seq_printf(seq, "EVENT ID %d NOT FOUND?\n", iter->ent->type); + return true; } - /* Don't bother checking when doing a ftrace_dump() */ - if (iter->fmt == static_fmt_buf) - goto print; - - while (*p) { - bool star = false; - int len = 0; - - j = 0; - - /* - * We only care about %s and variants - * as well as %p[sS] if delta is non-zero - */ - for (i = 0; p[i]; i++) { - if (i + 1 >= iter->fmt_size) { - /* - * If we can't expand the copy buffer, - * just print it. - */ - if (!trace_iter_expand_format(iter)) - goto print; - } - - if (p[i] == '\\' && p[i+1]) { - i++; - continue; - } - if (p[i] == '%') { - /* Need to test cases like %08.*s */ - for (j = 1; p[i+j]; j++) { - if (isdigit(p[i+j]) || - p[i+j] == '.') - continue; - if (p[i+j] == '*') { - star = true; - continue; - } - break; - } - if (p[i+j] == 's') - break; - - if (text_delta && p[i+1] == 'p' && - ((p[i+2] == 's' || p[i+2] == 'S'))) - break; - - star = false; - } - j = 0; - } - /* If no %s found then just print normally */ - if (!p[i]) - break; - - /* Copy up to the %s, and print that */ - strncpy(iter->fmt, p, i); - iter->fmt[i] = '\0'; - trace_seq_vprintf(&iter->seq, iter->fmt, ap); + event = container_of(trace_event, struct trace_event_call, event); + if (!(event->flags & TRACE_EVENT_FL_TEST_STR)) + return false; - /* Add delta to %pS pointers */ - if (p[i+1] == 'p') { - unsigned long addr; - char fmt[4]; + head = trace_get_fields(event); + if (!head) { + trace_seq_printf(seq, "FIELDS FOR EVENT '%s' NOT FOUND?\n", + trace_event_name(event)); + return true; + } - fmt[0] = '%'; - fmt[1] = 'p'; - fmt[2] = p[i+2]; /* Either %ps or %pS */ - fmt[3] = '\0'; + /* Offsets are from the iter->ent that points to the raw event */ + ptr = iter->ent; - addr = va_arg(ap, unsigned long); - addr += text_delta; - trace_seq_printf(&iter->seq, fmt, (void *)addr); + list_for_each_entry(field, head, link) { + const char *str; + bool good; - p += i + 3; + if (!field->needs_test) continue; - } - /* - * If iter->seq is full, the above call no longer guarantees - * that ap is in sync with fmt processing, and further calls - * to va_arg() can return wrong positional arguments. - * - * Ensure that ap is no longer used in this case. - */ - if (iter->seq.full) { - p = ""; - break; - } - - if (star) - len = va_arg(ap, int); - - /* The ap now points to the string data of the %s */ - str = va_arg(ap, const char *); + str = *(const char **)(ptr + field->offset); - good = trace_safe_str(iter, str, star, len); - - /* Could be from the last boot */ - if (data_delta && !good) { - str += data_delta; - good = trace_safe_str(iter, str, star, len); - } + good = trace_safe_str(iter, str); /* * If you hit this warning, it is likely that the @@ -3846,44 +3729,14 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, * instead. See samples/trace_events/trace-events-sample.h * for reference. */ - if (WARN_ONCE(!good, "fmt: '%s' current_buffer: '%s'", - fmt, seq_buf_str(&iter->seq.seq))) { - int ret; - - /* Try to safely read the string */ - if (star) { - if (len + 1 > iter->fmt_size) - len = iter->fmt_size - 1; - if (len < 0) - len = 0; - ret = copy_from_kernel_nofault(iter->fmt, str, len); - iter->fmt[len] = 0; - star = false; - } else { - ret = strncpy_from_kernel_nofault(iter->fmt, str, - iter->fmt_size); - } - if (ret < 0) - trace_seq_printf(&iter->seq, "(0x%px)", str); - else - trace_seq_printf(&iter->seq, "(0x%px:%s)", - str, iter->fmt); - str = "[UNSAFE-MEMORY]"; - strcpy(iter->fmt, "%s"); - } else { - strncpy(iter->fmt, p + i, j + 1); - iter->fmt[j+1] = '\0'; + if (WARN_ONCE(!good, "event '%s' has unsafe pointer field '%s'", + trace_event_name(event), field->name)) { + trace_seq_printf(seq, "EVENT %s: HAS UNSAFE POINTER FIELD '%s'\n", + trace_event_name(event), field->name); + return true; } - if (star) - trace_seq_printf(&iter->seq, iter->fmt, len, str); - else - trace_seq_printf(&iter->seq, iter->fmt, str); - - p += i + j + 1; } - print: - if (*p) - trace_seq_vprintf(&iter->seq, p, ap); + return false; } const char *trace_event_format(struct trace_iterator *iter, const char *fmt) @@ -10777,8 +10630,6 @@ __init static int tracer_alloc_buffers(void) register_snapshot_cmd(); - test_can_verify(); - return 0; out_free_pipe_cpumask: diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 266740b4e1212..9691b47b5f3da 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -667,9 +667,8 @@ void trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer, bool trace_is_tracepoint_string(const char *str); const char *trace_event_format(struct trace_iterator *iter, const char *fmt); -void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, - va_list ap) __printf(2, 0); char *trace_iter_expand_format(struct trace_iterator *iter); +bool ignore_event(struct trace_iterator *iter); int trace_empty(struct trace_iterator *iter); @@ -1413,7 +1412,8 @@ struct ftrace_event_field { int filter_type; int offset; int size; - int is_signed; + unsigned int is_signed:1; + unsigned int needs_test:1; int len; }; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 521ad2fd1fe70..1545cc8b49d02 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -82,7 +82,7 @@ static int system_refcount_dec(struct event_subsystem *system) } static struct ftrace_event_field * -__find_event_field(struct list_head *head, char *name) +__find_event_field(struct list_head *head, const char *name) { struct ftrace_event_field *field; @@ -114,7 +114,8 @@ trace_find_event_field(struct trace_event_call *call, char *name) static int __trace_define_field(struct list_head *head, const char *type, const char *name, int offset, int size, - int is_signed, int filter_type, int len) + int is_signed, int filter_type, int len, + int need_test) { struct ftrace_event_field *field; @@ -133,6 +134,7 @@ static int __trace_define_field(struct list_head *head, const char *type, field->offset = offset; field->size = size; field->is_signed = is_signed; + field->needs_test = need_test; field->len = len; list_add(&field->link, head); @@ -151,13 +153,13 @@ int trace_define_field(struct trace_event_call *call, const char *type, head = trace_get_fields(call); return __trace_define_field(head, type, name, offset, size, - is_signed, filter_type, 0); + is_signed, filter_type, 0, 0); } EXPORT_SYMBOL_GPL(trace_define_field); static int trace_define_field_ext(struct trace_event_call *call, const char *type, const char *name, int offset, int size, int is_signed, - int filter_type, int len) + int filter_type, int len, int need_test) { struct list_head *head; @@ -166,13 +168,13 @@ static int trace_define_field_ext(struct trace_event_call *call, const char *typ head = trace_get_fields(call); return __trace_define_field(head, type, name, offset, size, - is_signed, filter_type, len); + is_signed, filter_type, len, need_test); } #define __generic_field(type, item, filter_type) \ ret = __trace_define_field(&ftrace_generic_fields, #type, \ #item, 0, 0, is_signed_type(type), \ - filter_type, 0); \ + filter_type, 0, 0); \ if (ret) \ return ret; @@ -181,7 +183,8 @@ static int trace_define_field_ext(struct trace_event_call *call, const char *typ "common_" #item, \ offsetof(typeof(ent), item), \ sizeof(ent.item), \ - is_signed_type(type), FILTER_OTHER, 0); \ + is_signed_type(type), FILTER_OTHER, \ + 0, 0); \ if (ret) \ return ret; @@ -332,6 +335,7 @@ static bool process_pointer(const char *fmt, int len, struct trace_event_call *c /* Return true if the string is safe */ static bool process_string(const char *fmt, int len, struct trace_event_call *call) { + struct trace_event_fields *field; const char *r, *e, *s; e = fmt + len; @@ -372,8 +376,16 @@ static bool process_string(const char *fmt, int len, struct trace_event_call *ca if (process_pointer(fmt, len, call)) return true; - /* Make sure the field is found, and consider it OK for now if it is */ - return find_event_field(fmt, call) != NULL; + /* Make sure the field is found */ + field = find_event_field(fmt, call); + if (!field) + return false; + + /* Test this field's string before printing the event */ + call->flags |= TRACE_EVENT_FL_TEST_STR; + field->needs_test = 1; + + return true; } /* @@ -2586,7 +2598,7 @@ event_define_fields(struct trace_event_call *call) ret = trace_define_field_ext(call, field->type, field->name, offset, field->size, field->is_signed, field->filter_type, - field->len); + field->len, field->needs_test); if (WARN_ON_ONCE(ret)) { pr_err("error code is %d\n", ret); break; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index da748b7cbc4d5..03d56f711ad14 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -317,10 +317,14 @@ EXPORT_SYMBOL(trace_raw_output_prep); void trace_event_printf(struct trace_iterator *iter, const char *fmt, ...) { + struct trace_seq *s = &iter->seq; va_list ap; + if (ignore_event(iter)) + return; + va_start(ap, fmt); - trace_check_vprintf(iter, trace_event_format(iter, fmt), ap); + trace_seq_vprintf(s, trace_event_format(iter, fmt), ap); va_end(ap); } EXPORT_SYMBOL(trace_event_printf); From d75d72a858f0c00ca8ae161b48cdb403807be4de Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 13 Nov 2024 11:11:55 -0500 Subject: [PATCH 306/330] btrfs: fix improper generation check in snapshot delete We have been using the following check if (generation <= root->root_key.offset) to make decisions about whether or not to visit a node during snapshot delete. This is because for normal subvolumes this is set to 0, and for snapshots it's set to the creation generation. The idea being that if the generation of the node is less than or equal to our creation generation then we don't need to visit that node, because it doesn't belong to us, we can simply drop our reference and move on. However reloc roots don't have their generation stored in root->root_key.offset, instead that is the objectid of their corresponding fs root. This means we can incorrectly not walk into nodes that need to be dropped when deleting a reloc root. There are a variety of consequences to making the wrong choice in two distinct areas. visit_node_for_delete() 1. False positive. We think we are newer than the block when we really aren't. We don't visit the node and drop our reference to the node and carry on. This would result in leaked space. 2. False negative. We do decide to walk down into a block that we should have just dropped our reference to. However this means that the child node will have refs > 1, so we will switch to UPDATE_BACKREF, and then the subsequent walk_down_proc() will notice that btrfs_header_owner(node) != root->root_key.objectid and it'll break out of the loop, and then walk_up_proc() will drop our reference, so this appears to be ok. do_walk_down() 1. False positive. We are in UPDATE_BACKREF and incorrectly decide that we are done and don't need to update the backref for our lower nodes. This is another case that simply won't happen with relocation, as we only have to do UPDATE_BACKREF if the node below us was shared and didn't have FULL_BACKREF set, and since we don't own that node because we're a reloc root we actually won't end up in this case. 2. False negative. Again this is tricky because as described above, we simply wouldn't be here from relocation, because we don't own any of the nodes because we never set btrfs_header_owner() to the reloc root objectid, and we always use FULL_BACKREF, we never actually need to set FULL_BACKREF on any children. Having spent a lot of time stressing relocation/snapshot delete recently I've not seen this pop in practice. But this is objectively incorrect, so fix this to get the correct starting generation based on the root we're dropping to keep me from thinking there's a problem here. CC: stable@vger.kernel.org Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 19 +++++++++++++++++++ fs/btrfs/extent-tree.c | 6 +++--- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 307dedf95c706..2c341956a01ce 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -370,6 +370,25 @@ static inline void btrfs_set_root_last_trans(struct btrfs_root *root, u64 transi WRITE_ONCE(root->last_trans, transid); } +/* + * Return the generation this root started with. + * + * Every normal root that is created with root->root_key.offset set to it's + * originating generation. If it is a snapshot it is the generation when the + * snapshot was created. + * + * However for TREE_RELOC roots root_key.offset is the objectid of the owning + * tree root. Thankfully we copy the root item of the owning tree root, which + * has it's last_snapshot set to what we would have root_key.offset set to, so + * return that if this is a TREE_RELOC root. + */ +static inline u64 btrfs_root_origin_generation(const struct btrfs_root *root) +{ + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) + return btrfs_root_last_snapshot(&root->root_item); + return root->root_key.offset; +} + /* * Structure that conveys information about an extent that is going to replace * all the extents in a file range. diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 412e318e4a22d..43a771f7bd7af 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5285,7 +5285,7 @@ static bool visit_node_for_delete(struct btrfs_root *root, struct walk_control * * reference to it. */ generation = btrfs_node_ptr_generation(eb, slot); - if (!wc->update_ref || generation <= root->root_key.offset) + if (!wc->update_ref || generation <= btrfs_root_origin_generation(root)) return false; /* @@ -5340,7 +5340,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, goto reada; if (wc->stage == UPDATE_BACKREF && - generation <= root->root_key.offset) + generation <= btrfs_root_origin_generation(root)) continue; /* We don't lock the tree block, it's OK to be racy here */ @@ -5683,7 +5683,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, * for the subtree */ if (wc->stage == UPDATE_BACKREF && - generation <= root->root_key.offset) { + generation <= btrfs_root_origin_generation(root)) { wc->lookup_info = 1; return 1; } From 6c3864e055486fadb5b97793b57688082e14b43b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Nov 2024 07:26:32 +0100 Subject: [PATCH 307/330] btrfs: use bio_is_zone_append() in the completion handler Otherwise it won't catch bios turned into regular writes by the block level zone write plugging. The additional test it adds is for emulated zone append. Fixes: 9b1ce7f0c6f8 ("block: Implement zone append emulation") CC: stable@vger.kernel.org # 6.12 Reviewed-by: Johannes Thumshirn Reviewed-by: Damien Le Moal Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/bio.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 1f216d07eff65..011cc97be3b57 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -355,7 +355,7 @@ static void btrfs_simple_end_io(struct bio *bio) INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); } else { - if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status) + if (bio_is_zone_append(bio) && !bio->bi_status) btrfs_record_physical_zoned(bbio); btrfs_bio_end_io(bbio, bbio->bio.bi_status); } @@ -398,7 +398,7 @@ static void btrfs_orig_write_end_io(struct bio *bio) else bio->bi_status = BLK_STS_OK; - if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status) + if (bio_is_zone_append(bio) && !bio->bi_status) stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; btrfs_bio_end_io(bbio, bbio->bio.bi_status); @@ -412,7 +412,7 @@ static void btrfs_clone_write_end_io(struct bio *bio) if (bio->bi_status) { atomic_inc(&stripe->bioc->error); btrfs_log_dev_io_error(bio, stripe->dev); - } else if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + } else if (bio_is_zone_append(bio)) { stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; } From be691b5e593f2cc8cef67bbc59c1fb91b74a86a9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Nov 2024 07:26:33 +0100 Subject: [PATCH 308/330] btrfs: split bios to the fs sector size boundary Btrfs like other file systems can't really deal with I/O not aligned to it's internal block size (which strangely is called sector size in btrfs, for historical reasons), but the block layer split helper doesn't even know about that. Round down the split boundary so that all I/Os are aligned. Fixes: d5e4377d5051 ("btrfs: split zone append bios in btrfs_submit_bio") CC: stable@vger.kernel.org # 6.12 Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/bio.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 011cc97be3b57..78f5606baacb5 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -649,8 +649,14 @@ static u64 btrfs_append_map_length(struct btrfs_bio *bbio, u64 map_length) map_length = min(map_length, bbio->fs_info->max_zone_append_size); sector_offset = bio_split_rw_at(&bbio->bio, &bbio->fs_info->limits, &nr_segs, map_length); - if (sector_offset) - return sector_offset << SECTOR_SHIFT; + if (sector_offset) { + /* + * bio_split_rw_at() could split at a size smaller than our + * sectorsize and thus cause unaligned I/Os. Fix that by + * always rounding down to the nearest boundary. + */ + return ALIGN_DOWN(sector_offset << SECTOR_SHIFT, bbio->fs_info->sectorsize); + } return map_length; } From dfb92681a19e1d5172420baa242806414b3eff6f Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 4 Dec 2024 13:30:46 +1030 Subject: [PATCH 309/330] btrfs: tree-checker: reject inline extent items with 0 ref count [BUG] There is a bug report in the mailing list where btrfs_run_delayed_refs() failed to drop the ref count for logical 25870311358464 num_bytes 2113536. The involved leaf dump looks like this: item 166 key (25870311358464 168 2113536) itemoff 10091 itemsize 50 extent refs 1 gen 84178 flags 1 ref#0: shared data backref parent 32399126528000 count 0 <<< ref#1: shared data backref parent 31808973717504 count 1 Notice the count number is 0. [CAUSE] There is no concrete evidence yet, but considering 0 -> 1 is also a single bit flipped, it's possible that hardware memory bitflip is involved, causing the on-disk extent tree to be corrupted. [FIX] To prevent us reading such corrupted extent item, or writing such damaged extent item back to disk, enhance the handling of BTRFS_EXTENT_DATA_REF_KEY and BTRFS_SHARED_DATA_REF_KEY keys for both inlined and key items, to detect such 0 ref count and reject them. CC: stable@vger.kernel.org # 5.4+ Link: https://lore.kernel.org/linux-btrfs/7c69dd49-c346-4806-86e7-e6f863a66f48@app.fastmail.com/ Reported-by: Frankie Fisher Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-checker.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 148d8cefa40ee..dfeee033f31fb 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1527,6 +1527,11 @@ static int check_extent_item(struct extent_buffer *leaf, dref_offset, fs_info->sectorsize); return -EUCLEAN; } + if (unlikely(btrfs_extent_data_ref_count(leaf, dref) == 0)) { + extent_err(leaf, slot, + "invalid data ref count, should have non-zero value"); + return -EUCLEAN; + } inline_refs += btrfs_extent_data_ref_count(leaf, dref); break; /* Contains parent bytenr and ref count */ @@ -1539,6 +1544,11 @@ static int check_extent_item(struct extent_buffer *leaf, inline_offset, fs_info->sectorsize); return -EUCLEAN; } + if (unlikely(btrfs_shared_data_ref_count(leaf, sref) == 0)) { + extent_err(leaf, slot, + "invalid shared data ref count, should have non-zero value"); + return -EUCLEAN; + } inline_refs += btrfs_shared_data_ref_count(leaf, sref); break; case BTRFS_EXTENT_OWNER_REF_KEY: @@ -1611,8 +1621,18 @@ static int check_simple_keyed_refs(struct extent_buffer *leaf, { u32 expect_item_size = 0; - if (key->type == BTRFS_SHARED_DATA_REF_KEY) + if (key->type == BTRFS_SHARED_DATA_REF_KEY) { + struct btrfs_shared_data_ref *sref; + + sref = btrfs_item_ptr(leaf, slot, struct btrfs_shared_data_ref); + if (unlikely(btrfs_shared_data_ref_count(leaf, sref) == 0)) { + extent_err(leaf, slot, + "invalid shared data backref count, should have non-zero value"); + return -EUCLEAN; + } + expect_item_size = sizeof(struct btrfs_shared_data_ref); + } if (unlikely(btrfs_item_size(leaf, slot) != expect_item_size)) { generic_err(leaf, slot, @@ -1689,6 +1709,11 @@ static int check_extent_data_ref(struct extent_buffer *leaf, offset, leaf->fs_info->sectorsize); return -EUCLEAN; } + if (unlikely(btrfs_extent_data_ref_count(leaf, dref) == 0)) { + extent_err(leaf, slot, + "invalid extent data backref count, should have non-zero value"); + return -EUCLEAN; + } } return 0; } From 93433c1d919775f8ac0f7893692f42e6731a5373 Mon Sep 17 00:00:00 2001 From: Joshua Hay Date: Mon, 25 Nov 2024 15:58:54 -0800 Subject: [PATCH 310/330] idpf: add support for SW triggered interrupts SW triggered interrupts are guaranteed to fire after their timer expires, unlike Tx and Rx interrupts which will only fire after the timer expires _and_ a descriptor write back is available to be processed by the driver. Add the necessary fields, defines, and initializations to enable a SW triggered interrupt in the vector's dyn_ctl register. Reviewed-by: Madhu Chittim Signed-off-by: Joshua Hay Tested-by: Krishneil Singh Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/idpf/idpf_dev.c | 3 +++ drivers/net/ethernet/intel/idpf/idpf_txrx.h | 8 +++++++- drivers/net/ethernet/intel/idpf/idpf_vf_dev.c | 3 +++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_dev.c b/drivers/net/ethernet/intel/idpf/idpf_dev.c index 6c913a703df64..41e4bd49402a8 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_dev.c +++ b/drivers/net/ethernet/intel/idpf/idpf_dev.c @@ -101,6 +101,9 @@ static int idpf_intr_reg_init(struct idpf_vport *vport) intr->dyn_ctl_itridx_s = PF_GLINT_DYN_CTL_ITR_INDX_S; intr->dyn_ctl_intrvl_s = PF_GLINT_DYN_CTL_INTERVAL_S; intr->dyn_ctl_wb_on_itr_m = PF_GLINT_DYN_CTL_WB_ON_ITR_M; + intr->dyn_ctl_swint_trig_m = PF_GLINT_DYN_CTL_SWINT_TRIG_M; + intr->dyn_ctl_sw_itridx_ena_m = + PF_GLINT_DYN_CTL_SW_ITR_INDX_ENA_M; spacing = IDPF_ITR_IDX_SPACING(reg_vals[vec_id].itrn_index_spacing, IDPF_PF_ITR_IDX_SPACING); diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.h b/drivers/net/ethernet/intel/idpf/idpf_txrx.h index 9c1fe84108ed2..0f71a6f5557b9 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.h +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.h @@ -354,6 +354,8 @@ struct idpf_vec_regs { * @dyn_ctl_itridx_m: Mask for ITR index * @dyn_ctl_intrvl_s: Register bit offset for ITR interval * @dyn_ctl_wb_on_itr_m: Mask for WB on ITR feature + * @dyn_ctl_sw_itridx_ena_m: Mask for SW ITR index + * @dyn_ctl_swint_trig_m: Mask for dyn_ctl SW triggered interrupt enable * @rx_itr: RX ITR register * @tx_itr: TX ITR register * @icr_ena: Interrupt cause register offset @@ -367,6 +369,8 @@ struct idpf_intr_reg { u32 dyn_ctl_itridx_m; u32 dyn_ctl_intrvl_s; u32 dyn_ctl_wb_on_itr_m; + u32 dyn_ctl_sw_itridx_ena_m; + u32 dyn_ctl_swint_trig_m; void __iomem *rx_itr; void __iomem *tx_itr; void __iomem *icr_ena; @@ -437,7 +441,7 @@ struct idpf_q_vector { cpumask_var_t affinity_mask; __cacheline_group_end_aligned(cold); }; -libeth_cacheline_set_assert(struct idpf_q_vector, 112, +libeth_cacheline_set_assert(struct idpf_q_vector, 120, 24 + sizeof(struct napi_struct) + 2 * sizeof(struct dim), 8 + sizeof(cpumask_var_t)); @@ -471,6 +475,8 @@ struct idpf_tx_queue_stats { #define IDPF_ITR_IS_DYNAMIC(itr_mode) (itr_mode) #define IDPF_ITR_TX_DEF IDPF_ITR_20K #define IDPF_ITR_RX_DEF IDPF_ITR_20K +/* Index used for 'SW ITR' update in DYN_CTL register */ +#define IDPF_SW_ITR_UPDATE_IDX 2 /* Index used for 'No ITR' update in DYN_CTL register */ #define IDPF_NO_ITR_UPDATE_IDX 3 #define IDPF_ITR_IDX_SPACING(spacing, dflt) (spacing ? spacing : dflt) diff --git a/drivers/net/ethernet/intel/idpf/idpf_vf_dev.c b/drivers/net/ethernet/intel/idpf/idpf_vf_dev.c index aad62e270ae40..aba828abcb171 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_vf_dev.c +++ b/drivers/net/ethernet/intel/idpf/idpf_vf_dev.c @@ -101,6 +101,9 @@ static int idpf_vf_intr_reg_init(struct idpf_vport *vport) intr->dyn_ctl_itridx_s = VF_INT_DYN_CTLN_ITR_INDX_S; intr->dyn_ctl_intrvl_s = VF_INT_DYN_CTLN_INTERVAL_S; intr->dyn_ctl_wb_on_itr_m = VF_INT_DYN_CTLN_WB_ON_ITR_M; + intr->dyn_ctl_swint_trig_m = VF_INT_DYN_CTLN_SWINT_TRIG_M; + intr->dyn_ctl_sw_itridx_ena_m = + VF_INT_DYN_CTLN_SW_ITR_INDX_ENA_M; spacing = IDPF_ITR_IDX_SPACING(reg_vals[vec_id].itrn_index_spacing, IDPF_VF_ITR_IDX_SPACING); From 0c1683c681681c14f4389e3bfa8de10baf242ba8 Mon Sep 17 00:00:00 2001 From: Joshua Hay Date: Mon, 25 Nov 2024 15:58:55 -0800 Subject: [PATCH 311/330] idpf: trigger SW interrupt when exiting wb_on_itr mode There is a race condition between exiting wb_on_itr and completion write backs. For example, we are in wb_on_itr mode and a Tx completion is generated by HW, ready to be written back, as we are re-enabling interrupts: HW SW | | | | idpf_tx_splitq_clean_all | | napi_complete_done | | | tx_completion_wb | idpf_vport_intr_update_itr_ena_irq That tx_completion_wb happens before the vector is fully re-enabled. Continuing with this example, it is a UDP stream and the tx_completion_wb is the last one in the flow (there are no rx packets). Because the HW generated the completion before the interrupt is fully enabled, the HW will not fire the interrupt once the timer expires and the write back will not happen. NAPI poll won't be called. We have indicated we're back in interrupt mode but nothing else will trigger the interrupt. Therefore, the completion goes unprocessed, triggering a Tx timeout. To mitigate this, fire a SW triggered interrupt upon exiting wb_on_itr. This interrupt will catch the rogue completion and avoid the timeout. Add logic to set the appropriate bits in the vector's dyn_ctl register. Fixes: 9c4a27da0ecc ("idpf: enable WB_ON_ITR") Reviewed-by: Madhu Chittim Signed-off-by: Joshua Hay Tested-by: Krishneil Singh Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 29 ++++++++++++++------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index 34f4118c7bc0d..2fa9c36e33c9c 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -3604,21 +3604,31 @@ static void idpf_vport_intr_dis_irq_all(struct idpf_vport *vport) /** * idpf_vport_intr_buildreg_itr - Enable default interrupt generation settings * @q_vector: pointer to q_vector - * @type: itr index - * @itr: itr value */ -static u32 idpf_vport_intr_buildreg_itr(struct idpf_q_vector *q_vector, - const int type, u16 itr) +static u32 idpf_vport_intr_buildreg_itr(struct idpf_q_vector *q_vector) { - u32 itr_val; + u32 itr_val = q_vector->intr_reg.dyn_ctl_intena_m; + int type = IDPF_NO_ITR_UPDATE_IDX; + u16 itr = 0; + + if (q_vector->wb_on_itr) { + /* + * Trigger a software interrupt when exiting wb_on_itr, to make + * sure we catch any pending write backs that might have been + * missed due to interrupt state transition. + */ + itr_val |= q_vector->intr_reg.dyn_ctl_swint_trig_m | + q_vector->intr_reg.dyn_ctl_sw_itridx_ena_m; + type = IDPF_SW_ITR_UPDATE_IDX; + itr = IDPF_ITR_20K; + } itr &= IDPF_ITR_MASK; /* Don't clear PBA because that can cause lost interrupts that * came in while we were cleaning/polling */ - itr_val = q_vector->intr_reg.dyn_ctl_intena_m | - (type << q_vector->intr_reg.dyn_ctl_itridx_s) | - (itr << (q_vector->intr_reg.dyn_ctl_intrvl_s - 1)); + itr_val |= (type << q_vector->intr_reg.dyn_ctl_itridx_s) | + (itr << (q_vector->intr_reg.dyn_ctl_intrvl_s - 1)); return itr_val; } @@ -3716,9 +3726,8 @@ void idpf_vport_intr_update_itr_ena_irq(struct idpf_q_vector *q_vector) /* net_dim() updates ITR out-of-band using a work item */ idpf_net_dim(q_vector); + intval = idpf_vport_intr_buildreg_itr(q_vector); q_vector->wb_on_itr = false; - intval = idpf_vport_intr_buildreg_itr(q_vector, - IDPF_NO_ITR_UPDATE_IDX, 0); writel(intval, q_vector->intr_reg.dyn_ctl); } From aef25be35d23ec768eed08bfcf7ca3cf9685bc28 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 21 Nov 2024 11:22:18 -0700 Subject: [PATCH 312/330] hexagon: Disable constant extender optimization for LLVM prior to 19.1.0 The Hexagon-specific constant extender optimization in LLVM may crash on Linux kernel code [1], such as fs/bcache/btree_io.c after commit 32ed4a620c54 ("bcachefs: Btree path tracepoints") in 6.12: clang: llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp:745: bool (anonymous namespace)::HexagonConstExtenders::ExtRoot::operator<(const HCE::ExtRoot &) const: Assertion `ThisB->getParent() == OtherB->getParent()' failed. Stack dump: 0. Program arguments: clang --target=hexagon-linux-musl ... fs/bcachefs/btree_io.c 1. parser at end of file 2. Code generation 3. Running pass 'Function Pass Manager' on module 'fs/bcachefs/btree_io.c'. 4. Running pass 'Hexagon constant-extender optimization' on function '@__btree_node_lock_nopath' Without assertions enabled, there is just a hang during compilation. This has been resolved in LLVM main (20.0.0) [2] and backported to LLVM 19.1.0 but the kernel supports LLVM 13.0.1 and newer, so disable the constant expander optimization using the '-mllvm' option when using a toolchain that is not fixed. Cc: stable@vger.kernel.org Link: https://github.com/llvm/llvm-project/issues/99714 [1] Link: https://github.com/llvm/llvm-project/commit/68df06a0b2998765cb0a41353fcf0919bbf57ddb [2] Link: https://github.com/llvm/llvm-project/commit/2ab8d93061581edad3501561722ebd5632d73892 [3] Reviewed-by: Brian Cain Signed-off-by: Nathan Chancellor Signed-off-by: Linus Torvalds --- arch/hexagon/Makefile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/hexagon/Makefile b/arch/hexagon/Makefile index 92d005958dfb2..ff172cbe5881a 100644 --- a/arch/hexagon/Makefile +++ b/arch/hexagon/Makefile @@ -32,3 +32,9 @@ KBUILD_LDFLAGS += $(ldflags-y) TIR_NAME := r19 KBUILD_CFLAGS += -ffixed-$(TIR_NAME) -DTHREADINFO_REG=$(TIR_NAME) -D__linux__ KBUILD_AFLAGS += -DTHREADINFO_REG=$(TIR_NAME) + +# Disable HexagonConstExtenders pass for LLVM versions prior to 19.1.0 +# https://github.com/llvm/llvm-project/issues/99714 +ifneq ($(call clang-min-version, 190100),y) +KBUILD_CFLAGS += -mllvm -hexagon-cext=false +endif From b9b8301d369b4c876de5255dbf067b19ba88ac71 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 16 Dec 2024 08:37:03 +0000 Subject: [PATCH 313/330] net: netdevsim: fix nsim_pp_hold_write() nsim_pp_hold_write() has two problems: 1) It may return with rtnl held, as found by syzbot. 2) Its return value does not propagate an error if any. Fixes: 1580cbcbfe77 ("net: netdevsim: add some fake page pool use") Reported-by: syzbot Signed-off-by: Eric Dumazet Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241216083703.1859921-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/net/netdevsim/netdev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c index 0be47fed4efc5..e068a9761c094 100644 --- a/drivers/net/netdevsim/netdev.c +++ b/drivers/net/netdevsim/netdev.c @@ -635,10 +635,10 @@ nsim_pp_hold_write(struct file *file, const char __user *data, page_pool_put_full_page(ns->page->pp, ns->page, false); ns->page = NULL; } - rtnl_unlock(); exit: - return count; + rtnl_unlock(); + return ret; } static const struct file_operations nsim_pp_hold_fops = { From 954a2b40719a21e763a1bba2f0da92347e058fce Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 16 Dec 2024 20:04:32 +0900 Subject: [PATCH 314/330] rtnetlink: Try the outer netns attribute in rtnl_get_peer_net(). Xiao Liang reported that the cited commit changed netns handling in newlink() of netkit, veth, and vxcan. Before the patch, if we don't find a netns attribute in the peer device attributes, we tried to find another netns attribute in the outer netlink attributes by passing it to rtnl_link_get_net(). Let's restore the original behaviour. Fixes: 48327566769a ("rtnetlink: fix double call of rtnl_link_get_net_ifla()") Reported-by: Xiao Liang Closes: https://lore.kernel.org/netdev/CABAhCORBVVU8P6AHcEkENMj+gD2d3ce9t=A_o48E0yOQp8_wUQ@mail.gmail.com/#t Signed-off-by: Kuniyuki Iwashima Tested-by: Xiao Liang Link: https://patch.msgid.link/20241216110432.51488-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/core/rtnetlink.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index ebcfc2debf1a3..d9f959c619d95 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3819,6 +3819,7 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, } static struct net *rtnl_get_peer_net(const struct rtnl_link_ops *ops, + struct nlattr *tbp[], struct nlattr *data[], struct netlink_ext_ack *extack) { @@ -3826,7 +3827,7 @@ static struct net *rtnl_get_peer_net(const struct rtnl_link_ops *ops, int err; if (!data || !data[ops->peer_type]) - return NULL; + return rtnl_link_get_net_ifla(tbp); err = rtnl_nla_parse_ifinfomsg(tb, data[ops->peer_type], extack); if (err < 0) @@ -3971,7 +3972,7 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, } if (ops->peer_type) { - peer_net = rtnl_get_peer_net(ops, data, extack); + peer_net = rtnl_get_peer_net(ops, tb, data, extack); if (IS_ERR(peer_net)) { ret = PTR_ERR(peer_net); goto put_ops; From fca2977629f49dee437e217c3fc423b6e0cad98c Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Mon, 7 Oct 2024 10:23:58 +0200 Subject: [PATCH 315/330] can: m_can: set init flag earlier in probe While an m_can controller usually already has the init flag from a hardware reset, no such reset happens on the integrated m_can_pci of the Intel Elkhart Lake. If the CAN controller is found in an active state, m_can_dev_setup() would fail because m_can_niso_supported() calls m_can_cccr_update_bits(), which refuses to modify any other configuration bits when CCCR_INIT is not set. To avoid this issue, set CCCR_INIT before attempting to modify any other configuration flags. Fixes: cd5a46ce6fa6 ("can: m_can: don't enable transceiver when probing") Signed-off-by: Matthias Schiffer Reviewed-by: Markus Schneider-Pargmann Link: https://patch.msgid.link/e247f331cb72829fcbdfda74f31a59cbad1a6006.1728288535.git.matthias.schiffer@ew.tq-group.com Signed-off-by: Marc Kleine-Budde --- drivers/net/can/m_can/m_can.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c index 533bcb77c9f93..67c404fbe1667 100644 --- a/drivers/net/can/m_can/m_can.c +++ b/drivers/net/can/m_can/m_can.c @@ -1695,6 +1695,14 @@ static int m_can_dev_setup(struct m_can_classdev *cdev) return -EINVAL; } + /* Write the INIT bit, in case no hardware reset has happened before + * the probe (for example, it was observed that the Intel Elkhart Lake + * SoCs do not properly reset the CAN controllers on reboot) + */ + err = m_can_cccr_update_bits(cdev, CCCR_INIT, CCCR_INIT); + if (err) + return err; + if (!cdev->is_peripheral) netif_napi_add(dev, &cdev->napi, m_can_poll); @@ -1746,11 +1754,7 @@ static int m_can_dev_setup(struct m_can_classdev *cdev) return -EINVAL; } - /* Forcing standby mode should be redundant, as the chip should be in - * standby after a reset. Write the INIT bit anyways, should the chip - * be configured by previous stage. - */ - return m_can_cccr_update_bits(cdev, CCCR_INIT, CCCR_INIT); + return 0; } static void m_can_stop(struct net_device *dev) From 743375f8deee360b0e902074bab99b0c9368d42f Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Mon, 7 Oct 2024 10:23:59 +0200 Subject: [PATCH 316/330] can: m_can: fix missed interrupts with m_can_pci The interrupt line of PCI devices is interpreted as edge-triggered, however the interrupt signal of the m_can controller integrated in Intel Elkhart Lake CPUs appears to be generated level-triggered. Consider the following sequence of events: - IR register is read, interrupt X is set - A new interrupt Y is triggered in the m_can controller - IR register is written to acknowledge interrupt X. Y remains set in IR As at no point in this sequence no interrupt flag is set in IR, the m_can interrupt line will never become deasserted, and no edge will ever be observed to trigger another run of the ISR. This was observed to result in the TX queue of the EHL m_can to get stuck under high load, because frames were queued to the hardware in m_can_start_xmit(), but m_can_finish_tx() was never run to account for their successful transmission. On an Elkhart Lake based board with the two CAN interfaces connected to each other, the following script can reproduce the issue: ip link set can0 up type can bitrate 1000000 ip link set can1 up type can bitrate 1000000 cangen can0 -g 2 -I 000 -L 8 & cangen can0 -g 2 -I 001 -L 8 & cangen can0 -g 2 -I 002 -L 8 & cangen can0 -g 2 -I 003 -L 8 & cangen can0 -g 2 -I 004 -L 8 & cangen can0 -g 2 -I 005 -L 8 & cangen can0 -g 2 -I 006 -L 8 & cangen can0 -g 2 -I 007 -L 8 & cangen can1 -g 2 -I 100 -L 8 & cangen can1 -g 2 -I 101 -L 8 & cangen can1 -g 2 -I 102 -L 8 & cangen can1 -g 2 -I 103 -L 8 & cangen can1 -g 2 -I 104 -L 8 & cangen can1 -g 2 -I 105 -L 8 & cangen can1 -g 2 -I 106 -L 8 & cangen can1 -g 2 -I 107 -L 8 & stress-ng --matrix 0 & To fix the issue, repeatedly read and acknowledge interrupts at the start of the ISR until no interrupt flags are set, so the next incoming interrupt will also result in an edge on the interrupt line. While we have received a report that even with this patch, the TX queue can become stuck under certain (currently unknown) circumstances on the Elkhart Lake, this patch completely fixes the issue with the above reproducer, and it is unclear whether the remaining issue has a similar cause at all. Fixes: cab7ffc0324f ("can: m_can: add PCI glue driver for Intel Elkhart Lake") Signed-off-by: Matthias Schiffer Reviewed-by: Markus Schneider-Pargmann Link: https://patch.msgid.link/fdf0439c51bcb3a46c21e9fb21c7f1d06363be84.1728288535.git.matthias.schiffer@ew.tq-group.com Signed-off-by: Marc Kleine-Budde --- drivers/net/can/m_can/m_can.c | 22 +++++++++++++++++----- drivers/net/can/m_can/m_can.h | 1 + drivers/net/can/m_can/m_can_pci.c | 1 + 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c index 67c404fbe1667..97cd8bbf2e32a 100644 --- a/drivers/net/can/m_can/m_can.c +++ b/drivers/net/can/m_can/m_can.c @@ -1220,20 +1220,32 @@ static void m_can_coalescing_update(struct m_can_classdev *cdev, u32 ir) static int m_can_interrupt_handler(struct m_can_classdev *cdev) { struct net_device *dev = cdev->net; - u32 ir; + u32 ir = 0, ir_read; int ret; if (pm_runtime_suspended(cdev->dev)) return IRQ_NONE; - ir = m_can_read(cdev, M_CAN_IR); + /* The m_can controller signals its interrupt status as a level, but + * depending in the integration the CPU may interpret the signal as + * edge-triggered (for example with m_can_pci). For these + * edge-triggered integrations, we must observe that IR is 0 at least + * once to be sure that the next interrupt will generate an edge. + */ + while ((ir_read = m_can_read(cdev, M_CAN_IR)) != 0) { + ir |= ir_read; + + /* ACK all irqs */ + m_can_write(cdev, M_CAN_IR, ir); + + if (!cdev->irq_edge_triggered) + break; + } + m_can_coalescing_update(cdev, ir); if (!ir) return IRQ_NONE; - /* ACK all irqs */ - m_can_write(cdev, M_CAN_IR, ir); - if (cdev->ops->clear_interrupts) cdev->ops->clear_interrupts(cdev); diff --git a/drivers/net/can/m_can/m_can.h b/drivers/net/can/m_can/m_can.h index 92b2bd8628e6b..ef39e8e527ab6 100644 --- a/drivers/net/can/m_can/m_can.h +++ b/drivers/net/can/m_can/m_can.h @@ -99,6 +99,7 @@ struct m_can_classdev { int pm_clock_support; int pm_wake_source; int is_peripheral; + bool irq_edge_triggered; // Cached M_CAN_IE register content u32 active_interrupts; diff --git a/drivers/net/can/m_can/m_can_pci.c b/drivers/net/can/m_can/m_can_pci.c index d72fe771dfc7a..9ad7419f88f83 100644 --- a/drivers/net/can/m_can/m_can_pci.c +++ b/drivers/net/can/m_can/m_can_pci.c @@ -127,6 +127,7 @@ static int m_can_pci_probe(struct pci_dev *pci, const struct pci_device_id *id) mcan_class->pm_clock_support = 1; mcan_class->pm_wake_source = 0; mcan_class->can.clock.freq = id->driver_data; + mcan_class->irq_edge_triggered = true; mcan_class->ops = &m_can_pci_ops; pci_set_drvdata(pci, mcan_class); From edc19bd0e571c732cd01c8da62f904e6d2a29a48 Mon Sep 17 00:00:00 2001 From: Fabrice Gasnier Date: Tue, 17 Dec 2024 16:00:21 +0100 Subject: [PATCH 317/330] pwm: stm32: Fix complementary output in round_waveform_tohw() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the timer supports complementary output, the CCxNE bit must be set additionally to the CCxE bit. So to not overwrite the latter use |= instead of = to set the former. Fixes: deaba9cff809 ("pwm: stm32: Implementation of the waveform callbacks") Signed-off-by: Fabrice Gasnier Link: https://lore.kernel.org/r/20241217150021.2030213-1-fabrice.gasnier@foss.st.com [ukleinek: Slightly improve commit log] Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-stm32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pwm/pwm-stm32.c b/drivers/pwm/pwm-stm32.c index b889e64522c3d..17e591f61efb6 100644 --- a/drivers/pwm/pwm-stm32.c +++ b/drivers/pwm/pwm-stm32.c @@ -84,7 +84,7 @@ static int stm32_pwm_round_waveform_tohw(struct pwm_chip *chip, wfhw->ccer = TIM_CCER_CCxE(ch + 1); if (priv->have_complementary_output) - wfhw->ccer = TIM_CCER_CCxNE(ch + 1); + wfhw->ccer |= TIM_CCER_CCxNE(ch + 1); rate = clk_get_rate(priv->clk); From 349f0086ba8b2a169877d21ff15a4d9da3a60054 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 18 Dec 2024 09:02:28 +0100 Subject: [PATCH 318/330] x86/static-call: fix 32-bit build In 32-bit x86 builds CONFIG_STATIC_CALL_INLINE isn't set, leading to static_call_initialized not being available. Define it as "0" in that case. Reported-by: Stephen Rothwell Fixes: 0ef8047b737d ("x86/static-call: provide a way to do very early static-call updates") Signed-off-by: Juergen Gross Acked-by: Peter Zijlstra (Intel) Signed-off-by: Linus Torvalds --- include/linux/static_call.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/include/linux/static_call.h b/include/linux/static_call.h index 785980af89729..78a77a4ae0ea8 100644 --- a/include/linux/static_call.h +++ b/include/linux/static_call.h @@ -138,7 +138,6 @@ #ifdef CONFIG_HAVE_STATIC_CALL #include -extern int static_call_initialized; /* * Either @site or @tramp can be NULL. */ @@ -161,6 +160,8 @@ extern void arch_static_call_transform(void *site, void *tramp, void *func, bool #ifdef CONFIG_HAVE_STATIC_CALL_INLINE +extern int static_call_initialized; + extern int __init static_call_init(void); extern void static_call_force_reinit(void); @@ -226,6 +227,8 @@ extern long __static_call_return0(void); #elif defined(CONFIG_HAVE_STATIC_CALL) +#define static_call_initialized 0 + static inline int static_call_init(void) { return 0; } #define DEFINE_STATIC_CALL(name, _func) \ @@ -282,6 +285,8 @@ extern long __static_call_return0(void); #else /* Generic implementation */ +#define static_call_initialized 0 + static inline int static_call_init(void) { return 0; } static inline long __static_call_return0(void) From cf2c97423a4f89c8b798294d3f34ecfe7e7035c3 Mon Sep 17 00:00:00 2001 From: David Laight Date: Sat, 14 Dec 2024 17:30:53 +0000 Subject: [PATCH 319/330] ipvs: Fix clamp() of ip_vs_conn_tab on small memory systems The 'max_avail' value is calculated from the system memory size using order_base_2(). order_base_2(x) is defined as '(x) ? fn(x) : 0'. The compiler generates two copies of the code that follows and then expands clamp(max, min, PAGE_SHIFT - 12) (11 on 32bit). This triggers a compile-time assert since min is 5. In reality a system would have to have less than 512MB memory for the bounds passed to clamp to be reversed. Swap the order of the arguments to clamp() to avoid the warning. Replace the clamp_val() on the line below with clamp(). clamp_val() is just 'an accident waiting to happen' and not needed here. Detected by compile time checks added to clamp(), specifically: minmax.h: use BUILD_BUG_ON_MSG() for the lo < hi test in clamp() Reported-by: Linux Kernel Functional Testing Closes: https://lore.kernel.org/all/CA+G9fYsT34UkGFKxus63H6UVpYi5GRZkezT9MRLfAbM3f6ke0g@mail.gmail.com/ Fixes: 4f325e26277b ("ipvs: dynamically limit the connection hash table") Tested-by: Bartosz Golaszewski Reviewed-by: Bartosz Golaszewski Signed-off-by: David Laight Acked-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_conn.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 98d7dbe3d7876..c0289f83f96df 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -1495,8 +1495,8 @@ int __init ip_vs_conn_init(void) max_avail -= 2; /* ~4 in hash row */ max_avail -= 1; /* IPVS up to 1/2 of mem */ max_avail -= order_base_2(sizeof(struct ip_vs_conn)); - max = clamp(max, min, max_avail); - ip_vs_conn_tab_bits = clamp_val(ip_vs_conn_tab_bits, min, max); + max = clamp(max_avail, min, max); + ip_vs_conn_tab_bits = clamp(ip_vs_conn_tab_bits, min, max); ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits; ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1; From 70b6f46a4ed8bd56c85ffff22df91e20e8c85e33 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Tue, 17 Dec 2024 20:56:55 +0100 Subject: [PATCH 320/330] netfilter: ipset: Fix for recursive locking warning With CONFIG_PROVE_LOCKING, when creating a set of type bitmap:ip, adding it to a set of type list:set and populating it from iptables SET target triggers a kernel warning: | WARNING: possible recursive locking detected | 6.12.0-rc7-01692-g5e9a28f41134-dirty #594 Not tainted | -------------------------------------------- | ping/4018 is trying to acquire lock: | ffff8881094a6848 (&set->lock){+.-.}-{2:2}, at: ip_set_add+0x28c/0x360 [ip_set] | | but task is already holding lock: | ffff88811034c048 (&set->lock){+.-.}-{2:2}, at: ip_set_add+0x28c/0x360 [ip_set] This is a false alarm: ipset does not allow nested list:set type, so the loop in list_set_kadd() can never encounter the outer set itself. No other set type supports embedded sets, so this is the only case to consider. To avoid the false report, create a distinct lock class for list:set type ipset locks. Fixes: f830837f0eed ("netfilter: ipset: list:set set type support") Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_list_set.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index bfae7066936bb..db794fe1300e6 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -611,6 +611,8 @@ init_list_set(struct net *net, struct ip_set *set, u32 size) return true; } +static struct lock_class_key list_set_lockdep_key; + static int list_set_create(struct net *net, struct ip_set *set, struct nlattr *tb[], u32 flags) @@ -627,6 +629,7 @@ list_set_create(struct net *net, struct ip_set *set, struct nlattr *tb[], if (size < IP_SET_LIST_MIN_SIZE) size = IP_SET_LIST_MIN_SIZE; + lockdep_set_class(&set->lock, &list_set_lockdep_key); set->variant = &set_variant; set->dsize = ip_set_elem_len(set, tb, sizeof(struct set_elem), __alignof__(struct set_elem)); From cff865c700711ecc3824b2dfe181637f3ed23c80 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 17 Dec 2024 09:10:34 +0100 Subject: [PATCH 321/330] net: phy: avoid undefined behavior in *_led_polarity_set() gcc runs into undefined behavior at the end of the three led_polarity_set() callback functions if it were called with a zero 'modes' argument and it just ends the function there without returning from it. This gets flagged by 'objtool' as a function that continues on to the next one: drivers/net/phy/aquantia/aquantia_leds.o: warning: objtool: aqr_phy_led_polarity_set+0xf: can't find jump dest instruction at .text+0x5d9 drivers/net/phy/intel-xway.o: warning: objtool: xway_gphy_led_polarity_set() falls through to next function xway_gphy_config_init() drivers/net/phy/mxl-gpy.o: warning: objtool: gpy_led_polarity_set() falls through to next function gpy_led_hw_control_get() There is no point to micro-optimize the behavior here to save a single-digit number of bytes in the kernel, so just change this to a "return -EINVAL" as we do when any unexpected bits are set. Fixes: 1758af47b98c ("net: phy: intel-xway: add support for PHY LEDs") Fixes: 9d55e68b19f2 ("net: phy: aquantia: correctly describe LED polarity override") Fixes: eb89c79c1b8f ("net: phy: mxl-gpy: correctly describe LED polarity") Signed-off-by: Arnd Bergmann Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20241217081056.238792-1-arnd@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/phy/aquantia/aquantia_leds.c | 2 +- drivers/net/phy/intel-xway.c | 2 +- drivers/net/phy/mxl-gpy.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/phy/aquantia/aquantia_leds.c b/drivers/net/phy/aquantia/aquantia_leds.c index 00ad2313fed36..951f46104eff0 100644 --- a/drivers/net/phy/aquantia/aquantia_leds.c +++ b/drivers/net/phy/aquantia/aquantia_leds.c @@ -156,5 +156,5 @@ int aqr_phy_led_polarity_set(struct phy_device *phydev, int index, unsigned long if (force_active_high || force_active_low) return aqr_phy_led_active_low_set(phydev, index, force_active_low); - unreachable(); + return -EINVAL; } diff --git a/drivers/net/phy/intel-xway.c b/drivers/net/phy/intel-xway.c index b672c55a7a4e7..e6ed2413e514c 100644 --- a/drivers/net/phy/intel-xway.c +++ b/drivers/net/phy/intel-xway.c @@ -529,7 +529,7 @@ static int xway_gphy_led_polarity_set(struct phy_device *phydev, int index, if (force_active_high) return phy_clear_bits(phydev, XWAY_MDIO_LED, XWAY_GPHY_LED_INV(index)); - unreachable(); + return -EINVAL; } static struct phy_driver xway_gphy[] = { diff --git a/drivers/net/phy/mxl-gpy.c b/drivers/net/phy/mxl-gpy.c index db3c1f72b4073..a8ccf257c1094 100644 --- a/drivers/net/phy/mxl-gpy.c +++ b/drivers/net/phy/mxl-gpy.c @@ -1014,7 +1014,7 @@ static int gpy_led_polarity_set(struct phy_device *phydev, int index, if (force_active_high) return phy_clear_bits(phydev, PHY_LED, PHY_LED_POLARITY(index)); - unreachable(); + return -EINVAL; } static struct phy_driver gpy_drivers[] = { From 5c964c8a97c12145104f5d2782aa1ffccf3a93dd Mon Sep 17 00:00:00 2001 From: Martin Hou Date: Mon, 16 Dec 2024 11:06:18 +0800 Subject: [PATCH 322/330] net: usb: qmi_wwan: add Quectel RG255C Add support for Quectel RG255C which is based on Qualcomm SDX35 chip. The composition is DM / NMEA / AT / QMI. T: Bus=01 Lev=01 Prnt=01 Port=04 Cnt=01 Dev#= 2 Spd=480 MxCh= 0 D: Ver= 2.01 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=2c7c ProdID=0316 Rev= 5.15 S: Manufacturer=Quectel S: Product=RG255C-CN S: SerialNumber=c68192c1 C:* #Ifs= 4 Cfg#= 1 Atr=a0 MxPwr=500mA I:* If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=84(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=50 Driver=qmi_wwan E: Ad=86(I) Atr=03(Int.) MxPS= 8 Ivl=32ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms Signed-off-by: Martin Hou Link: https://patch.msgid.link/tencent_17DDD787B48E8A5AB8379ED69E23A0CD9309@qq.com Signed-off-by: Jakub Kicinski --- drivers/net/usb/qmi_wwan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 9fe7f704a2f7b..e9208a8d2bfa6 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1429,6 +1429,7 @@ static const struct usb_device_id products[] = { {QMI_QUIRK_SET_DTR(0x2c7c, 0x0195, 4)}, /* Quectel EG95 */ {QMI_FIXED_INTF(0x2c7c, 0x0296, 4)}, /* Quectel BG96 */ {QMI_QUIRK_SET_DTR(0x2c7c, 0x030e, 4)}, /* Quectel EM05GV2 */ + {QMI_QUIRK_SET_DTR(0x2c7c, 0x0316, 3)}, /* Quectel RG255C */ {QMI_QUIRK_SET_DTR(0x2cb7, 0x0104, 4)}, /* Fibocom NL678 series */ {QMI_QUIRK_SET_DTR(0x2cb7, 0x0112, 0)}, /* Fibocom FG132 */ {QMI_FIXED_INTF(0x0489, 0xe0b4, 0)}, /* Foxconn T77W968 LTE */ From a17975992cc11588767175247ccaae1213a8b582 Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Tue, 17 Dec 2024 22:16:51 +0100 Subject: [PATCH 323/330] selftests: openvswitch: fix tcpdump execution Fix the way tcpdump is executed by: - Using the right variable for the namespace. Currently the use of the empty "ns" makes the command fail. - Waiting until it starts to capture to ensure the interesting traffic is caught on slow systems. - Using line-buffered output to ensure logs are available when the test is paused with "-p". Otherwise the last chunk of data might only be written when tcpdump is killed. Fixes: 74cc26f416b9 ("selftests: openvswitch: add interface support") Signed-off-by: Adrian Moreno Acked-by: Eelco Chaudron Link: https://patch.msgid.link/20241217211652.483016-1-amorenoz@redhat.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/openvswitch/openvswitch.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/openvswitch/openvswitch.sh b/tools/testing/selftests/net/openvswitch/openvswitch.sh index cc0bfae2bafa1..960e1ab4dd04b 100755 --- a/tools/testing/selftests/net/openvswitch/openvswitch.sh +++ b/tools/testing/selftests/net/openvswitch/openvswitch.sh @@ -171,8 +171,10 @@ ovs_add_netns_and_veths () { ovs_add_if "$1" "$2" "$4" -u || return 1 fi - [ $TRACING -eq 1 ] && ovs_netns_spawn_daemon "$1" "$ns" \ - tcpdump -i any -s 65535 + if [ $TRACING -eq 1 ]; then + ovs_netns_spawn_daemon "$1" "$3" tcpdump -l -i any -s 6553 + ovs_wait grep -q "listening on any" ${ovs_dir}/stderr + fi return 0 } From 16f027cd40eeedd2325f7e720689462ca8d9d13e Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 16 Dec 2024 15:50:59 +0200 Subject: [PATCH 324/330] net: dsa: restore dsa_software_vlan_untag() ability to operate on VLAN-untagged traffic Robert Hodaszi reports that locally terminated traffic towards VLAN-unaware bridge ports is broken with ocelot-8021q. He is describing the same symptoms as for commit 1f9fc48fd302 ("net: dsa: sja1105: fix reception from VLAN-unaware bridges"). For context, the set merged as "VLAN fixes for Ocelot driver": https://lore.kernel.org/netdev/20240815000707.2006121-1-vladimir.oltean@nxp.com/ was developed in a slightly different form earlier this year, in January. Initially, the switch was unconditionally configured to set OCELOT_ES0_TAG when using ocelot-8021q, regardless of port operating mode. This led to the situation where VLAN-unaware bridge ports would always push their PVID - see ocelot_vlan_unaware_pvid() - a negligible value anyway - into RX packets. To strip this in software, we would have needed DSA to know what private VID the switch chose for VLAN-unaware bridge ports, and pushed into the packets. This was implemented downstream, and a remnant of it remains in the form of a comment mentioning ds->ops->get_private_vid(), as something which would maybe need to be considered in the future. However, for upstream, it was deemed inappropriate, because it would mean introducing yet another behavior for stripping VLAN tags from VLAN-unaware bridge ports, when one already existed (ds->untag_bridge_pvid). The latter has been marked as obsolete along with an explanation why it is logically broken, but still, it would have been confusing. So, for upstream, felix_update_tag_8021q_rx_rule() was developed, which essentially changed the state of affairs from "Felix with ocelot-8021q delivers all packets as VLAN-tagged towards the CPU" into "Felix with ocelot-8021q delivers all packets from VLAN-aware bridge ports towards the CPU". This was done on the premise that in VLAN-unaware mode, there's nothing useful in the VLAN tags, and we can avoid introducing ds->ops->get_private_vid() in the DSA receive path if we configure the switch to not push those VLAN tags into packets in the first place. Unfortunately, and this is when the trainwreck started, the selftests developed initially and posted with the series were not re-ran. dsa_software_vlan_untag() was initially written given the assumption that users of this feature would send _all_ traffic as VLAN-tagged. It was only partially adapted to the new scheme, by removing ds->ops->get_private_vid(), which also used to be necessary in standalone ports mode. Where the trainwreck became even worse is that I had a second opportunity to think about this, when the dsa_software_vlan_untag() logic change initially broke sja1105, in commit 1f9fc48fd302 ("net: dsa: sja1105: fix reception from VLAN-unaware bridges"). I did not connect the dots that it also breaks ocelot-8021q, for pretty much the same reason that not all received packets will be VLAN-tagged. To be compatible with the optimized Felix control path which runs felix_update_tag_8021q_rx_rule() to only push VLAN tags when useful (in VLAN-aware mode), we need to restore the old dsa_software_vlan_untag() logic. The blamed commit introduced the assumption that dsa_software_vlan_untag() will see only VLAN-tagged packets, assumption which is false. What corrupts RX traffic is the fact that we call skb_vlan_untag() on packets which are not VLAN-tagged in the first place. Fixes: 93e4649efa96 ("net: dsa: provide a software untagging function on RX for VLAN-aware bridges") Reported-by: Robert Hodaszi Closes: https://lore.kernel.org/netdev/20241215163334.615427-1-robert.hodaszi@digi.com/ Signed-off-by: Vladimir Oltean Link: https://patch.msgid.link/20241216135059.1258266-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- net/dsa/tag.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/net/dsa/tag.h b/net/dsa/tag.h index d5707870906bc..5d80ddad4ff6b 100644 --- a/net/dsa/tag.h +++ b/net/dsa/tag.h @@ -138,9 +138,10 @@ static inline void dsa_software_untag_vlan_unaware_bridge(struct sk_buff *skb, * dsa_software_vlan_untag: Software VLAN untagging in DSA receive path * @skb: Pointer to socket buffer (packet) * - * Receive path method for switches which cannot avoid tagging all packets - * towards the CPU port. Called when ds->untag_bridge_pvid (legacy) or - * ds->untag_vlan_aware_bridge_pvid is set to true. + * Receive path method for switches which send some packets as VLAN-tagged + * towards the CPU port (generally from VLAN-aware bridge ports) even when the + * packet was not tagged on the wire. Called when ds->untag_bridge_pvid + * (legacy) or ds->untag_vlan_aware_bridge_pvid is set to true. * * As a side effect of this method, any VLAN tag from the skb head is moved * to hwaccel. @@ -149,14 +150,19 @@ static inline struct sk_buff *dsa_software_vlan_untag(struct sk_buff *skb) { struct dsa_port *dp = dsa_user_to_port(skb->dev); struct net_device *br = dsa_port_bridge_dev_get(dp); - u16 vid; + u16 vid, proto; + int err; /* software untagging for standalone ports not yet necessary */ if (!br) return skb; + err = br_vlan_get_proto(br, &proto); + if (err) + return skb; + /* Move VLAN tag from data to hwaccel */ - if (!skb_vlan_tag_present(skb)) { + if (!skb_vlan_tag_present(skb) && skb->protocol == htons(proto)) { skb = skb_vlan_untag(skb); if (!skb) return NULL; From 5eb70dbebf32c2fd1f2814c654ae17fc47d6e859 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 17 Dec 2024 18:25:08 -0800 Subject: [PATCH 325/330] netdev-genl: avoid empty messages in queue dump Empty netlink responses from do() are not correct (as opposed to dump() where not dumping anything is perfectly fine). We should return an error if the target object does not exist, in this case if the netdev is down it has no queues. Fixes: 6b6171db7fc8 ("netdev-genl: Add netlink framework functions for queue") Reported-by: syzbot+0a884bc2d304ce4af70f@syzkaller.appspotmail.com Reviewed-by: Eric Dumazet Reviewed-by: Joe Damato Link: https://patch.msgid.link/20241218022508.815344-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/netdev-genl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 1be8c7c21d19d..2d3ae0cd3ad25 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -430,10 +430,10 @@ static int netdev_nl_queue_fill(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, u32 q_type, const struct genl_info *info) { - int err = 0; + int err; if (!(netdev->flags & IFF_UP)) - return err; + return -ENOENT; err = netdev_nl_queue_validate(netdev, q_idx, q_type); if (err) From 5eecd85c77a254a43bde3212da8047b001745c9f Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Tue, 17 Dec 2024 12:37:39 +0100 Subject: [PATCH 326/330] psample: adjust size if rate_as_probability is set If PSAMPLE_ATTR_SAMPLE_PROBABILITY flag is to be sent, the available size for the packet data has to be adjusted accordingly. Also, check the error code returned by nla_put_flag. Fixes: 7b1b2b60c63f ("net: psample: allow using rate as probability") Signed-off-by: Adrian Moreno Reviewed-by: Aaron Conole Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20241217113739.3929300-1-amorenoz@redhat.com Signed-off-by: Jakub Kicinski --- net/psample/psample.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/psample/psample.c b/net/psample/psample.c index a0ddae8a65f91..25f92ba0840c6 100644 --- a/net/psample/psample.c +++ b/net/psample/psample.c @@ -393,7 +393,9 @@ void psample_sample_packet(struct psample_group *group, nla_total_size_64bit(sizeof(u64)) + /* timestamp */ nla_total_size(sizeof(u16)) + /* protocol */ (md->user_cookie_len ? - nla_total_size(md->user_cookie_len) : 0); /* user cookie */ + nla_total_size(md->user_cookie_len) : 0) + /* user cookie */ + (md->rate_as_probability ? + nla_total_size(0) : 0); /* rate as probability */ #ifdef CONFIG_INET tun_info = skb_tunnel_info(skb); @@ -498,8 +500,9 @@ void psample_sample_packet(struct psample_group *group, md->user_cookie)) goto error; - if (md->rate_as_probability) - nla_put_flag(nl_skb, PSAMPLE_ATTR_SAMPLE_PROBABILITY); + if (md->rate_as_probability && + nla_put_flag(nl_skb, PSAMPLE_ATTR_SAMPLE_PROBABILITY)) + goto error; genlmsg_end(nl_skb, data); genlmsg_multicast_netns(&psample_nl_family, group->net, nl_skb, 0, From 51df947678360faf1967fe0bd1a40c681f634104 Mon Sep 17 00:00:00 2001 From: Harshit Mogalapalli Date: Mon, 16 Dec 2024 21:23:24 -0800 Subject: [PATCH 327/330] octeontx2-pf: fix netdev memory leak in rvu_rep_create() When rvu_rep_devlink_port_register() fails, free_netdev(ndev) for this incomplete iteration before going to "exit:" label. Fixes: 9ed0343f561e ("octeontx2-pf: Add devlink port support") Reviewed-by: Przemek Kitszel Signed-off-by: Harshit Mogalapalli Link: https://patch.msgid.link/20241217052326.1086191-1-harshit.m.mogalapalli@oracle.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeontx2/nic/rep.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/rep.c b/drivers/net/ethernet/marvell/octeontx2/nic/rep.c index 232b10740c130..9e3fcbae5dee7 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/rep.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/rep.c @@ -680,8 +680,10 @@ int rvu_rep_create(struct otx2_nic *priv, struct netlink_ext_ack *extack) ndev->features |= ndev->hw_features; eth_hw_addr_random(ndev); err = rvu_rep_devlink_port_register(rep); - if (err) + if (err) { + free_netdev(ndev); goto exit; + } SET_NETDEV_DEVLINK_PORT(ndev, &rep->dl_port); err = register_netdev(ndev); From b95c8c33ae687fcd3007cefa93907a6bd270119b Mon Sep 17 00:00:00 2001 From: Harshit Mogalapalli Date: Mon, 16 Dec 2024 21:23:25 -0800 Subject: [PATCH 328/330] octeontx2-pf: fix error handling of devlink port in rvu_rep_create() Unregister the devlink port when register_netdev() fails. Fixes: 9ed0343f561e ("octeontx2-pf: Add devlink port support") Reviewed-by: Przemek Kitszel Signed-off-by: Harshit Mogalapalli Link: https://patch.msgid.link/20241217052326.1086191-2-harshit.m.mogalapalli@oracle.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeontx2/nic/rep.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/rep.c b/drivers/net/ethernet/marvell/octeontx2/nic/rep.c index 9e3fcbae5dee7..04e08e06f30ff 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/rep.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/rep.c @@ -690,6 +690,7 @@ int rvu_rep_create(struct otx2_nic *priv, struct netlink_ext_ack *extack) if (err) { NL_SET_ERR_MSG_MOD(extack, "PFVF representor registration failed"); + rvu_rep_devlink_port_unregister(rep); free_netdev(ndev); goto exit; } From 572af9f284669d31d9175122bbef9bc62cea8ded Mon Sep 17 00:00:00 2001 From: Joe Hattori Date: Wed, 18 Dec 2024 12:51:06 +0900 Subject: [PATCH 329/330] net: mdiobus: fix an OF node reference leak fwnode_find_mii_timestamper() calls of_parse_phandle_with_fixed_args() but does not decrement the refcount of the obtained OF node. Add an of_node_put() call before returning from the function. This bug was detected by an experimental static analysis tool that I am developing. Fixes: bc1bee3b87ee ("net: mdiobus: Introduce fwnode_mdiobus_register_phy()") Signed-off-by: Joe Hattori Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20241218035106.1436405-1-joe@pf.is.s.u-tokyo.ac.jp Signed-off-by: Paolo Abeni --- drivers/net/mdio/fwnode_mdio.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/net/mdio/fwnode_mdio.c b/drivers/net/mdio/fwnode_mdio.c index b156493d70841..aea0f03575689 100644 --- a/drivers/net/mdio/fwnode_mdio.c +++ b/drivers/net/mdio/fwnode_mdio.c @@ -40,6 +40,7 @@ fwnode_find_pse_control(struct fwnode_handle *fwnode) static struct mii_timestamper * fwnode_find_mii_timestamper(struct fwnode_handle *fwnode) { + struct mii_timestamper *mii_ts; struct of_phandle_args arg; int err; @@ -53,10 +54,16 @@ fwnode_find_mii_timestamper(struct fwnode_handle *fwnode) else if (err) return ERR_PTR(err); - if (arg.args_count != 1) - return ERR_PTR(-EINVAL); + if (arg.args_count != 1) { + mii_ts = ERR_PTR(-EINVAL); + goto put_node; + } + + mii_ts = register_mii_timestamper(arg.np, arg.args[0]); - return register_mii_timestamper(arg.np, arg.args[0]); +put_node: + of_node_put(arg.np); + return mii_ts; } int fwnode_mdiobus_phy_device_register(struct mii_bus *mdio, From ce1219c3f76bb131d095e90521506d3c6ccfa086 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Wed, 18 Dec 2024 11:53:01 +0800 Subject: [PATCH 330/330] net: mctp: handle skb cleanup on sock_queue failures Currently, we don't use the return value from sock_queue_rcv_skb, which means we may leak skbs if a message is not successfully queued to a socket. Instead, ensure that we're freeing the skb where the sock hasn't otherwise taken ownership of the skb by adding checks on the sock_queue_rcv_skb() to invoke a kfree on failure. In doing so, rather than using the 'rc' value to trigger the kfree_skb(), use the skb pointer itself, which is more explicit. Also, add a kunit test for the sock delivery failure cases. Fixes: 4a992bbd3650 ("mctp: Implement message fragmentation & reassembly") Cc: stable@vger.kernel.org Signed-off-by: Jeremy Kerr Link: https://patch.msgid.link/20241218-mctp-next-v2-1-1c1729645eaa@codeconstruct.com.au Signed-off-by: Paolo Abeni --- net/mctp/route.c | 36 +++++++++++----- net/mctp/test/route-test.c | 86 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 112 insertions(+), 10 deletions(-) diff --git a/net/mctp/route.c b/net/mctp/route.c index 597e9cf5aa644..3f2bd65ff5e3c 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -374,8 +374,13 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) msk = NULL; rc = -EINVAL; - /* we may be receiving a locally-routed packet; drop source sk - * accounting + /* We may be receiving a locally-routed packet; drop source sk + * accounting. + * + * From here, we will either queue the skb - either to a frag_queue, or + * to a receiving socket. When that succeeds, we clear the skb pointer; + * a non-NULL skb on exit will be otherwise unowned, and hence + * kfree_skb()-ed. */ skb_orphan(skb); @@ -434,7 +439,9 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) * pending key. */ if (flags & MCTP_HDR_FLAG_EOM) { - sock_queue_rcv_skb(&msk->sk, skb); + rc = sock_queue_rcv_skb(&msk->sk, skb); + if (!rc) + skb = NULL; if (key) { /* we've hit a pending reassembly; not much we * can do but drop it @@ -443,7 +450,6 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) MCTP_TRACE_KEY_REPLIED); key = NULL; } - rc = 0; goto out_unlock; } @@ -470,8 +476,10 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) * this function. */ rc = mctp_key_add(key, msk); - if (!rc) + if (!rc) { trace_mctp_key_acquire(key); + skb = NULL; + } /* we don't need to release key->lock on exit, so * clean up here and suppress the unlock via @@ -489,6 +497,8 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) key = NULL; } else { rc = mctp_frag_queue(key, skb); + if (!rc) + skb = NULL; } } @@ -503,12 +513,19 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) else rc = mctp_frag_queue(key, skb); + if (rc) + goto out_unlock; + + /* we've queued; the queue owns the skb now */ + skb = NULL; + /* end of message? deliver to socket, and we're done with * the reassembly/response key */ - if (!rc && flags & MCTP_HDR_FLAG_EOM) { - sock_queue_rcv_skb(key->sk, key->reasm_head); - key->reasm_head = NULL; + if (flags & MCTP_HDR_FLAG_EOM) { + rc = sock_queue_rcv_skb(key->sk, key->reasm_head); + if (!rc) + key->reasm_head = NULL; __mctp_key_done_in(key, net, f, MCTP_TRACE_KEY_REPLIED); key = NULL; } @@ -527,8 +544,7 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) if (any_key) mctp_key_unref(any_key); out: - if (rc) - kfree_skb(skb); + kfree_skb(skb); return rc; } diff --git a/net/mctp/test/route-test.c b/net/mctp/test/route-test.c index 8551dab1d1e69..17165b86ce22d 100644 --- a/net/mctp/test/route-test.c +++ b/net/mctp/test/route-test.c @@ -837,6 +837,90 @@ static void mctp_test_route_input_multiple_nets_key(struct kunit *test) mctp_test_route_input_multiple_nets_key_fini(test, &t2); } +/* Input route to socket, using a single-packet message, where sock delivery + * fails. Ensure we're handling the failure appropriately. + */ +static void mctp_test_route_input_sk_fail_single(struct kunit *test) +{ + const struct mctp_hdr hdr = RX_HDR(1, 10, 8, FL_S | FL_E | FL_TO); + struct mctp_test_route *rt; + struct mctp_test_dev *dev; + struct socket *sock; + struct sk_buff *skb; + int rc; + + __mctp_route_test_init(test, &dev, &rt, &sock, MCTP_NET_ANY); + + /* No rcvbuf space, so delivery should fail. __sock_set_rcvbuf will + * clamp the minimum to SOCK_MIN_RCVBUF, so we open-code this. + */ + lock_sock(sock->sk); + WRITE_ONCE(sock->sk->sk_rcvbuf, 0); + release_sock(sock->sk); + + skb = mctp_test_create_skb(&hdr, 10); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, skb); + skb_get(skb); + + mctp_test_skb_set_dev(skb, dev); + + /* do route input, which should fail */ + rc = mctp_route_input(&rt->rt, skb); + KUNIT_EXPECT_NE(test, rc, 0); + + /* we should hold the only reference to skb */ + KUNIT_EXPECT_EQ(test, refcount_read(&skb->users), 1); + kfree_skb(skb); + + __mctp_route_test_fini(test, dev, rt, sock); +} + +/* Input route to socket, using a fragmented message, where sock delivery fails. + */ +static void mctp_test_route_input_sk_fail_frag(struct kunit *test) +{ + const struct mctp_hdr hdrs[2] = { RX_FRAG(FL_S, 0), RX_FRAG(FL_E, 1) }; + struct mctp_test_route *rt; + struct mctp_test_dev *dev; + struct sk_buff *skbs[2]; + struct socket *sock; + unsigned int i; + int rc; + + __mctp_route_test_init(test, &dev, &rt, &sock, MCTP_NET_ANY); + + lock_sock(sock->sk); + WRITE_ONCE(sock->sk->sk_rcvbuf, 0); + release_sock(sock->sk); + + for (i = 0; i < ARRAY_SIZE(skbs); i++) { + skbs[i] = mctp_test_create_skb(&hdrs[i], 10); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, skbs[i]); + skb_get(skbs[i]); + + mctp_test_skb_set_dev(skbs[i], dev); + } + + /* first route input should succeed, we're only queueing to the + * frag list + */ + rc = mctp_route_input(&rt->rt, skbs[0]); + KUNIT_EXPECT_EQ(test, rc, 0); + + /* final route input should fail to deliver to the socket */ + rc = mctp_route_input(&rt->rt, skbs[1]); + KUNIT_EXPECT_NE(test, rc, 0); + + /* we should hold the only reference to both skbs */ + KUNIT_EXPECT_EQ(test, refcount_read(&skbs[0]->users), 1); + kfree_skb(skbs[0]); + + KUNIT_EXPECT_EQ(test, refcount_read(&skbs[1]->users), 1); + kfree_skb(skbs[1]); + + __mctp_route_test_fini(test, dev, rt, sock); +} + #if IS_ENABLED(CONFIG_MCTP_FLOWS) static void mctp_test_flow_init(struct kunit *test, @@ -1053,6 +1137,8 @@ static struct kunit_case mctp_test_cases[] = { mctp_route_input_sk_reasm_gen_params), KUNIT_CASE_PARAM(mctp_test_route_input_sk_keys, mctp_route_input_sk_keys_gen_params), + KUNIT_CASE(mctp_test_route_input_sk_fail_single), + KUNIT_CASE(mctp_test_route_input_sk_fail_frag), KUNIT_CASE(mctp_test_route_input_multiple_nets_bind), KUNIT_CASE(mctp_test_route_input_multiple_nets_key), KUNIT_CASE(mctp_test_packet_flow),