From ef01cac401f18647d62720cf773d7bb0541827da Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 1 Apr 2025 08:05:04 -0700 Subject: [PATCH 1/5] KVM: x86: Acquire SRCU in KVM_GET_MP_STATE to protect guest memory accesses Acquire a lock on kvm->srcu when userspace is getting MP state to handle a rather extreme edge case where "accepting" APIC events, i.e. processing pending INIT or SIPI, can trigger accesses to guest memory. If the vCPU is in L2 with INIT *and* a TRIPLE_FAULT request pending, then getting MP state will trigger a nested VM-Exit by way of ->check_nested_events(), and emuating the nested VM-Exit can access guest memory. The splat was originally hit by syzkaller on a Google-internal kernel, and reproduced on an upstream kernel by hacking the triple_fault_event_test selftest to stuff a pending INIT, store an MSR on VM-Exit (to generate a memory access on VMX), and do vcpu_mp_state_get() to trigger the scenario. ============================= WARNING: suspicious RCU usage 6.14.0-rc3-b112d356288b-vmx/pi_lockdep_false_pos-lock #3 Not tainted ----------------------------- include/linux/kvm_host.h:1058 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 1 lock held by triple_fault_ev/1256: #0: ffff88810df5a330 (&vcpu->mutex){+.+.}-{4:4}, at: kvm_vcpu_ioctl+0x8b/0x9a0 [kvm] stack backtrace: CPU: 11 UID: 1000 PID: 1256 Comm: triple_fault_ev Not tainted 6.14.0-rc3-b112d356288b-vmx #3 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 Call Trace: dump_stack_lvl+0x7f/0x90 lockdep_rcu_suspicious+0x144/0x190 kvm_vcpu_gfn_to_memslot+0x156/0x180 [kvm] kvm_vcpu_read_guest+0x3e/0x90 [kvm] read_and_check_msr_entry+0x2e/0x180 [kvm_intel] __nested_vmx_vmexit+0x550/0xde0 [kvm_intel] kvm_check_nested_events+0x1b/0x30 [kvm] kvm_apic_accept_events+0x33/0x100 [kvm] kvm_arch_vcpu_ioctl_get_mpstate+0x30/0x1d0 [kvm] kvm_vcpu_ioctl+0x33e/0x9a0 [kvm] __x64_sys_ioctl+0x8b/0xb0 do_syscall_64+0x6c/0x170 entry_SYSCALL_64_after_hwframe+0x4b/0x53 Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-ID: <20250401150504.829812-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c841817a914a..3712dde0bf9d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11786,6 +11786,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, if (kvm_mpx_supported()) kvm_load_guest_fpu(vcpu); + kvm_vcpu_srcu_read_lock(vcpu); + r = kvm_apic_accept_events(vcpu); if (r < 0) goto out; @@ -11799,6 +11801,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, mp_state->mp_state = vcpu->arch.mp_state; out: + kvm_vcpu_srcu_read_unlock(vcpu); + if (kvm_mpx_supported()) kvm_put_guest_fpu(vcpu); vcpu_put(vcpu); From 0297cdc12a87629ad904ac8c0630f7702f9a2d48 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 1 Apr 2025 07:22:38 -0700 Subject: [PATCH 2/5] KVM: selftests: Add option to rseq test to override /dev/cpu_dma_latency Add a "-l " param to the rseq test so that the user can override /dev/cpu_dma_latency, as described by the test's suggested workaround for not being able to complete enough migrations. cpu_dma_latency is not a normal file, even as far as procfs files go. Writes to cpu_dma_latency only persist so long as the file is open, e.g. so that the kernel automatically reverts back to a power-optimized state once the sensitive workload completes. Provide the necessary functionality instead of effectively forcing the user to write a non-obvious wrapper. Cc: Dongsheng Zhang Cc: Zide Chen Signed-off-by: Sean Christopherson Message-ID: <20250401142238.819487-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/rseq_test.c | 31 ++++++++++++++++++++----- 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/kvm/rseq_test.c b/tools/testing/selftests/kvm/rseq_test.c index e5898678bfab..1375fca80bcd 100644 --- a/tools/testing/selftests/kvm/rseq_test.c +++ b/tools/testing/selftests/kvm/rseq_test.c @@ -196,25 +196,27 @@ static void calc_min_max_cpu(void) static void help(const char *name) { puts(""); - printf("usage: %s [-h] [-u]\n", name); + printf("usage: %s [-h] [-u] [-l latency]\n", name); printf(" -u: Don't sanity check the number of successful KVM_RUNs\n"); + printf(" -l: Set /dev/cpu_dma_latency to suppress deep sleep states\n"); puts(""); exit(0); } int main(int argc, char *argv[]) { + int r, i, snapshot, opt, fd = -1, latency = -1; bool skip_sanity_check = false; - int r, i, snapshot; struct kvm_vm *vm; struct kvm_vcpu *vcpu; u32 cpu, rseq_cpu; - int opt; - while ((opt = getopt(argc, argv, "hu")) != -1) { + while ((opt = getopt(argc, argv, "hl:u")) != -1) { switch (opt) { case 'u': skip_sanity_check = true; + case 'l': + latency = atoi_paranoid(optarg); break; case 'h': default: @@ -243,6 +245,20 @@ int main(int argc, char *argv[]) pthread_create(&migration_thread, NULL, migration_worker, (void *)(unsigned long)syscall(SYS_gettid)); + if (latency >= 0) { + /* + * Writes to cpu_dma_latency persist only while the file is + * open, i.e. it allows userspace to provide guaranteed latency + * while running a workload. Keep the file open until the test + * completes, otherwise writing cpu_dma_latency is meaningless. + */ + fd = open("/dev/cpu_dma_latency", O_RDWR); + TEST_ASSERT(fd >= 0, __KVM_SYSCALL_ERROR("open() /dev/cpu_dma_latency", fd)); + + r = write(fd, &latency, 4); + TEST_ASSERT(r >= 1, "Error setting /dev/cpu_dma_latency"); + } + for (i = 0; !done; i++) { vcpu_run(vcpu); TEST_ASSERT(get_ucall(vcpu, NULL) == UCALL_SYNC, @@ -278,6 +294,9 @@ int main(int argc, char *argv[]) "rseq CPU = %d, sched CPU = %d", rseq_cpu, cpu); } + if (fd > 0) + close(fd); + /* * Sanity check that the test was able to enter the guest a reasonable * number of times, e.g. didn't get stalled too often/long waiting for @@ -293,8 +312,8 @@ int main(int argc, char *argv[]) TEST_ASSERT(skip_sanity_check || i > (NR_TASK_MIGRATIONS / 2), "Only performed %d KVM_RUNs, task stalled too much?\n\n" " Try disabling deep sleep states to reduce CPU wakeup latency,\n" - " e.g. via cpuidle.off=1 or setting /dev/cpu_dma_latency to '0',\n" - " or run with -u to disable this sanity check.", i); + " e.g. via cpuidle.off=1 or via -l , or run with -u to\n" + " disable this sanity check.", i); pthread_join(migration_thread, NULL); From 81d480fdf8b7d9b13fe87e1f0516f89794708094 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 14 Mar 2025 19:34:48 -0700 Subject: [PATCH 3/5] KVM: x86/mmu: Wrap sanity check on number of TDP MMU pages with KVM_PROVE_MMU Wrap the TDP MMU page counter in CONFIG_KVM_PROVE_MMU so that the sanity check is omitted from production builds, and more importantly to remove the atomic accesses to account pages. A one-off memory leak in production is relatively uninteresting, and a WARN_ON won't help mitigate a systemic issue; it's as much about helping triage memory leaks as it is about detecting them in the first place, and doesn't magically stop the leaks. I.e. production environments will be quite sad if a severe KVM bug escapes, regardless of whether or not KVM WARNs. Signed-off-by: Sean Christopherson Message-ID: <20250315023448.2358456-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 7 ++++++- arch/x86/kvm/mmu/tdp_mmu.c | 8 +++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a884ab544335..3bdae454a959 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1472,8 +1472,13 @@ struct kvm_arch { struct once nx_once; #ifdef CONFIG_X86_64 - /* The number of TDP MMU pages across all roots. */ +#ifdef CONFIG_KVM_PROVE_MMU + /* + * The number of TDP MMU pages across all roots. Used only to sanity + * check that KVM isn't leaking TDP MMU pages. + */ atomic64_t tdp_mmu_pages; +#endif /* * List of struct kvm_mmu_pages being used as roots. diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 7cc0564f5f97..21a3b8166242 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -40,7 +40,9 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) kvm_tdp_mmu_invalidate_roots(kvm, KVM_VALID_ROOTS); kvm_tdp_mmu_zap_invalidated_roots(kvm, false); - WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages)); +#ifdef CONFIG_KVM_PROVE_MMU + KVM_MMU_WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages)); +#endif WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots)); /* @@ -325,13 +327,17 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { kvm_account_pgtable_pages((void *)sp->spt, +1); +#ifdef CONFIG_KVM_PROVE_MMU atomic64_inc(&kvm->arch.tdp_mmu_pages); +#endif } static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { kvm_account_pgtable_pages((void *)sp->spt, -1); +#ifdef CONFIG_KVM_PROVE_MMU atomic64_dec(&kvm->arch.tdp_mmu_pages); +#endif } /** From 459a35111b0a890172a78d51c01b204e13a34a18 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 14 Mar 2025 19:46:23 -0700 Subject: [PATCH 4/5] KVM: Allow building irqbypass.ko as as module when kvm.ko is a module Convert HAVE_KVM_IRQ_BYPASS into a tristate so that selecting IRQ_BYPASS_MANAGER follows KVM={m,y}, i.e. doesn't force irqbypass.ko to be built-in. Note, PPC allows building KVM as a module, but selects HAVE_KVM_IRQ_BYPASS from a boolean Kconfig, i.e. KVM PPC unnecessarily forces irqbpass.ko to be built-in. But that flaw is a longstanding PPC specific issue. Fixes: 61df71ee992d ("kvm: move "select IRQ_BYPASS_MANAGER" to common code") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-ID: <20250315024623.2363994-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 2 +- virt/kvm/Kconfig | 2 +- virt/kvm/eventfd.c | 10 +++++----- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5438a1b446a6..291d49b9bf05 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2382,7 +2382,7 @@ static inline bool kvm_is_visible_memslot(struct kvm_memory_slot *memslot) struct kvm_vcpu *kvm_get_running_vcpu(void); struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void); -#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS +#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) bool kvm_arch_has_irq_bypass(void); int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *, struct irq_bypass_producer *); diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 746e1f466aa6..727b542074e7 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -75,7 +75,7 @@ config KVM_COMPAT depends on KVM && COMPAT && !(S390 || ARM64 || RISCV) config HAVE_KVM_IRQ_BYPASS - bool + tristate select IRQ_BYPASS_MANAGER config HAVE_KVM_VCPU_ASYNC_IOCTL diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 249ba5b72e9b..11e5d1e3f12e 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -149,7 +149,7 @@ irqfd_shutdown(struct work_struct *work) /* * It is now safe to release the object's resources */ -#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS +#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) irq_bypass_unregister_consumer(&irqfd->consumer); #endif eventfd_ctx_put(irqfd->eventfd); @@ -274,7 +274,7 @@ static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd) write_seqcount_end(&irqfd->irq_entry_sc); } -#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS +#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) void __attribute__((weak)) kvm_arch_irq_bypass_stop( struct irq_bypass_consumer *cons) { @@ -424,7 +424,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) if (events & EPOLLIN) schedule_work(&irqfd->inject); -#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS +#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) if (kvm_arch_has_irq_bypass()) { irqfd->consumer.token = (void *)irqfd->eventfd; irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer; @@ -609,14 +609,14 @@ void kvm_irq_routing_update(struct kvm *kvm) spin_lock_irq(&kvm->irqfds.lock); list_for_each_entry(irqfd, &kvm->irqfds.items, list) { -#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS +#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) /* Under irqfds.lock, so can read irq_entry safely */ struct kvm_kernel_irq_routing_entry old = irqfd->irq_entry; #endif irqfd_update(kvm, irqfd); -#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS +#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) if (irqfd->producer && kvm_arch_irqfd_route_changed(&old, &irqfd->irq_entry)) { int ret = kvm_arch_update_irqfd_routing( From bc52ae0a708cb6fa3926d11c88e3c55e1171b4a1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 14 Mar 2025 19:41:02 -0700 Subject: [PATCH 5/5] KVM: x86: Explicitly zero-initialize on-stack CPUID unions Explicitly zero/empty-initialize the unions used for PMU related CPUID entries, instead of manually zeroing all fields (hopefully), or in the case of 0x80000022, relying on the compiler to clobber the uninitialized bitfields. Signed-off-by: Sean Christopherson Reviewed-by: Jim Mattson Message-ID: <20250315024102.2361628-1-seanjc@google.com> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 5e4d4934c0d3..571c906ffcbf 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1427,8 +1427,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) } break; case 0xa: { /* Architectural Performance Monitoring */ - union cpuid10_eax eax; - union cpuid10_edx edx; + union cpuid10_eax eax = { }; + union cpuid10_edx edx = { }; if (!enable_pmu || !static_cpu_has(X86_FEATURE_ARCH_PERFMON)) { entry->eax = entry->ebx = entry->ecx = entry->edx = 0; @@ -1444,8 +1444,6 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) if (kvm_pmu_cap.version) edx.split.anythread_deprecated = 1; - edx.split.reserved1 = 0; - edx.split.reserved2 = 0; entry->eax = eax.full; entry->ebx = kvm_pmu_cap.events_mask; @@ -1763,7 +1761,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) break; /* AMD Extended Performance Monitoring and Debug */ case 0x80000022: { - union cpuid_0x80000022_ebx ebx; + union cpuid_0x80000022_ebx ebx = { }; entry->ecx = entry->edx = 0; if (!enable_pmu || !kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) {