From be45bc4eff33d9a7dae84a2150f242a91a617402 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 24 Feb 2025 08:54:41 -0800 Subject: [PATCH 01/14] KVM: SVM: Set RFLAGS.IF=1 in C code, to get VMRUN out of the STI shadow Enable/disable local IRQs, i.e. set/clear RFLAGS.IF, in the common svm_vcpu_enter_exit() just after/before guest_state_{enter,exit}_irqoff() so that VMRUN is not executed in an STI shadow. AMD CPUs have a quirk (some would say "bug"), where the STI shadow bleeds into the guest's intr_state field if a #VMEXIT occurs during injection of an event, i.e. if the VMRUN doesn't complete before the subsequent #VMEXIT. The spurious "interrupts masked" state is relatively benign, as it only occurs during event injection and is transient. Because KVM is already injecting an event, the guest can't be in HLT, and if KVM is querying IRQ blocking for injection, then KVM would need to force an immediate exit anyways since injecting multiple events is impossible. However, because KVM copies int_state verbatim from vmcb02 to vmcb12, the spurious STI shadow is visible to L1 when running a nested VM, which can trip sanity checks, e.g. in VMware's VMM. Hoist the STI+CLI all the way to C code, as the aforementioned calls to guest_state_{enter,exit}_irqoff() already inform lockdep that IRQs are enabled/disabled, and taking a fault on VMRUN with RFLAGS.IF=1 is already possible. I.e. if there's kernel code that is confused by running with RFLAGS.IF=1, then it's already a problem. In practice, since GIF=0 also blocks NMIs, the only change in exposure to non-KVM code (relative to surrounding VMRUN with STI+CLI) is exception handling code, and except for the kvm_rebooting=1 case, all exception in the core VM-Enter/VM-Exit path are fatal. Use the "raw" variants to enable/disable IRQs to avoid tracing in the "no instrumentation" code; the guest state helpers also take care of tracing IRQ state. Oppurtunstically document why KVM needs to do STI in the first place. Reported-by: Doug Covelli Closes: https://lore.kernel.org/all/CADH9ctBs1YPmE4aCfGPNBwA10cA8RuAk2gO7542DjMZgs4uzJQ@mail.gmail.com Fixes: f14eec0a3203 ("KVM: SVM: move more vmentry code to assembly") Cc: stable@vger.kernel.org Reviewed-by: Jim Mattson Link: https://lore.kernel.org/r/20250224165442.2338294-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 14 ++++++++++++++ arch/x86/kvm/svm/vmenter.S | 10 +--------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index a713c803a3a37..0d299f3f921e6 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4189,6 +4189,18 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in guest_state_enter_irqoff(); + /* + * Set RFLAGS.IF prior to VMRUN, as the host's RFLAGS.IF at the time of + * VMRUN controls whether or not physical IRQs are masked (KVM always + * runs with V_INTR_MASKING_MASK). Toggle RFLAGS.IF here to avoid the + * temptation to do STI+VMRUN+CLI, as AMD CPUs bleed the STI shadow + * into guest state if delivery of an event during VMRUN triggers a + * #VMEXIT, and the guest_state transitions already tell lockdep that + * IRQs are being enabled/disabled. Note! GIF=0 for the entirety of + * this path, so IRQs aren't actually unmasked while running host code. + */ + raw_local_irq_enable(); + amd_clear_divider(); if (sev_es_guest(vcpu->kvm)) @@ -4197,6 +4209,8 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in else __svm_vcpu_run(svm, spec_ctrl_intercepted); + raw_local_irq_disable(); + guest_state_exit_irqoff(); } diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index 2ed80aea3bb13..0c61153b275f6 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -170,12 +170,8 @@ SYM_FUNC_START(__svm_vcpu_run) mov VCPU_RDI(%_ASM_DI), %_ASM_DI /* Enter guest mode */ - sti - 3: vmrun %_ASM_AX 4: - cli - /* Pop @svm to RAX while it's the only available register. */ pop %_ASM_AX @@ -340,12 +336,8 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) mov KVM_VMCB_pa(%rax), %rax /* Enter guest mode */ - sti - 1: vmrun %rax - -2: cli - +2: /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT From f3513a335e71296a1851167b4e3b0e2bf09fc5f1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 24 Feb 2025 08:54:42 -0800 Subject: [PATCH 02/14] KVM: selftests: Assert that STI blocking isn't set after event injection Add an L1 (guest) assert to the nested exceptions test to verify that KVM doesn't put VMRUN in an STI shadow (AMD CPUs bleed the shadow into the guest's int_state if a #VMEXIT occurs before VMRUN fully completes). Add a similar assert to the VMX side as well, because why not. Reviewed-by: Jim Mattson Link: https://lore.kernel.org/r/20250224165442.2338294-3-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/x86/nested_exceptions_test.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/kvm/x86/nested_exceptions_test.c b/tools/testing/selftests/kvm/x86/nested_exceptions_test.c index 3eb0313ffa397..3641a42934acb 100644 --- a/tools/testing/selftests/kvm/x86/nested_exceptions_test.c +++ b/tools/testing/selftests/kvm/x86/nested_exceptions_test.c @@ -85,6 +85,7 @@ static void svm_run_l2(struct svm_test_data *svm, void *l2_code, int vector, GUEST_ASSERT_EQ(ctrl->exit_code, (SVM_EXIT_EXCP_BASE + vector)); GUEST_ASSERT_EQ(ctrl->exit_info_1, error_code); + GUEST_ASSERT(!ctrl->int_state); } static void l1_svm_code(struct svm_test_data *svm) @@ -122,6 +123,7 @@ static void vmx_run_l2(void *l2_code, int vector, uint32_t error_code) GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_EXCEPTION_NMI); GUEST_ASSERT_EQ((vmreadz(VM_EXIT_INTR_INFO) & 0xff), vector); GUEST_ASSERT_EQ(vmreadz(VM_EXIT_INTR_ERROR_CODE), error_code); + GUEST_ASSERT(!vmreadz(GUEST_INTERRUPTIBILITY_INFO)); } static void l1_vmx_code(struct vmx_pages *vmx) From ee89e8013383d50a27ea9bf3c8a69eed6799856f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 27 Feb 2025 14:24:06 -0800 Subject: [PATCH 03/14] KVM: SVM: Drop DEBUGCTL[5:2] from guest's effective value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop bits 5:2 from the guest's effective DEBUGCTL value, as AMD changed the architectural behavior of the bits and broke backwards compatibility. On CPUs without BusLockTrap (or at least, in APMs from before ~2023), bits 5:2 controlled the behavior of external pins: Performance-Monitoring/Breakpoint Pin-Control (PBi)—Bits 5:2, read/write. Software uses thesebits to control the type of information reported by the four external performance-monitoring/breakpoint pins on the processor. When a PBi bit is cleared to 0, the corresponding external pin (BPi) reports performance-monitor information. When a PBi bit is set to 1, the corresponding external pin (BPi) reports breakpoint information. With the introduction of BusLockTrap, presumably to be compatible with Intel CPUs, AMD redefined bit 2 to be BLCKDB: Bus Lock #DB Trap (BLCKDB)—Bit 2, read/write. Software sets this bit to enable generation of a #DB trap following successful execution of a bus lock when CPL is > 0. and redefined bits 5:3 (and bit 6) as "6:3 Reserved MBZ". Ideally, KVM would treat bits 5:2 as reserved. Defer that change to a feature cleanup to avoid breaking existing guest in LTS kernels. For now, drop the bits to retain backwards compatibility (of a sort). Note, dropping bits 5:2 is still a guest-visible change, e.g. if the guest is enabling LBRs *and* the legacy PBi bits, then the state of the PBi bits is visible to the guest, whereas now the guest will always see '0'. Reported-by: Ravi Bangoria Cc: stable@vger.kernel.org Reviewed-and-tested-by: Ravi Bangoria Link: https://lore.kernel.org/r/20250227222411.3490595-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 12 ++++++++++++ arch/x86/kvm/svm/svm.h | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 0d299f3f921e6..bdafbde1f211a 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3165,6 +3165,18 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) kvm_pr_unimpl_wrmsr(vcpu, ecx, data); break; } + + /* + * AMD changed the architectural behavior of bits 5:2. On CPUs + * without BusLockTrap, bits 5:2 control "external pins", but + * on CPUs that support BusLockDetect, bit 2 enables BusLockTrap + * and bits 5:3 are reserved-to-zero. Sadly, old KVM allowed + * the guest to set bits 5:2 despite not actually virtualizing + * Performance-Monitoring/Breakpoint external pins. Drop bits + * 5:2 for backwards compatibility. + */ + data &= ~GENMASK(5, 2); + if (data & DEBUGCTL_RESERVED_BITS) return 1; diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 9d7cdb8fbf872..3a931d3885e71 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -584,7 +584,7 @@ static inline bool is_vnmi_enabled(struct vcpu_svm *svm) /* svm.c */ #define MSR_INVALID 0xffffffffU -#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) +#define DEBUGCTL_RESERVED_BITS (~(DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR)) extern bool dump_invalid_vmcb; From d0eac42f5cecce009d315655bee341304fbe075e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 27 Feb 2025 14:24:07 -0800 Subject: [PATCH 04/14] KVM: SVM: Suppress DEBUGCTL.BTF on AMD Mark BTF as reserved in DEBUGCTL on AMD, as KVM doesn't actually support BTF, and fully enabling BTF virtualization is non-trivial due to interactions with the emulator, guest_debug, #DB interception, nested SVM, etc. Don't inject #GP if the guest attempts to set BTF, as there's no way to communicate lack of support to the guest, and instead suppress the flag and treat the WRMSR as (partially) unsupported. In short, make KVM behave the same on AMD and Intel (VMX already squashes BTF). Note, due to other bugs in KVM's handling of DEBUGCTL, the only way BTF has "worked" in any capacity is if the guest simultaneously enables LBRs. Reported-by: Ravi Bangoria Cc: stable@vger.kernel.org Reviewed-and-tested-by: Ravi Bangoria Link: https://lore.kernel.org/r/20250227222411.3490595-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 9 +++++++++ arch/x86/kvm/svm/svm.h | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index bdafbde1f211a..ed48465186961 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3177,6 +3177,15 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) */ data &= ~GENMASK(5, 2); + /* + * Suppress BTF as KVM doesn't virtualize BTF, but there's no + * way to communicate lack of support to the guest. + */ + if (data & DEBUGCTLMSR_BTF) { + kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data); + data &= ~DEBUGCTLMSR_BTF; + } + if (data & DEBUGCTL_RESERVED_BITS) return 1; diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 3a931d3885e71..ea44c1da5a7c9 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -584,7 +584,7 @@ static inline bool is_vnmi_enabled(struct vcpu_svm *svm) /* svm.c */ #define MSR_INVALID 0xffffffffU -#define DEBUGCTL_RESERVED_BITS (~(DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR)) +#define DEBUGCTL_RESERVED_BITS (~DEBUGCTLMSR_LBR) extern bool dump_invalid_vmcb; From fb71c795935652fa20eaf9517ca9547f5af99a76 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 27 Feb 2025 14:24:08 -0800 Subject: [PATCH 05/14] KVM: x86: Snapshot the host's DEBUGCTL in common x86 Move KVM's snapshot of DEBUGCTL to kvm_vcpu_arch and take the snapshot in common x86, so that SVM can also use the snapshot. Opportunistically change the field to a u64. While bits 63:32 are reserved on AMD, not mentioned at all in Intel's SDM, and managed as an "unsigned long" by the kernel, DEBUGCTL is an MSR and therefore a 64-bit value. Reviewed-by: Xiaoyao Li Cc: stable@vger.kernel.org Reviewed-and-tested-by: Ravi Bangoria Link: https://lore.kernel.org/r/20250227222411.3490595-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/vmx/vmx.c | 8 ++------ arch/x86/kvm/vmx/vmx.h | 2 -- arch/x86/kvm/x86.c | 1 + 4 files changed, 4 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0b7af5902ff75..32ae3aa50c7e3 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -780,6 +780,7 @@ struct kvm_vcpu_arch { u32 pkru; u32 hflags; u64 efer; + u64 host_debugctl; u64 apic_base; struct kvm_lapic *apic; /* kernel irqchip context */ bool load_eoi_exitmap_pending; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 6c56d5235f0f3..3b92f893b2392 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1514,16 +1514,12 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, */ void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm)) shrink_ple_window(vcpu); vmx_vcpu_load_vmcs(vcpu, cpu, NULL); vmx_vcpu_pi_load(vcpu, cpu); - - vmx->host_debugctlmsr = get_debugctlmsr(); } void vmx_vcpu_put(struct kvm_vcpu *vcpu) @@ -7458,8 +7454,8 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) } /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ - if (vmx->host_debugctlmsr) - update_debugctlmsr(vmx->host_debugctlmsr); + if (vcpu->arch.host_debugctl) + update_debugctlmsr(vcpu->arch.host_debugctl); #ifndef CONFIG_X86_64 /* diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 8b111ce1087c7..951e44dc9d0ea 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -340,8 +340,6 @@ struct vcpu_vmx { /* apic deadline value in host tsc */ u64 hv_deadline_tsc; - unsigned long host_debugctlmsr; - /* * Only bits masked by msr_ia32_feature_control_valid_bits can be set in * msr_ia32_feature_control. FEAT_CTL_LOCKED is always included diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 02159c967d29e..5c6fd0edc41f4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4968,6 +4968,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) /* Save host pkru register if supported */ vcpu->arch.host_pkru = read_pkru(); + vcpu->arch.host_debugctl = get_debugctlmsr(); /* Apply any externally detected TSC adjustments (due to suspend) */ if (unlikely(vcpu->arch.tsc_offset_adjustment)) { From 433265870ab3455b418885bff48fa5fd02f7e448 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 27 Feb 2025 14:24:09 -0800 Subject: [PATCH 06/14] KVM: SVM: Manually context switch DEBUGCTL if LBR virtualization is disabled Manually load the guest's DEBUGCTL prior to VMRUN (and restore the host's value on #VMEXIT) if it diverges from the host's value and LBR virtualization is disabled, as hardware only context switches DEBUGCTL if LBR virtualization is fully enabled. Running the guest with the host's value has likely been mildly problematic for quite some time, e.g. it will result in undesirable behavior if BTF diverges (with the caveat that KVM now suppresses guest BTF due to lack of support). But the bug became fatal with the introduction of Bus Lock Trap ("Detect" in kernel paralance) support for AMD (commit 408eb7417a92 ("x86/bus_lock: Add support for AMD")), as a bus lock in the guest will trigger an unexpected #DB. Note, suppressing the bus lock #DB, i.e. simply resuming the guest without injecting a #DB, is not an option. It wouldn't address the general issue with DEBUGCTL, e.g. for things like BTF, and there are other guest-visible side effects if BusLockTrap is left enabled. If BusLockTrap is disabled, then DR6.BLD is reserved-to-1; any attempts to clear it by software are ignored. But if BusLockTrap is enabled, software can clear DR6.BLD: Software enables bus lock trap by setting DebugCtl MSR[BLCKDB] (bit 2) to 1. When bus lock trap is enabled, ... The processor indicates that this #DB was caused by a bus lock by clearing DR6[BLD] (bit 11). DR6[11] previously had been defined to be always 1. and clearing DR6.BLD is "sticky" in that it's not set (i.e. lowered) by other #DBs: All other #DB exceptions leave DR6[BLD] unmodified E.g. leaving BusLockTrap enable can confuse a legacy guest that writes '0' to reset DR6. Reported-by: rangemachine@gmail.com Reported-by: whanos@sergal.fun Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219787 Closes: https://lore.kernel.org/all/bug-219787-28872@https.bugzilla.kernel.org%2F Cc: Ravi Bangoria Cc: stable@vger.kernel.org Reviewed-and-tested-by: Ravi Bangoria Link: https://lore.kernel.org/r/20250227222411.3490595-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index ed48465186961..e67de787fc714 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4288,6 +4288,16 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, clgi(); kvm_load_guest_xsave_state(vcpu); + /* + * Hardware only context switches DEBUGCTL if LBR virtualization is + * enabled. Manually load DEBUGCTL if necessary (and restore it after + * VM-Exit), as running with the host's DEBUGCTL can negatively affect + * guest state and can even be fatal, e.g. due to Bus Lock Detect. + */ + if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) && + vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl) + update_debugctlmsr(svm->vmcb->save.dbgctl); + kvm_wait_lapic_expire(vcpu); /* @@ -4315,6 +4325,10 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); + if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) && + vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl) + update_debugctlmsr(vcpu->arch.host_debugctl); + kvm_load_host_xsave_state(vcpu); stgi(); From 189ecdb3e112da703ac0699f4ec76aa78122f911 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 27 Feb 2025 14:24:10 -0800 Subject: [PATCH 07/14] KVM: x86: Snapshot the host's DEBUGCTL after disabling IRQs Snapshot the host's DEBUGCTL after disabling IRQs, as perf can toggle debugctl bits from IRQ context, e.g. when enabling/disabling events via smp_call_function_single(). Taking the snapshot (long) before IRQs are disabled could result in KVM effectively clobbering DEBUGCTL due to using a stale snapshot. Cc: stable@vger.kernel.org Reviewed-and-tested-by: Ravi Bangoria Link: https://lore.kernel.org/r/20250227222411.3490595-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5c6fd0edc41f4..12d5f47c1bbe9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4968,7 +4968,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) /* Save host pkru register if supported */ vcpu->arch.host_pkru = read_pkru(); - vcpu->arch.host_debugctl = get_debugctlmsr(); /* Apply any externally detected TSC adjustments (due to suspend) */ if (unlikely(vcpu->arch.tsc_offset_adjustment)) { @@ -10969,6 +10968,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) set_debugreg(0, 7); } + vcpu->arch.host_debugctl = get_debugctlmsr(); + guest_timing_enter_irqoff(); for (;;) { From 7a68b55ff39b0a1638acb1694c185d49f6077a0d Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 27 Feb 2025 18:05:25 +0000 Subject: [PATCH 08/14] KVM: arm64: Initialize HCR_EL2.E2H early On CPUs without FEAT_E2H0, HCR_EL2.E2H is RES1, but may reset to an UNKNOWN value out of reset and consequently may not read as 1 unless it has been explicitly initialized. We handled this for the head.S boot code in commits: 3944382fa6f22b54 ("arm64: Treat HCR_EL2.E2H as RES1 when ID_AA64MMFR4_EL1.E2H0 is negative") b3320142f3db9b3f ("arm64: Fix early handling of FEAT_E2H0 not being implemented") Unfortunately, we forgot to apply a similar fix to the KVM PSCI entry points used when relaying CPU_ON, CPU_SUSPEND, and SYSTEM SUSPEND. When KVM is entered via these entry points, the value of HCR_EL2.E2H may be consumed before it has been initialized (e.g. by the 'init_el2_state' macro). Initialize HCR_EL2.E2H early in these paths such that it can be consumed reliably. The existing code in head.S is factored out into a new 'init_el2_hcr' macro, and this is used in the __kvm_hyp_init_cpu() function common to all the relevant PSCI entry points. For clarity, I've tweaked the assembly used to check whether ID_AA64MMFR4_EL1.E2H0 is negative. The bitfield is extracted as a signed value, and this is checked with a signed-greater-or-equal (GE) comparison. As the hyp code will reconfigure HCR_EL2 later in ___kvm_hyp_init(), all bits other than E2H are initialized to zero in __kvm_hyp_init_cpu(). Fixes: 3944382fa6f22b54 ("arm64: Treat HCR_EL2.E2H as RES1 when ID_AA64MMFR4_EL1.E2H0 is negative") Fixes: b3320142f3db9b3f ("arm64: Fix early handling of FEAT_E2H0 not being implemented") Signed-off-by: Mark Rutland Cc: Ahmed Genidi Cc: Ben Horgan Cc: Catalin Marinas Cc: Leo Yan Cc: Marc Zyngier Cc: Oliver Upton Cc: Will Deacon Link: https://lore.kernel.org/r/20250227180526.1204723-2-mark.rutland@arm.com [maz: fixed LT->GE thinko] Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/el2_setup.h | 26 ++++++++++++++++++++++++++ arch/arm64/kernel/head.S | 19 +------------------ arch/arm64/kvm/hyp/nvhe/hyp-init.S | 8 +++++++- 3 files changed, 34 insertions(+), 19 deletions(-) diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index 25e1626517500..56034a394b437 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -16,6 +16,32 @@ #include #include +.macro init_el2_hcr val + mov_q x0, \val + + /* + * Compliant CPUs advertise their VHE-onlyness with + * ID_AA64MMFR4_EL1.E2H0 < 0. On such CPUs HCR_EL2.E2H is RES1, but it + * can reset into an UNKNOWN state and might not read as 1 until it has + * been initialized explicitly. + * + * Fruity CPUs seem to have HCR_EL2.E2H set to RAO/WI, but + * don't advertise it (they predate this relaxation). + * + * Initalize HCR_EL2.E2H so that later code can rely upon HCR_EL2.E2H + * indicating whether the CPU is running in E2H mode. + */ + mrs_s x1, SYS_ID_AA64MMFR4_EL1 + sbfx x1, x1, #ID_AA64MMFR4_EL1_E2H0_SHIFT, #ID_AA64MMFR4_EL1_E2H0_WIDTH + cmp x1, #0 + b.ge .LnVHE_\@ + + orr x0, x0, #HCR_E2H +.LnVHE_\@: + msr hcr_el2, x0 + isb +.endm + .macro __init_el2_sctlr mov_q x0, INIT_SCTLR_EL2_MMU_OFF msr sctlr_el2, x0 diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 5ab1970ee5436..2d56459d6c94c 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -298,25 +298,8 @@ SYM_INNER_LABEL(init_el2, SYM_L_LOCAL) msr sctlr_el2, x0 isb 0: - mov_q x0, HCR_HOST_NVHE_FLAGS - - /* - * Compliant CPUs advertise their VHE-onlyness with - * ID_AA64MMFR4_EL1.E2H0 < 0. HCR_EL2.E2H can be - * RES1 in that case. Publish the E2H bit early so that - * it can be picked up by the init_el2_state macro. - * - * Fruity CPUs seem to have HCR_EL2.E2H set to RAO/WI, but - * don't advertise it (they predate this relaxation). - */ - mrs_s x1, SYS_ID_AA64MMFR4_EL1 - tbz x1, #(ID_AA64MMFR4_EL1_E2H0_SHIFT + ID_AA64MMFR4_EL1_E2H0_WIDTH - 1), 1f - - orr x0, x0, #HCR_E2H -1: - msr hcr_el2, x0 - isb + init_el2_hcr HCR_HOST_NVHE_FLAGS init_el2_state /* Hypervisor stub */ diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S index fc18662260676..3fb5504a7d7fc 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S @@ -73,8 +73,12 @@ __do_hyp_init: eret SYM_CODE_END(__kvm_hyp_init) +/* + * Initialize EL2 CPU state to sane values. + * + * HCR_EL2.E2H must have been initialized already. + */ SYM_CODE_START_LOCAL(__kvm_init_el2_state) - /* Initialize EL2 CPU state to sane values. */ init_el2_state // Clobbers x0..x2 finalise_el2_state ret @@ -206,6 +210,8 @@ SYM_CODE_START_LOCAL(__kvm_hyp_init_cpu) 2: msr SPsel, #1 // We want to use SP_EL{1,2} + init_el2_hcr 0 + bl __kvm_init_el2_state __init_el2_nvhe_prepare_eret From 3855a7b91d42ebf3513b7ccffc44807274978b3d Mon Sep 17 00:00:00 2001 From: Ahmed Genidi Date: Thu, 27 Feb 2025 18:05:26 +0000 Subject: [PATCH 09/14] KVM: arm64: Initialize SCTLR_EL1 in __kvm_hyp_init_cpu() When KVM is in protected mode, host calls to PSCI are proxied via EL2, and cold entries from CPU_ON, CPU_SUSPEND, and SYSTEM_SUSPEND bounce through __kvm_hyp_init_cpu() at EL2 before entering the host kernel's entry point at EL1. While __kvm_hyp_init_cpu() initializes SPSR_EL2 for the exception return to EL1, it does not initialize SCTLR_EL1. Due to this, it's possible to enter EL1 with SCTLR_EL1 in an UNKNOWN state. In practice this has been seen to result in kernel crashes after CPU_ON as a result of SCTLR_EL1.M being 1 in violation of the initial core configuration specified by PSCI. Fix this by initializing SCTLR_EL1 for cold entry to the host kernel. As it's necessary to write to SCTLR_EL12 in VHE mode, this initialization is moved into __kvm_host_psci_cpu_entry() where we can use write_sysreg_el1(). The remnants of the '__init_el2_nvhe_prepare_eret' macro are folded into its only caller, as this is clearer than having the macro. Fixes: cdf367192766ad11 ("KVM: arm64: Intercept host's CPU_ON SMCs") Reported-by: Leo Yan Signed-off-by: Ahmed Genidi [ Mark: clarify commit message, handle E2H, move to C, remove macro ] Signed-off-by: Mark Rutland Cc: Ahmed Genidi Cc: Ben Horgan Cc: Catalin Marinas Cc: Leo Yan Cc: Marc Zyngier Cc: Oliver Upton Cc: Will Deacon Reviewed-by: Leo Yan Link: https://lore.kernel.org/r/20250227180526.1204723-3-mark.rutland@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/el2_setup.h | 5 ----- arch/arm64/kernel/head.S | 3 ++- arch/arm64/kvm/hyp/nvhe/hyp-init.S | 2 -- arch/arm64/kvm/hyp/nvhe/psci-relay.c | 3 +++ 4 files changed, 5 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index 56034a394b437..555c613fd2324 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -270,11 +270,6 @@ .Lskip_gcs_\@: .endm -.macro __init_el2_nvhe_prepare_eret - mov x0, #INIT_PSTATE_EL1 - msr spsr_el2, x0 -.endm - .macro __init_el2_mpam /* Memory Partitioning And Monitoring: disable EL2 traps */ mrs x1, id_aa64pfr0_el1 diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 2d56459d6c94c..2ce73525de2c9 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -322,7 +322,8 @@ SYM_INNER_LABEL(init_el2, SYM_L_LOCAL) msr sctlr_el1, x1 mov x2, xzr 3: - __init_el2_nvhe_prepare_eret + mov x0, #INIT_PSTATE_EL1 + msr spsr_el2, x0 mov w0, #BOOT_CPU_MODE_EL2 orr x0, x0, x2 diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S index 3fb5504a7d7fc..f8af11189572f 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S @@ -214,8 +214,6 @@ SYM_CODE_START_LOCAL(__kvm_hyp_init_cpu) bl __kvm_init_el2_state - __init_el2_nvhe_prepare_eret - /* Enable MMU, set vectors and stack. */ mov x0, x28 bl ___kvm_hyp_init // Clobbers x0..x2 diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c b/arch/arm64/kvm/hyp/nvhe/psci-relay.c index 9c2ce1e0e99a5..c3e196fb8b18f 100644 --- a/arch/arm64/kvm/hyp/nvhe/psci-relay.c +++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c @@ -218,6 +218,9 @@ asmlinkage void __noreturn __kvm_host_psci_cpu_entry(bool is_cpu_on) if (is_cpu_on) release_boot_args(boot_args); + write_sysreg_el1(INIT_SCTLR_EL1_MMU_OFF, SYS_SCTLR); + write_sysreg(INIT_PSTATE_EL1, SPSR_EL2); + __host_enter(host_ctxt); } From b2653cd3b75f62f29b72df4070e20357acb52bc4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 26 Feb 2025 17:25:32 -0800 Subject: [PATCH 10/14] KVM: SVM: Save host DR masks on CPUs with DebugSwap When running SEV-SNP guests on a CPU that supports DebugSwap, always save the host's DR0..DR3 mask MSR values irrespective of whether or not DebugSwap is enabled, to ensure the host values aren't clobbered by the CPU. And for now, also save DR0..DR3, even though doing so isn't necessary (see below). SVM_VMGEXIT_AP_CREATE is deeply flawed in that it allows the *guest* to create a VMSA with guest-controlled SEV_FEATURES. A well behaved guest can inform the hypervisor, i.e. KVM, of its "requested" features, but on CPUs without ALLOWED_SEV_FEATURES support, nothing prevents the guest from lying about which SEV features are being enabled (or not!). If a misbehaving guest enables DebugSwap in a secondary vCPU's VMSA, the CPU will load the DR0..DR3 mask MSRs on #VMEXIT, i.e. will clobber the MSRs with '0' if KVM doesn't save its desired value. Note, DR0..DR3 themselves are "ok", as DR7 is reset on #VMEXIT, and KVM restores all DRs in common x86 code as needed via hw_breakpoint_restore(). I.e. there is no risk of host DR0..DR3 being clobbered (when it matters). However, there is a flaw in the opposite direction; because the guest can lie about enabling DebugSwap, i.e. can *disable* DebugSwap without KVM's knowledge, KVM must not rely on the CPU to restore DRs. Defer fixing that wart, as it's more of a documentation issue than a bug in the code. Note, KVM added support for DebugSwap on commit d1f85fbe836e ("KVM: SEV: Enable data breakpoints in SEV-ES"), but that is not an appropriate Fixes, as the underlying flaw exists in hardware, not in KVM. I.e. all kernels that support SEV-SNP need to be patched, not just kernels with KVM's full support for DebugSwap (ignoring that DebugSwap support landed first). Opportunistically fix an incorrect statement in the comment; on CPUs without DebugSwap, the CPU does NOT save or load debug registers, i.e. Fixes: e366f92ea99e ("KVM: SEV: Support SEV-SNP AP Creation NAE event") Cc: stable@vger.kernel.org Cc: Naveen N Rao Cc: Kim Phillips Cc: Tom Lendacky Cc: Alexey Kardashevskiy Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20250227012541.3234589-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index a2a794c320503..ef057c85a67ce 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -4580,6 +4580,8 @@ void sev_es_vcpu_reset(struct vcpu_svm *svm) void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa) { + struct kvm *kvm = svm->vcpu.kvm; + /* * All host state for SEV-ES guests is categorized into three swap types * based on how it is handled by hardware during a world switch: @@ -4603,10 +4605,15 @@ void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_are /* * If DebugSwap is enabled, debug registers are loaded but NOT saved by - * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU both - * saves and loads debug registers (Type-A). + * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU does + * not save or load debug registers. Sadly, on CPUs without + * ALLOWED_SEV_FEATURES, KVM can't prevent SNP guests from enabling + * DebugSwap on secondary vCPUs without KVM's knowledge via "AP Create". + * Save all registers if DebugSwap is supported to prevent host state + * from being clobbered by a misbehaving guest. */ - if (sev_vcpu_has_debug_swap(svm)) { + if (sev_vcpu_has_debug_swap(svm) || + (sev_snp_guest(kvm) && cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP))) { hostsa->dr0 = native_get_debugreg(0); hostsa->dr1 = native_get_debugreg(1); hostsa->dr2 = native_get_debugreg(2); From 807cb9ce2ed9a1b6e79e70fb2cdb7860f1517dcc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 26 Feb 2025 17:25:33 -0800 Subject: [PATCH 11/14] KVM: SVM: Don't rely on DebugSwap to restore host DR0..DR3 Never rely on the CPU to restore/load host DR0..DR3 values, even if the CPU supports DebugSwap, as there are no guarantees that SNP guests will actually enable DebugSwap on APs. E.g. if KVM were to rely on the CPU to load DR0..DR3 and skipped them during hw_breakpoint_restore(), KVM would run with clobbered-to-zero DRs if an SNP guest created APs without DebugSwap enabled. Update the comment to explain the dangers, and hopefully prevent breaking KVM in the future. Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20250227012541.3234589-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index ef057c85a67ce..080f8cecd7ca6 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -4606,18 +4606,21 @@ void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_are /* * If DebugSwap is enabled, debug registers are loaded but NOT saved by * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU does - * not save or load debug registers. Sadly, on CPUs without - * ALLOWED_SEV_FEATURES, KVM can't prevent SNP guests from enabling - * DebugSwap on secondary vCPUs without KVM's knowledge via "AP Create". - * Save all registers if DebugSwap is supported to prevent host state - * from being clobbered by a misbehaving guest. + * not save or load debug registers. Sadly, KVM can't prevent SNP + * guests from lying about DebugSwap on secondary vCPUs, i.e. the + * SEV_FEATURES provided at "AP Create" isn't guaranteed to match what + * the guest has actually enabled (or not!) in the VMSA. + * + * If DebugSwap is *possible*, save the masks so that they're restored + * if the guest enables DebugSwap. But for the DRs themselves, do NOT + * rely on the CPU to restore the host values; KVM will restore them as + * needed in common code, via hw_breakpoint_restore(). Note, KVM does + * NOT support virtualizing Breakpoint Extensions, i.e. the mask MSRs + * don't need to be restored per se, KVM just needs to ensure they are + * loaded with the correct values *if* the CPU writes the MSRs. */ if (sev_vcpu_has_debug_swap(svm) || (sev_snp_guest(kvm) && cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP))) { - hostsa->dr0 = native_get_debugreg(0); - hostsa->dr1 = native_get_debugreg(1); - hostsa->dr2 = native_get_debugreg(2); - hostsa->dr3 = native_get_debugreg(3); hostsa->dr0_addr_mask = amd_get_dr_addr_mask(0); hostsa->dr1_addr_mask = amd_get_dr_addr_mask(1); hostsa->dr2_addr_mask = amd_get_dr_addr_mask(2); From d88ed5fb7c88f404e57fe2b2a6d19fefc35b4dc7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 28 Feb 2025 15:08:04 -0800 Subject: [PATCH 12/14] KVM: selftests: Ensure all vCPUs hit -EFAULT during initial RO stage During the initial mprotect(RO) stage of mmu_stress_test, keep vCPUs spinning until all vCPUs have hit -EFAULT, i.e. until all vCPUs have tried to write to a read-only page. If a vCPU manages to complete an entire iteration of the loop without hitting a read-only page, *and* the vCPU observes mprotect_ro_done before starting a second iteration, then the vCPU will prematurely fall through to GUEST_SYNC(3) (on x86 and arm64) and get out of sequence. Replace the "do-while (!r)" loop around the associated _vcpu_run() with a single invocation, as barring a KVM bug, the vCPU is guaranteed to hit -EFAULT, and retrying on success is super confusion, hides KVM bugs, and complicates this fix. The do-while loop was semi-unintentionally added specifically to fudge around a KVM x86 bug, and said bug is unhittable without modifying the test to force x86 down the !(x86||arm64) path. On x86, if forced emulation is enabled, vcpu_arch_put_guest() may trigger emulation of the store to memory. Due a (very, very) longstanding bug in KVM x86's emulator, emulate writes to guest memory that fail during __kvm_write_guest_page() unconditionally return KVM_EXIT_MMIO. While that is desirable in the !memslot case, it's wrong in this case as the failure happens due to __copy_to_user() hitting a read-only page, not an emulated MMIO region. But as above, x86 only uses vcpu_arch_put_guest() if the __x86_64__ guards are clobbered to force x86 down the common path, and of course the unexpected MMIO is a KVM bug, i.e. *should* cause a test failure. Fixes: b6c304aec648 ("KVM: selftests: Verify KVM correctly handles mprotect(PROT_READ)") Reported-by: Yan Zhao Closes: https://lore.kernel.org/all/20250208105318.16861-1-yan.y.zhao@intel.com Debugged-by: Yan Zhao Reviewed-by: Yan Zhao Tested-by: Yan Zhao Link: https://lore.kernel.org/r/20250228230804.3845860-1-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/mmu_stress_test.c | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/kvm/mmu_stress_test.c b/tools/testing/selftests/kvm/mmu_stress_test.c index d9c76b4c0d88a..6a437d2be9fa4 100644 --- a/tools/testing/selftests/kvm/mmu_stress_test.c +++ b/tools/testing/selftests/kvm/mmu_stress_test.c @@ -18,6 +18,7 @@ #include "ucall_common.h" static bool mprotect_ro_done; +static bool all_vcpus_hit_ro_fault; static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride) { @@ -36,9 +37,9 @@ static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride) /* * Write to the region while mprotect(PROT_READ) is underway. Keep - * looping until the memory is guaranteed to be read-only, otherwise - * vCPUs may complete their writes and advance to the next stage - * prematurely. + * looping until the memory is guaranteed to be read-only and a fault + * has occurred, otherwise vCPUs may complete their writes and advance + * to the next stage prematurely. * * For architectures that support skipping the faulting instruction, * generate the store via inline assembly to ensure the exact length @@ -56,7 +57,7 @@ static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride) #else vcpu_arch_put_guest(*((volatile uint64_t *)gpa), gpa); #endif - } while (!READ_ONCE(mprotect_ro_done)); + } while (!READ_ONCE(mprotect_ro_done) || !READ_ONCE(all_vcpus_hit_ro_fault)); /* * Only architectures that write the entire range can explicitly sync, @@ -81,6 +82,7 @@ struct vcpu_info { static int nr_vcpus; static atomic_t rendezvous; +static atomic_t nr_ro_faults; static void rendezvous_with_boss(void) { @@ -148,12 +150,16 @@ static void *vcpu_worker(void *data) * be stuck on the faulting instruction for other architectures. Go to * stage 3 without a rendezvous */ - do { - r = _vcpu_run(vcpu); - } while (!r); + r = _vcpu_run(vcpu); TEST_ASSERT(r == -1 && errno == EFAULT, "Expected EFAULT on write to RO memory, got r = %d, errno = %d", r, errno); + atomic_inc(&nr_ro_faults); + if (atomic_read(&nr_ro_faults) == nr_vcpus) { + WRITE_ONCE(all_vcpus_hit_ro_fault, true); + sync_global_to_guest(vm, all_vcpus_hit_ro_fault); + } + #if defined(__x86_64__) || defined(__aarch64__) /* * Verify *all* writes from the guest hit EFAULT due to the VMA now @@ -378,7 +384,6 @@ int main(int argc, char *argv[]) rendezvous_with_vcpus(&time_run2, "run 2"); mprotect(mem, slot_size, PROT_READ); - usleep(10); mprotect_ro_done = true; sync_global_to_guest(vm, mprotect_ro_done); From 3b2d3db368013729fd2167a0d91fec821dba807c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 28 Feb 2025 15:38:52 -0800 Subject: [PATCH 13/14] KVM: selftests: Fix printf() format goof in SEV smoke test Print out the index of mismatching XSAVE bytes using unsigned decimal format. Some versions of clang complain about trying to print an integer as an unsigned char. x86/sev_smoke_test.c:55:51: error: format specifies type 'unsigned char' but the argument has type 'int' [-Werror,-Wformat] Fixes: 8c53183dbaa2 ("selftests: kvm: add test for transferring FPU state into VMSA") Link: https://lore.kernel.org/r/20250228233852.3855676-1-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/x86/sev_smoke_test.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/x86/sev_smoke_test.c b/tools/testing/selftests/kvm/x86/sev_smoke_test.c index a1a688e752666..d97816dc476a2 100644 --- a/tools/testing/selftests/kvm/x86/sev_smoke_test.c +++ b/tools/testing/selftests/kvm/x86/sev_smoke_test.c @@ -52,7 +52,8 @@ static void compare_xsave(u8 *from_host, u8 *from_guest) bool bad = false; for (i = 0; i < 4095; i++) { if (from_host[i] != from_guest[i]) { - printf("mismatch at %02hhx | %02hhx %02hhx\n", i, from_host[i], from_guest[i]); + printf("mismatch at %u | %02hhx %02hhx\n", + i, from_host[i], from_guest[i]); bad = true; } } From f9dc8fb3afc968042bdaf4b6e445a9272071c9f3 Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Tue, 4 Mar 2025 03:23:14 -0500 Subject: [PATCH 14/14] KVM: x86: Explicitly zero EAX and EBX when PERFMON_V2 isn't supported by KVM Fix a goof where KVM sets CPUID.0x80000022.EAX to CPUID.0x80000022.EBX instead of zeroing both when PERFMON_V2 isn't supported by KVM. In practice, barring a buggy CPU (or vCPU model when running nested) only the !enable_pmu case is affected, as KVM always supports PERFMON_V2 if it's available in hardware, i.e. CPUID.0x80000022.EBX will be '0' if PERFMON_V2 is unsupported. For the !enable_pmu case, the bug is relatively benign as KVM will refuse to enable PMU capabilities, but a VMM that reflects KVM's supported CPUID into the guest could inadvertently induce #GPs in the guest due to advertising support for MSRs that KVM refuses to emulate. Fixes: 94cdeebd8211 ("KVM: x86/cpuid: Add AMD CPUID ExtPerfMonAndDbg leaf 0x80000022") Signed-off-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250304082314.472202-3-xiaoyao.li@intel.com [sean: massage shortlog and changelog, tag for stable] Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 8eb3a88707f21..121edf1f2a79a 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1763,7 +1763,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->ecx = entry->edx = 0; if (!enable_pmu || !kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) { - entry->eax = entry->ebx; + entry->eax = entry->ebx = 0; break; }