Merge tag 'kvm-x86-fixes-6.14-rcN.2' of https://github.com/kvm-x86/linux into HEAD

KVM x86 fixes for 6.14-rcN #2

 - Set RFLAGS.IF in C code on SVM to get VMRUN out of the STI shadow.

 - Ensure DEBUGCTL is context switched on AMD to avoid running the guest with
   the host's value, which can lead to unexpected bus lock #DBs.

 - Suppress DEBUGCTL.BTF on AMD (to match Intel), as KVM doesn't properly
   emulate BTF.  KVM's lack of context switching has meant BTF has always been
   broken to some extent.

 - Always save DR masks for SNP vCPUs if DebugSwap is *supported*, as the guest
   can enable DebugSwap without KVM's knowledge.

 - Fix a bug in mmu_stress_test where a vCPU could finish the "writes to RO
   memory" phase without actually generating a write-protection fault.

 - Fix a printf() goof in the SEV smoke test that causes build failures with
   -Werror.

 - Explicitly zero EAX and EBX in CPUID.0x8000_0022 output when PERFMON_V2
   isn't supported by KVM.
Paolo Bonzini committed Mar 9, 2025
2 parents 1cdad67 + f9dc8fb commit ea9bd29
Showing 12 changed files with 91 additions and 35 deletions.
1 change: 1 addition & 0 deletions arch/x86/include/asm/kvm_host.h
@@ -780,6 +780,7 @@ struct kvm_vcpu_arch {
u32 pkru;
u32 hflags;
u64 efer;
u64 host_debugctl;
u64 apic_base;
struct kvm_lapic *apic; /* kernel irqchip context */
bool load_eoi_exitmap_pending;
2 changes: 1 addition & 1 deletion arch/x86/kvm/cpuid.c
@@ -1763,7 +1763,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)

entry->ecx = entry->edx = 0;
if (!enable_pmu || !kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) {
entry->eax = entry->ebx;
entry->eax = entry->ebx = 0;
break;
}

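For reference, the guest-visible effect of the cpuid.c change can be checked from userspace inside the guest with something along these lines. This is a minimal sketch, not part of the patch; it uses GCC's <cpuid.h> helper and assumes the documented layout of leaf 0x80000022 (EAX[0] = PerfMonV2, EBX[3:0] = number of core counters):

```c
#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* CPUID leaf 0x80000022: AMD extended performance monitoring and debug. */
	if (!__get_cpuid_count(0x80000022, 0, &eax, &ebx, &ecx, &edx)) {
		puts("leaf 0x80000022 not enumerated");
		return 0;
	}

	/*
	 * With the cpuid.c change above, a guest whose KVM does not expose
	 * PERFMON_V2 should see EAX and EBX cleared as well, so both values
	 * below read as zero.
	 */
	printf("PerfMonV2 bit (EAX[0]):    %u\n", eax & 1);
	printf("core PMC count (EBX[3:0]): %u\n", ebx & 0xf);
	return 0;
}
```
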
24 changes: 17 additions & 7 deletions arch/x86/kvm/svm/sev.c
@@ -4590,6 +4590,8 @@ void sev_es_vcpu_reset(struct vcpu_svm *svm)

void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa)
{
struct kvm *kvm = svm->vcpu.kvm;

/*
* All host state for SEV-ES guests is categorized into three swap types
* based on how it is handled by hardware during a world switch:
@@ -4613,14 +4615,22 @@ void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_are

/*
* If DebugSwap is enabled, debug registers are loaded but NOT saved by
* the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU both
* saves and loads debug registers (Type-A).
* the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU does
* not save or load debug registers. Sadly, KVM can't prevent SNP
* guests from lying about DebugSwap on secondary vCPUs, i.e. the
* SEV_FEATURES provided at "AP Create" isn't guaranteed to match what
* the guest has actually enabled (or not!) in the VMSA.
*
* If DebugSwap is *possible*, save the masks so that they're restored
* if the guest enables DebugSwap. But for the DRs themselves, do NOT
* rely on the CPU to restore the host values; KVM will restore them as
* needed in common code, via hw_breakpoint_restore(). Note, KVM does
* NOT support virtualizing Breakpoint Extensions, i.e. the mask MSRs
* don't need to be restored per se, KVM just needs to ensure they are
* loaded with the correct values *if* the CPU writes the MSRs.
*/
if (sev_vcpu_has_debug_swap(svm)) {
hostsa->dr0 = native_get_debugreg(0);
hostsa->dr1 = native_get_debugreg(1);
hostsa->dr2 = native_get_debugreg(2);
hostsa->dr3 = native_get_debugreg(3);
if (sev_vcpu_has_debug_swap(svm) ||
(sev_snp_guest(kvm) && cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP))) {
hostsa->dr0_addr_mask = amd_get_dr_addr_mask(0);
hostsa->dr1_addr_mask = amd_get_dr_addr_mask(1);
hostsa->dr2_addr_mask = amd_get_dr_addr_mask(2);
49 changes: 49 additions & 0 deletions arch/x86/kvm/svm/svm.c
@@ -3165,6 +3165,27 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
break;
}

/*
* AMD changed the architectural behavior of bits 5:2. On CPUs
* without BusLockTrap, bits 5:2 control "external pins", but
* on CPUs that support BusLockDetect, bit 2 enables BusLockTrap
* and bits 5:3 are reserved-to-zero. Sadly, old KVM allowed
* the guest to set bits 5:2 despite not actually virtualizing
* Performance-Monitoring/Breakpoint external pins. Drop bits
* 5:2 for backwards compatibility.
*/
data &= ~GENMASK(5, 2);

/*
* Suppress BTF as KVM doesn't virtualize BTF, but there's no
* way to communicate lack of support to the guest.
*/
if (data & DEBUGCTLMSR_BTF) {
kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data);
data &= ~DEBUGCTLMSR_BTF;
}

if (data & DEBUGCTL_RESERVED_BITS)
return 1;

@@ -4189,6 +4210,18 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in

guest_state_enter_irqoff();

/*
* Set RFLAGS.IF prior to VMRUN, as the host's RFLAGS.IF at the time of
* VMRUN controls whether or not physical IRQs are masked (KVM always
* runs with V_INTR_MASKING_MASK). Toggle RFLAGS.IF here to avoid the
* temptation to do STI+VMRUN+CLI, as AMD CPUs bleed the STI shadow
* into guest state if delivery of an event during VMRUN triggers a
* #VMEXIT, and the guest_state transitions already tell lockdep that
* IRQs are being enabled/disabled. Note! GIF=0 for the entirety of
* this path, so IRQs aren't actually unmasked while running host code.
*/
raw_local_irq_enable();

amd_clear_divider();

if (sev_es_guest(vcpu->kvm))
@@ -4197,6 +4230,8 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
else
__svm_vcpu_run(svm, spec_ctrl_intercepted);

raw_local_irq_disable();

guest_state_exit_irqoff();
}

@@ -4253,6 +4288,16 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
clgi();
kvm_load_guest_xsave_state(vcpu);

/*
* Hardware only context switches DEBUGCTL if LBR virtualization is
* enabled. Manually load DEBUGCTL if necessary (and restore it after
* VM-Exit), as running with the host's DEBUGCTL can negatively affect
* guest state and can even be fatal, e.g. due to Bus Lock Detect.
*/
if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) &&
vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl)
update_debugctlmsr(svm->vmcb->save.dbgctl);

kvm_wait_lapic_expire(vcpu);

/*
@@ -4280,6 +4325,10 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);

if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) &&
vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl)
update_debugctlmsr(vcpu->arch.host_debugctl);

kvm_load_host_xsave_state(vcpu);
stgi();

2 changes: 1 addition & 1 deletion arch/x86/kvm/svm/svm.h
@@ -584,7 +584,7 @@ static inline bool is_vnmi_enabled(struct vcpu_svm *svm)
/* svm.c */
#define MSR_INVALID 0xffffffffU

#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
#define DEBUGCTL_RESERVED_BITS (~DEBUGCTLMSR_LBR)

extern bool dump_invalid_vmcb;

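
Taken together, the svm_set_msr() hunk and the new DEBUGCTL_RESERVED_BITS definition mean only DEBUGCTL.LBR (bit 0) survives a guest WRMSR: bits 5:2 are dropped for backwards compatibility, BTF (bit 1) is dropped with an "unimplemented" warning, and any other set bit makes the write fault. Below is a standalone sketch of that filtering, with the constants open-coded for illustration rather than pulled from kernel headers:

```c
#include <stdint.h>
#include <stdio.h>

/* Open-coded equivalents of the constants used in the hunks above. */
#define DEBUGCTLMSR_LBR		(1ULL << 0)
#define DEBUGCTLMSR_BTF		(1ULL << 1)
#define GENMASK_5_2		0x3cULL			/* GENMASK(5, 2) */
#define DEBUGCTL_RESERVED_BITS	(~DEBUGCTLMSR_LBR)	/* new definition */

/* Returns -1 for a rejected write (KVM injects #GP), else the value kept. */
static long long set_debugctl(uint64_t data)
{
	data &= ~GENMASK_5_2;		/* legacy "external pins" bits: dropped */
	if (data & DEBUGCTLMSR_BTF)	/* BTF isn't virtualized: dropped */
		data &= ~DEBUGCTLMSR_BTF;	/* (real code also logs a warning) */
	if (data & DEBUGCTL_RESERVED_BITS)
		return -1;
	return (long long)data;
}

int main(void)
{
	printf("0x3e -> %lld\n", set_debugctl(0x3e));	/* BTF + pins: kept as 0 */
	printf("0x01 -> %lld\n", set_debugctl(0x01));	/* LBR: kept as 1 */
	printf("0x40 -> %lld\n", set_debugctl(0x40));	/* bit 6: rejected (-1) */
	return 0;
}
```
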
10 changes: 1 addition & 9 deletions arch/x86/kvm/svm/vmenter.S
@@ -170,12 +170,8 @@ SYM_FUNC_START(__svm_vcpu_run)
mov VCPU_RDI(%_ASM_DI), %_ASM_DI

/* Enter guest mode */
sti

3: vmrun %_ASM_AX
4:
cli

/* Pop @svm to RAX while it's the only available register. */
pop %_ASM_AX

@@ -340,12 +336,8 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
mov KVM_VMCB_pa(%rax), %rax

/* Enter guest mode */
sti

1: vmrun %rax

2: cli

2:
/* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT

8 changes: 2 additions & 6 deletions arch/x86/kvm/vmx/vmx.c
@@ -1514,16 +1514,12 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
*/
void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);

if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
shrink_ple_window(vcpu);

vmx_vcpu_load_vmcs(vcpu, cpu, NULL);

vmx_vcpu_pi_load(vcpu, cpu);

vmx->host_debugctlmsr = get_debugctlmsr();
}

void vmx_vcpu_put(struct kvm_vcpu *vcpu)
@@ -7458,8 +7454,8 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
}

/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
if (vmx->host_debugctlmsr)
update_debugctlmsr(vmx->host_debugctlmsr);
if (vcpu->arch.host_debugctl)
update_debugctlmsr(vcpu->arch.host_debugctl);

#ifndef CONFIG_X86_64
/*
2 changes: 0 additions & 2 deletions arch/x86/kvm/vmx/vmx.h
@@ -340,8 +340,6 @@ struct vcpu_vmx {
/* apic deadline value in host tsc */
u64 hv_deadline_tsc;

unsigned long host_debugctlmsr;

/*
* Only bits masked by msr_ia32_feature_control_valid_bits can be set in
* msr_ia32_feature_control. FEAT_CTL_LOCKED is always included
2 changes: 2 additions & 0 deletions arch/x86/kvm/x86.c
@@ -10968,6 +10968,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
set_debugreg(0, 7);
}

vcpu->arch.host_debugctl = get_debugctlmsr();

guest_timing_enter_irqoff();

for (;;) {
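
Across the VMX, SVM, and x86.c hunks above, the host's DEBUGCTL is now snapshotted once per entry in common code and restored by the vendor modules after VM-Exit: VMX rewrites it whenever the snapshot is nonzero (VM-Exit zeroes the MSR), while SVM swaps it manually only when LBR virtualization is off, since hardware context switches DEBUGCTL only when LBR virtualization is enabled. A condensed, self-contained model of the SVM ordering follows; the helpers are local stubs standing in for the MSR accessors, this is an illustration and not kernel code:

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for the real MSR; stubs instead of rdmsr/wrmsr. */
static uint64_t debugctl_msr = 0x2;		/* e.g. host running with BTF set */
static uint64_t get_debugctlmsr(void)      { return debugctl_msr; }
static void update_debugctlmsr(uint64_t v) { debugctl_msr = v; }

struct vcpu { uint64_t host_debugctl; uint64_t guest_debugctl; bool lbrv; };

static void enter_guest(struct vcpu *v)
{
	/* Common code: snapshot the host value right before entry. */
	v->host_debugctl = get_debugctlmsr();

	/* SVM: hardware only swaps DEBUGCTL when LBR virtualization is on. */
	if (!v->lbrv && v->host_debugctl != v->guest_debugctl)
		update_debugctlmsr(v->guest_debugctl);

	/* ... VMRUN / guest executes here ... */

	if (!v->lbrv && v->host_debugctl != v->guest_debugctl)
		update_debugctlmsr(v->host_debugctl);
}

int main(void)
{
	struct vcpu v = { .guest_debugctl = 0, .lbrv = false };

	enter_guest(&v);
	printf("host DEBUGCTL after exit: %#llx\n",
	       (unsigned long long)debugctl_msr);	/* back to 0x2 */
	return 0;
}
```
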
21 changes: 13 additions & 8 deletions tools/testing/selftests/kvm/mmu_stress_test.c
@@ -18,6 +18,7 @@
#include "ucall_common.h"

static bool mprotect_ro_done;
static bool all_vcpus_hit_ro_fault;

static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride)
{
@@ -36,9 +37,9 @@ static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride)

/*
* Write to the region while mprotect(PROT_READ) is underway. Keep
* looping until the memory is guaranteed to be read-only, otherwise
* vCPUs may complete their writes and advance to the next stage
* prematurely.
* looping until the memory is guaranteed to be read-only and a fault
* has occurred, otherwise vCPUs may complete their writes and advance
* to the next stage prematurely.
*
* For architectures that support skipping the faulting instruction,
* generate the store via inline assembly to ensure the exact length
@@ -56,7 +57,7 @@ static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride)
#else
vcpu_arch_put_guest(*((volatile uint64_t *)gpa), gpa);
#endif
} while (!READ_ONCE(mprotect_ro_done));
} while (!READ_ONCE(mprotect_ro_done) || !READ_ONCE(all_vcpus_hit_ro_fault));

/*
* Only architectures that write the entire range can explicitly sync,
@@ -81,6 +82,7 @@ struct vcpu_info {

static int nr_vcpus;
static atomic_t rendezvous;
static atomic_t nr_ro_faults;

static void rendezvous_with_boss(void)
{
@@ -148,12 +150,16 @@ static void *vcpu_worker(void *data)
* be stuck on the faulting instruction for other architectures. Go to
* stage 3 without a rendezvous
*/
do {
r = _vcpu_run(vcpu);
} while (!r);
r = _vcpu_run(vcpu);
TEST_ASSERT(r == -1 && errno == EFAULT,
"Expected EFAULT on write to RO memory, got r = %d, errno = %d", r, errno);

atomic_inc(&nr_ro_faults);
if (atomic_read(&nr_ro_faults) == nr_vcpus) {
WRITE_ONCE(all_vcpus_hit_ro_fault, true);
sync_global_to_guest(vm, all_vcpus_hit_ro_fault);
}

#if defined(__x86_64__) || defined(__aarch64__)
/*
* Verify *all* writes from the guest hit EFAULT due to the VMA now
@@ -378,7 +384,6 @@ int main(int argc, char *argv[])
rendezvous_with_vcpus(&time_run2, "run 2");

mprotect(mem, slot_size, PROT_READ);
usleep(10);
mprotect_ro_done = true;
sync_global_to_guest(vm, mprotect_ro_done);

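The mmu_stress_test fix relies on a "last worker in flips the flag" rendezvous: each vCPU worker asserts that it took the expected EFAULT, bumps an atomic counter, and the final worker publishes all_vcpus_hit_ro_fault to the guest, which keeps writing until both the mprotect() and at least one fault per vCPU are guaranteed. A host-only pthread sketch of the same counting pattern (illustrative names, not the selftest's code):

```c
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_WORKERS 4

static atomic_int nr_faults;
static atomic_bool all_workers_faulted;

static void *worker(void *arg)
{
	(void)arg;

	/* ...do the work that is expected to fault exactly once... */

	/* Count this worker; the last one to arrive publishes the flag. */
	if (atomic_fetch_add(&nr_faults, 1) + 1 == NR_WORKERS)
		atomic_store(&all_workers_faulted, true);
	return NULL;
}

int main(void)
{
	pthread_t threads[NR_WORKERS];

	for (int i = 0; i < NR_WORKERS; i++)
		pthread_create(&threads[i], NULL, worker, NULL);
	for (int i = 0; i < NR_WORKERS; i++)
		pthread_join(threads[i], NULL);

	/* A polling loop elsewhere would spin until this reads true. */
	printf("all workers faulted: %d\n", atomic_load(&all_workers_faulted));
	return 0;
}
```
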
2 changes: 2 additions & 0 deletions tools/testing/selftests/kvm/x86/nested_exceptions_test.c
@@ -85,6 +85,7 @@ static void svm_run_l2(struct svm_test_data *svm, void *l2_code, int vector,

GUEST_ASSERT_EQ(ctrl->exit_code, (SVM_EXIT_EXCP_BASE + vector));
GUEST_ASSERT_EQ(ctrl->exit_info_1, error_code);
GUEST_ASSERT(!ctrl->int_state);
}

static void l1_svm_code(struct svm_test_data *svm)
@@ -122,6 +123,7 @@ static void vmx_run_l2(void *l2_code, int vector, uint32_t error_code)
GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_EXCEPTION_NMI);
GUEST_ASSERT_EQ((vmreadz(VM_EXIT_INTR_INFO) & 0xff), vector);
GUEST_ASSERT_EQ(vmreadz(VM_EXIT_INTR_ERROR_CODE), error_code);
GUEST_ASSERT(!vmreadz(GUEST_INTERRUPTIBILITY_INFO));
}

static void l1_vmx_code(struct vmx_pages *vmx)
3 changes: 2 additions & 1 deletion tools/testing/selftests/kvm/x86/sev_smoke_test.c
@@ -52,7 +52,8 @@ static void compare_xsave(u8 *from_host, u8 *from_guest)
bool bad = false;
for (i = 0; i < 4095; i++) {
if (from_host[i] != from_guest[i]) {
printf("mismatch at %02hhx | %02hhx %02hhx\n", i, from_host[i], from_guest[i]);
printf("mismatch at %u | %02hhx %02hhx\n",
i, from_host[i], from_guest[i]);
bad = true;
}
}
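
For the record, the sev_smoke_test hunk addresses a format-checking problem: passing the int loop index to "%02hhx" trips -Wformat on common toolchains, and -Werror turns that into the reported build failure. A tiny standalone illustration (hypothetical file, not the selftest):

```c
/* format_demo.c - build with: gcc -Wall -Werror format_demo.c */
#include <stdio.h>

int main(void)
{
	unsigned char host = 0xab, guest = 0xcd;
	int i = 42;

	/*
	 * Old form: "%02hhx" expects an unsigned char, so passing the int
	 * index 'i' trips -Wformat, which -Werror promotes to a hard error:
	 *
	 *	printf("mismatch at %02hhx | %02hhx %02hhx\n", i, host, guest);
	 */

	/* Fixed form: print the index as a plain unsigned decimal. */
	printf("mismatch at %u | %02hhx %02hhx\n", i, host, guest);
	return 0;
}
```
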
