Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM fixes from Paolo Bonzini:
 "arm64:

   - Fix a couple of bugs affecting pKVM's PSCI relay implementation
     when running in hVHE mode, resulting in the host being entered
     with the MMU in an unknown state, and EL2 being in the wrong mode

  x86:

   - Set RFLAGS.IF in C code on SVM to get VMRUN out of the STI shadow

   - Ensure DEBUGCTL is context switched on AMD to avoid running the
     guest with the host's value, which can lead to unexpected bus lock
     #DBs

   - Suppress DEBUGCTL.BTF on AMD (to match Intel), as KVM doesn't
     properly emulate BTF. KVM's lack of context switching has meant BTF
     has always been broken to some extent

   - Always save DR masks for SNP vCPUs if DebugSwap is *supported*, as
     the guest can enable DebugSwap without KVM's knowledge

   - Fix a bug in mmu_stress_test where a vCPU could finish the "writes
     to RO memory" phase without actually generating a write-protection
     fault

   - Fix a printf() goof in the SEV smoke test that causes build
     failures with -Werror

   - Explicitly zero EAX and EBX in CPUID.0x8000_0022 output when
     PERFMON_V2 isn't supported by KVM"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  KVM: x86: Explicitly zero EAX and EBX when PERFMON_V2 isn't supported by KVM
  KVM: selftests: Fix printf() format goof in SEV smoke test
  KVM: selftests: Ensure all vCPUs hit -EFAULT during initial RO stage
  KVM: SVM: Don't rely on DebugSwap to restore host DR0..DR3
  KVM: SVM: Save host DR masks on CPUs with DebugSwap
  KVM: arm64: Initialize SCTLR_EL1 in __kvm_hyp_init_cpu()
  KVM: arm64: Initialize HCR_EL2.E2H early
  KVM: x86: Snapshot the host's DEBUGCTL after disabling IRQs
  KVM: SVM: Manually context switch DEBUGCTL if LBR virtualization is disabled
  KVM: x86: Snapshot the host's DEBUGCTL in common x86
  KVM: SVM: Suppress DEBUGCTL.BTF on AMD
  KVM: SVM: Drop DEBUGCTL[5:2] from guest's effective value
  KVM: selftests: Assert that STI blocking isn't set after event injection
  KVM: SVM: Set RFLAGS.IF=1 in C code, to get VMRUN out of the STI shadow
Linus Torvalds committed Mar 9, 2025
2 parents 1110ce6 + ea9bd29 commit a382b06
Showing 16 changed files with 130 additions and 62 deletions.
31 changes: 26 additions & 5 deletions arch/arm64/include/asm/el2_setup.h
@@ -16,6 +16,32 @@
 #include <asm/sysreg.h>
 #include <linux/irqchip/arm-gic-v3.h>
 
+.macro init_el2_hcr	val
+	mov_q	x0, \val
+
+	/*
+	 * Compliant CPUs advertise their VHE-onlyness with
+	 * ID_AA64MMFR4_EL1.E2H0 < 0. On such CPUs HCR_EL2.E2H is RES1, but it
+	 * can reset into an UNKNOWN state and might not read as 1 until it has
+	 * been initialized explicitly.
+	 *
+	 * Fruity CPUs seem to have HCR_EL2.E2H set to RAO/WI, but
+	 * don't advertise it (they predate this relaxation).
+	 *
+	 * Initialize HCR_EL2.E2H so that later code can rely upon HCR_EL2.E2H
+	 * indicating whether the CPU is running in E2H mode.
+	 */
+	mrs_s	x1, SYS_ID_AA64MMFR4_EL1
+	sbfx	x1, x1, #ID_AA64MMFR4_EL1_E2H0_SHIFT, #ID_AA64MMFR4_EL1_E2H0_WIDTH
+	cmp	x1, #0
+	b.ge	.LnVHE_\@
+
+	orr	x0, x0, #HCR_E2H
+.LnVHE_\@:
+	msr	hcr_el2, x0
+	isb
+.endm
+
 .macro __init_el2_sctlr
 	mov_q	x0, INIT_SCTLR_EL2_MMU_OFF
 	msr	sctlr_el2, x0
@@ -244,11 +270,6 @@
 .Lskip_gcs_\@:
 .endm
 
-.macro __init_el2_nvhe_prepare_eret
-	mov	x0, #INIT_PSTATE_EL1
-	msr	spsr_el2, x0
-.endm
-
 .macro __init_el2_mpam
 	/* Memory Partitioning And Monitoring: disable EL2 traps */
 	mrs	x1, id_aa64pfr0_el1
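For reference, the C-level equivalent of the `sbfx`/`cmp`/`b.ge` sequence above — a minimal sketch, assuming E2H0 sits at bits [27:24] (the kernel's generated `ID_AA64MMFR4_EL1_E2H0_*` constants are authoritative):

```c
#include <stdbool.h>
#include <stdint.h>

/* Assumed field placement, for illustration only; real code uses the
 * generated ID_AA64MMFR4_EL1_E2H0_SHIFT/WIDTH definitions. */
#define E2H0_SHIFT 24
#define E2H0_WIDTH 4

/* Mirrors: sbfx x1, x1, #SHIFT, #WIDTH; cmp x1, #0; b.ge .LnVHE_\@ */
static bool cpu_is_vhe_only(uint64_t mmfr4)
{
	/* The arithmetic right shift sign-extends the 4-bit field,
	 * exactly as sbfx does. */
	int64_t e2h0 = (int64_t)(mmfr4 << (64 - E2H0_SHIFT - E2H0_WIDTH))
			>> (64 - E2H0_WIDTH);

	/* Negative E2H0 => EL2 is VHE-only, HCR_EL2.E2H behaves as RES1. */
	return e2h0 < 0;
}
```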
22 changes: 3 additions & 19 deletions arch/arm64/kernel/head.S
@@ -298,25 +298,8 @@ SYM_INNER_LABEL(init_el2, SYM_L_LOCAL)
 	msr	sctlr_el2, x0
 	isb
 0:
-	mov_q	x0, HCR_HOST_NVHE_FLAGS
-
-	/*
-	 * Compliant CPUs advertise their VHE-onlyness with
-	 * ID_AA64MMFR4_EL1.E2H0 < 0. HCR_EL2.E2H can be
-	 * RES1 in that case. Publish the E2H bit early so that
-	 * it can be picked up by the init_el2_state macro.
-	 *
-	 * Fruity CPUs seem to have HCR_EL2.E2H set to RAO/WI, but
-	 * don't advertise it (they predate this relaxation).
-	 */
-	mrs_s	x1, SYS_ID_AA64MMFR4_EL1
-	tbz	x1, #(ID_AA64MMFR4_EL1_E2H0_SHIFT + ID_AA64MMFR4_EL1_E2H0_WIDTH - 1), 1f
-
-	orr	x0, x0, #HCR_E2H
-1:
-	msr	hcr_el2, x0
-	isb
-
+	init_el2_hcr	HCR_HOST_NVHE_FLAGS
 	init_el2_state
 
 	/* Hypervisor stub */
@@ -339,7 +322,8 @@ SYM_INNER_LABEL(init_el2, SYM_L_LOCAL)
 	msr	sctlr_el1, x1
 	mov	x2, xzr
 3:
-	__init_el2_nvhe_prepare_eret
+	mov	x0, #INIT_PSTATE_EL1
+	msr	spsr_el2, x0
 
 	mov	w0, #BOOT_CPU_MODE_EL2
 	orr	x0, x0, x2
10 changes: 7 additions & 3 deletions arch/arm64/kvm/hyp/nvhe/hyp-init.S
@@ -73,8 +73,12 @@ __do_hyp_init:
 	eret
 SYM_CODE_END(__kvm_hyp_init)
 
+/*
+ * Initialize EL2 CPU state to sane values.
+ *
+ * HCR_EL2.E2H must have been initialized already.
+ */
 SYM_CODE_START_LOCAL(__kvm_init_el2_state)
-	/* Initialize EL2 CPU state to sane values. */
 	init_el2_state		// Clobbers x0..x2
 	finalise_el2_state
 	ret
@@ -206,9 +210,9 @@ SYM_CODE_START_LOCAL(__kvm_hyp_init_cpu)

 2:	msr	SPsel, #1		// We want to use SP_EL{1,2}
 
-	bl	__kvm_init_el2_state
+	init_el2_hcr	0
 
-	__init_el2_nvhe_prepare_eret
+	bl	__kvm_init_el2_state
 
 	/* Enable MMU, set vectors and stack. */
 	mov	x0, x28
3 changes: 3 additions & 0 deletions arch/arm64/kvm/hyp/nvhe/psci-relay.c
@@ -218,6 +218,9 @@ asmlinkage void __noreturn __kvm_host_psci_cpu_entry(bool is_cpu_on)
 	if (is_cpu_on)
 		release_boot_args(boot_args);
 
+	write_sysreg_el1(INIT_SCTLR_EL1_MMU_OFF, SYS_SCTLR);
+	write_sysreg(INIT_PSTATE_EL1, SPSR_EL2);
+
 	__host_enter(host_ctxt);
 }

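A hedged aside on why both writes matter: `__host_enter()` ends in an `ERET` whose target mode and PSTATE come from `SPSR_EL2`, and on a freshly powered-on CPU both registers may hold UNKNOWN reset values. The sketch below is illustrative only — the helper name is hypothetical, and the note that the EL1 accessor resolves to the `_EL12` encoding under hVHE is an assumption of this sketch:

```c
/* Hypothetical consolidation of the two writes above, making the
 * invariant explicit: everything the upcoming ERET consumes must be
 * written, never assumed. */
static void prepare_host_reentry(void)
{
	/* Put the host's view of SCTLR into a known state (MMU off),
	 * fixing "host entered with the MMU in an unknown state".
	 * Assumption: under hVHE this resolves to the _EL12 encoding. */
	write_sysreg_el1(INIT_SCTLR_EL1_MMU_OFF, SYS_SCTLR);

	/* PSTATE that the ERET restores; without this, EL2 could be
	 * entered in the wrong mode after a PSCI CPU_ON/CPU_SUSPEND
	 * wakeup. */
	write_sysreg(INIT_PSTATE_EL1, SPSR_EL2);
}
```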
1 change: 1 addition & 0 deletions arch/x86/include/asm/kvm_host.h
@@ -780,6 +780,7 @@ struct kvm_vcpu_arch {
 	u32 pkru;
 	u32 hflags;
 	u64 efer;
+	u64 host_debugctl;
 	u64 apic_base;
 	struct kvm_lapic *apic;    /* kernel irqchip context */
 	bool load_eoi_exitmap_pending;
2 changes: 1 addition & 1 deletion arch/x86/kvm/cpuid.c
@@ -1763,7 +1763,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)

 	entry->ecx = entry->edx = 0;
 	if (!enable_pmu || !kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) {
-		entry->eax = entry->ebx;
+		entry->eax = entry->ebx = 0;
 		break;
 	}

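To see why the one-character fix matters from the guest side, here is a hedged probe of the affected leaf; the helper is hypothetical and `cpuid()` stands in for any raw CPUID wrapper:

```c
#include <stdint.h>

/* Hypothetical guest-side probe of CPUID.0x8000_0022 (AMD Extended
 * Performance Monitoring and Debug). */
static int guest_has_perfmon_v2(void)
{
	uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0;

	cpuid(0x80000022, &eax, &ebx, &ecx, &edx);

	/* With the fix, EAX and EBX are explicitly zeroed when KVM
	 * doesn't expose PerfMonV2. Before it, EAX inherited EBX's
	 * stale value ("entry->eax = entry->ebx" instead of "= 0"),
	 * so a guest could see phantom capabilities here. */
	return eax & 1;	/* EAX[0] = PerfMonV2 */
}
```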
24 changes: 17 additions & 7 deletions arch/x86/kvm/svm/sev.c
@@ -4590,6 +4590,8 @@ void sev_es_vcpu_reset(struct vcpu_svm *svm)

 void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa)
 {
+	struct kvm *kvm = svm->vcpu.kvm;
+
 	/*
 	 * All host state for SEV-ES guests is categorized into three swap types
 	 * based on how it is handled by hardware during a world switch:
@@ -4613,14 +4615,22 @@

 	/*
 	 * If DebugSwap is enabled, debug registers are loaded but NOT saved by
-	 * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU both
-	 * saves and loads debug registers (Type-A).
+	 * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU does
+	 * not save or load debug registers. Sadly, KVM can't prevent SNP
+	 * guests from lying about DebugSwap on secondary vCPUs, i.e. the
+	 * SEV_FEATURES provided at "AP Create" isn't guaranteed to match what
+	 * the guest has actually enabled (or not!) in the VMSA.
+	 *
+	 * If DebugSwap is *possible*, save the masks so that they're restored
+	 * if the guest enables DebugSwap. But for the DRs themselves, do NOT
+	 * rely on the CPU to restore the host values; KVM will restore them as
+	 * needed in common code, via hw_breakpoint_restore(). Note, KVM does
+	 * NOT support virtualizing Breakpoint Extensions, i.e. the mask MSRs
+	 * don't need to be restored per se, KVM just needs to ensure they are
+	 * loaded with the correct values *if* the CPU writes the MSRs.
 	 */
-	if (sev_vcpu_has_debug_swap(svm)) {
-		hostsa->dr0 = native_get_debugreg(0);
-		hostsa->dr1 = native_get_debugreg(1);
-		hostsa->dr2 = native_get_debugreg(2);
-		hostsa->dr3 = native_get_debugreg(3);
+	if (sev_vcpu_has_debug_swap(svm) ||
+	    (sev_snp_guest(kvm) && cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP))) {
 		hostsa->dr0_addr_mask = amd_get_dr_addr_mask(0);
 		hostsa->dr1_addr_mask = amd_get_dr_addr_mask(1);
 		hostsa->dr2_addr_mask = amd_get_dr_addr_mask(2);
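Condensed into a predicate, the new condition reads as below. This is a restatement of the hunk above, not new kernel code; the helper name is invented for illustration:

```c
/* Whether host DR0..DR3 address masks must be stashed in the host save
 * area before VMRUN. */
static bool host_dr_masks_need_saving(struct vcpu_svm *svm)
{
	/* Non-SNP: KVM's view of the vCPU's SEV_FEATURES is reliable. */
	if (sev_vcpu_has_debug_swap(svm))
		return true;

	/* SNP: the VMSA is guest-owned, so a secondary vCPU can enable
	 * DebugSwap without KVM's knowledge. Key off *hardware* support
	 * instead of the untrustworthy per-vCPU state. */
	return sev_snp_guest(svm->vcpu.kvm) &&
	       cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP);
}
```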
49 changes: 49 additions & 0 deletions arch/x86/kvm/svm/svm.c
@@ -3165,6 +3165,27 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
 		kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
 		break;
 	}
+
+	/*
+	 * AMD changed the architectural behavior of bits 5:2. On CPUs
+	 * without BusLockTrap, bits 5:2 control "external pins", but
+	 * on CPUs that support BusLockDetect, bit 2 enables BusLockTrap
+	 * and bits 5:3 are reserved-to-zero. Sadly, old KVM allowed
+	 * the guest to set bits 5:2 despite not actually virtualizing
+	 * Performance-Monitoring/Breakpoint external pins. Drop bits
+	 * 5:2 for backwards compatibility.
+	 */
+	data &= ~GENMASK(5, 2);
+
+	/*
+	 * Suppress BTF as KVM doesn't virtualize BTF, but there's no
+	 * way to communicate lack of support to the guest.
+	 */
+	if (data & DEBUGCTLMSR_BTF) {
+		kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data);
+		data &= ~DEBUGCTLMSR_BTF;
+	}
+
 	if (data & DEBUGCTL_RESERVED_BITS)
 		return 1;
@@ -4189,6 +4210,18 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_intercepted)

 	guest_state_enter_irqoff();
 
+	/*
+	 * Set RFLAGS.IF prior to VMRUN, as the host's RFLAGS.IF at the time of
+	 * VMRUN controls whether or not physical IRQs are masked (KVM always
+	 * runs with V_INTR_MASKING_MASK). Toggle RFLAGS.IF here to avoid the
+	 * temptation to do STI+VMRUN+CLI, as AMD CPUs bleed the STI shadow
+	 * into guest state if delivery of an event during VMRUN triggers a
+	 * #VMEXIT, and the guest_state transitions already tell lockdep that
+	 * IRQs are being enabled/disabled. Note! GIF=0 for the entirety of
+	 * this path, so IRQs aren't actually unmasked while running host code.
+	 */
+	raw_local_irq_enable();
+
 	amd_clear_divider();
 
 	if (sev_es_guest(vcpu->kvm))
@@ -4197,6 +4230,8 @@
 	else
 		__svm_vcpu_run(svm, spec_ctrl_intercepted);
 
+	raw_local_irq_disable();
+
 	guest_state_exit_irqoff();
 }

@@ -4253,6 +4288,16 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
 	clgi();
 	kvm_load_guest_xsave_state(vcpu);
 
+	/*
+	 * Hardware only context switches DEBUGCTL if LBR virtualization is
+	 * enabled. Manually load DEBUGCTL if necessary (and restore it after
+	 * VM-Exit), as running with the host's DEBUGCTL can negatively affect
+	 * guest state and can even be fatal, e.g. due to Bus Lock Detect.
+	 */
+	if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) &&
+	    vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl)
+		update_debugctlmsr(svm->vmcb->save.dbgctl);
+
 	kvm_wait_lapic_expire(vcpu);
 
 	/*
@@ -4280,6 +4325,10 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
 	if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
 		kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
 
+	if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) &&
+	    vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl)
+		update_debugctlmsr(vcpu->arch.host_debugctl);
+
 	kvm_load_host_xsave_state(vcpu);
 	stgi();
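Read together with the vmenter.S hunk further down, the entry path now follows the shape below — a condensed sketch of the final control flow, not a verbatim excerpt:

```c
/* Sketch: GIF=0 for this entire window (clgi() was called earlier in
 * svm_vcpu_run()), so raising RFLAGS.IF never unmasks IRQs on the
 * host; it only determines whether physical IRQs can be taken while
 * the guest runs, since KVM runs with V_INTR_MASKING=1. */
guest_state_enter_irqoff();
raw_local_irq_enable();		/* IF=1 well before VMRUN: no STI shadow */
if (sev_es_guest(vcpu->kvm))
	__svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted);
else
	__svm_vcpu_run(svm, spec_ctrl_intercepted);  /* bare VMRUN, no STI/CLI */
raw_local_irq_disable();	/* IF=0 again before any host work */
guest_state_exit_irqoff();
```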
2 changes: 1 addition & 1 deletion arch/x86/kvm/svm/svm.h
@@ -584,7 +584,7 @@ static inline bool is_vnmi_enabled(struct vcpu_svm *svm)
 /* svm.c */
 #define MSR_INVALID			0xffffffffU
 
-#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
+#define DEBUGCTL_RESERVED_BITS (~DEBUGCTLMSR_LBR)
 
 extern bool dump_invalid_vmcb;
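With that definition, the guest-visible WRMSR behavior for `MSR_IA32_DEBUGCTLMSR` collapses to the sketch below — a summary of the svm.c hunk above plus this header change, assuming no DEBUGCTL bits other than LBR are virtualized:

```c
/* Sketch: effective sanitization of a guest WRMSR to DEBUGCTL. */
data &= ~GENMASK(5, 2);			/* legacy "external pins": silently dropped */
if (data & DEBUGCTLMSR_BTF) {
	kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data);
	data &= ~DEBUGCTLMSR_BTF;	/* BTF: warn, then suppress */
}
if (data & ~DEBUGCTLMSR_LBR)		/* == DEBUGCTL_RESERVED_BITS */
	return 1;			/* anything but LBR: inject #GP */
```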
10 changes: 1 addition & 9 deletions arch/x86/kvm/svm/vmenter.S
@@ -170,12 +170,8 @@ SYM_FUNC_START(__svm_vcpu_run)
 	mov VCPU_RDI(%_ASM_DI), %_ASM_DI
 
 	/* Enter guest mode */
-	sti
-
 3:	vmrun %_ASM_AX
 4:
-	cli
-
 	/* Pop @svm to RAX while it's the only available register. */
 	pop %_ASM_AX
@@ -340,12 +336,8 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
 	mov KVM_VMCB_pa(%rax), %rax
 
 	/* Enter guest mode */
-	sti
-
 1:	vmrun %rax
-
-2:	cli
-
+2:
 	/* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
 	FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT
8 changes: 2 additions & 6 deletions arch/x86/kvm/vmx/vmx.c
@@ -1514,16 +1514,12 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, struct loaded_vmcs *buddy)
  */
 void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-
 	if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
 		shrink_ple_window(vcpu);
 
 	vmx_vcpu_load_vmcs(vcpu, cpu, NULL);
 
 	vmx_vcpu_pi_load(vcpu, cpu);
-
-	vmx->host_debugctlmsr = get_debugctlmsr();
 }
 
 void vmx_vcpu_put(struct kvm_vcpu *vcpu)
@@ -7458,8 +7454,8 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
 	}
 
 	/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
-	if (vmx->host_debugctlmsr)
-		update_debugctlmsr(vmx->host_debugctlmsr);
+	if (vcpu->arch.host_debugctl)
+		update_debugctlmsr(vcpu->arch.host_debugctl);
 
 #ifndef CONFIG_X86_64
 	/*
2 changes: 0 additions & 2 deletions arch/x86/kvm/vmx/vmx.h
@@ -340,8 +340,6 @@ struct vcpu_vmx {
 	/* apic deadline value in host tsc */
 	u64 hv_deadline_tsc;
 
-	unsigned long host_debugctlmsr;
-
 	/*
 	 * Only bits masked by msr_ia32_feature_control_valid_bits can be set in
 	 * msr_ia32_feature_control. FEAT_CTL_LOCKED is always included
2 changes: 2 additions & 0 deletions arch/x86/kvm/x86.c
@@ -10968,6 +10968,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		set_debugreg(0, 7);
 	}
 
+	vcpu->arch.host_debugctl = get_debugctlmsr();
+
 	guest_timing_enter_irqoff();
 
 	for (;;) {
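This is the other half of moving `host_debugctl` out of `struct vcpu_vmx`: the snapshot now happens in common x86 code with IRQs disabled, because between `vcpu_load()` and VM-Enter the host can rewrite DEBUGCTL (e.g. perf enabling LBRs from IRQ context). A sketch of the resulting ordering, with unrelated entry work elided:

```c
/* Sketch of vcpu_enter_guest() ordering, not a complete excerpt. IRQs
 * are already disabled at this point, so the MSR cannot change between
 * the snapshot and VM-Enter, and vendor code (VMX or SVM) restores
 * exactly this value after VM-Exit. */
local_irq_disable();
/* ... request processing, debug register setup ... */
vcpu->arch.host_debugctl = get_debugctlmsr();	/* race-free snapshot */
guest_timing_enter_irqoff();
/* ... static_call(kvm_x86_vcpu_run)() ... */
```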