Merge tag 'kvm-x86-fixes-6.15-rcN' of https://github.com/kvm-x86/linux into HEAD

KVM x86 fixes for 6.15-rcN

 - Forcibly leave SMM on SHUTDOWN interception on AMD CPUs to avoid causing
   problems due to KVM stuffing INIT on SHUTDOWN (KVM needs to sanitize the
   VMCB as its state is undefined after SHUTDOWN; emulating INIT is the
   least awful choice).

 - Track the valid sync/dirty fields in kvm_run as a u64 to ensure KVM
   doesn't goof a sanity check in the future (see the sketch following
   this list).

 - Free obsolete roots when (re)loading the MMU to fix a bug where
   pre-faulting memory can get stuck due to always encountering a stale
   root.

 - When dumping GHCB state, use KVM's snapshot instead of the raw GHCB page
   to print state, so that KVM doesn't print stale/wrong information.

 - When changing memory attributes (e.g. shared <=> private), add potential
   hugepage ranges to the mmu_invalidate_range_{start,end} set so that KVM
   doesn't create a shared/private hugepage when the corresponding
   attributes will become mixed (the attributes are committed *after* KVM
   finishes the invalidation).

 - Rework the SRSO mitigation to enable BP_SPEC_REDUCE only when KVM has at
   least one active VM.  Enabling BP_SPEC_REDUCE whenever KVM is loaded led
   to very measurable performance regressions for non-KVM workloads.
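
A minimal, hypothetical sketch (plain userspace C, not KVM code; the MY_SYNC_* names are invented) of why the second bullet widens the valid-fields mask to a u64: if a future kvm_run sync/dirty flag lands above bit 31 while the mask stays u32, the mask silently truncates and the sanity check against the guest-dirtied fields stops meaning what it says.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical per-arch sync flags; imagine a future flag landing at bit 32. */
#define MY_SYNC_REGS            (1ULL << 0)
#define MY_SYNC_SREGS           (1ULL << 1)
#define MY_SYNC_FUTURE_FLAG     (1ULL << 32)

#define MY_SYNC_VALID_FIELDS    (MY_SYNC_REGS | MY_SYNC_SREGS | MY_SYNC_FUTURE_FLAG)

int main(void)
{
    uint32_t valid32 = (uint32_t)MY_SYNC_VALID_FIELDS;  /* truncation drops bit 32 */
    uint64_t valid64 = MY_SYNC_VALID_FIELDS;

    uint64_t dirty = MY_SYNC_FUTURE_FLAG;  /* userspace dirtied the new field */

    /* With the truncated u32 mask, a legitimate flag above bit 31 looks
     * invalid (non-zero result), so the sanity check misfires. */
    printf("u32 mask: dirty & ~valid = %#llx\n",
           (unsigned long long)(dirty & ~(uint64_t)valid32));

    /* With a u64 mask the check keeps working as intended (zero result). */
    printf("u64 mask: dirty & ~valid = %#llx\n",
           (unsigned long long)(dirty & ~valid64));
    return 0;
}
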
Paolo Bonzini committed May 10, 2025
2 parents 36867c0 + e3417ab commit add2032
Showing 7 changed files with 150 additions and 37 deletions.
3 changes: 3 additions & 0 deletions arch/x86/kvm/mmu.h
@@ -104,6 +104,9 @@ void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,

static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
{
if (kvm_check_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
kvm_mmu_free_obsolete_roots(vcpu);

/*
* Checking root.hpa is sufficient even when KVM has mirror root.
* We can have either:
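
A rough userspace-style sketch of the pattern the mmu.h hunk adds (all toy_* names are invented; this is not KVM's real layout): consume a pending "free obsolete roots" request before deciding whether the cached root can be reused, so a stale root is dropped and reloaded instead of being reused forever, which is the failure mode behind the stuck pre-faulting described in the commit message.

#include <stdbool.h>
#include <stdint.h>

#define REQ_FREE_OBSOLETE_ROOTS (1u << 0)   /* hypothetical request bit */

struct toy_vcpu {
    uint32_t requests;           /* pending request bits */
    uint64_t root;               /* 0 means "no valid root" */
    uint64_t root_generation;
    uint64_t current_generation;
};

/* Test-and-clear a pending request, mirroring the kvm_check_request() idea. */
static bool check_request(struct toy_vcpu *v, uint32_t req)
{
    if (!(v->requests & req))
        return false;
    v->requests &= ~req;
    return true;
}

static void free_obsolete_root(struct toy_vcpu *v)
{
    if (v->root && v->root_generation != v->current_generation)
        v->root = 0;   /* drop the stale root */
}

static int toy_mmu_load(struct toy_vcpu *v)
{
    v->root = 0x1000;   /* pretend a fresh root was allocated */
    v->root_generation = v->current_generation;
    return 0;
}

/* Without the check_request() step, a stale-but-nonzero root would be
 * reused here indefinitely instead of being freed and reloaded. */
static int toy_mmu_reload(struct toy_vcpu *v)
{
    if (check_request(v, REQ_FREE_OBSOLETE_ROOTS))
        free_obsolete_root(v);

    if (v->root)
        return 0;
    return toy_mmu_load(v);
}

int main(void)
{
    struct toy_vcpu v = { .requests = REQ_FREE_OBSOLETE_ROOTS,
                          .root = 0x2000, .root_generation = 1,
                          .current_generation = 2 };

    toy_mmu_reload(&v);   /* drops the stale root, then loads a fresh one */
    return 0;
}
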
70 changes: 54 additions & 16 deletions arch/x86/kvm/mmu/mmu.c
@@ -5974,6 +5974,7 @@ void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu)
__kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.root_mmu);
__kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.guest_mmu);
}
EXPORT_SYMBOL_GPL(kvm_mmu_free_obsolete_roots);

static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
int *bytes)
@@ -7669,9 +7670,30 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
}

#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
int level)
{
return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG;
}

static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
int level)
{
lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG;
}

static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
int level)
{
lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG;
}

bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
struct kvm_gfn_range *range)
{
struct kvm_memory_slot *slot = range->slot;
int level;

/*
* Zap SPTEs even if the slot can't be mapped PRIVATE. KVM x86 only
* supports KVM_MEMORY_ATTRIBUTE_PRIVATE, and so it *seems* like KVM
@@ -7686,6 +7708,38 @@ bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm)))
return false;

if (WARN_ON_ONCE(range->end <= range->start))
return false;

/*
* If the head and tail pages of the range currently allow a hugepage,
* i.e. reside fully in the slot and don't have mixed attributes, then
* add each corresponding hugepage range to the ongoing invalidation,
* e.g. to prevent KVM from creating a hugepage in response to a fault
* for a gfn whose attributes aren't changing. Note, only the range
* of gfns whose attributes are being modified needs to be explicitly
* unmapped, as that will unmap any existing hugepages.
*/
for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
gfn_t start = gfn_round_for_level(range->start, level);
gfn_t end = gfn_round_for_level(range->end - 1, level);
gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level);

if ((start != range->start || start + nr_pages > range->end) &&
start >= slot->base_gfn &&
start + nr_pages <= slot->base_gfn + slot->npages &&
!hugepage_test_mixed(slot, start, level))
kvm_mmu_invalidate_range_add(kvm, start, start + nr_pages);

if (end == start)
continue;

if ((end + nr_pages) > range->end &&
(end + nr_pages) <= (slot->base_gfn + slot->npages) &&
!hugepage_test_mixed(slot, end, level))
kvm_mmu_invalidate_range_add(kvm, end, end + nr_pages);
}

/* Unmap the old attribute page. */
if (range->arg.attributes & KVM_MEMORY_ATTRIBUTE_PRIVATE)
range->attr_filter = KVM_FILTER_SHARED;
@@ -7695,23 +7749,7 @@ bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
return kvm_unmap_gfn_range(kvm, range);
}

static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
int level)
{
return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG;
}

static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
int level)
{
lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG;
}

static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
int level)
{
lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG;
}

static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot,
gfn_t gfn, int level, unsigned long attrs)
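
As a concrete illustration of the head/tail logic in the mmu.c hunk above (toy GFN numbers, 2MiB level only, ignoring the slot-bounds and mixed-attribute checks the real code also performs): rounding the first and last GFN of the range down to a 512-page boundary identifies the hugepages the range only partially covers; those are the ranges added to the ongoing invalidation.

#include <stdint.h>
#include <stdio.h>

#define PAGES_PER_2M 512ULL   /* KVM_PAGES_PER_HPAGE(PG_LEVEL_2M) on x86 */

static uint64_t round_down_2m(uint64_t gfn)
{
    return gfn & ~(PAGES_PER_2M - 1);   /* gfn_round_for_level() equivalent */
}

int main(void)
{
    /* Hypothetical attribute change covering GFNs [0x212, 0x7f0). */
    uint64_t start = 0x212, end = 0x7f0;

    uint64_t head = round_down_2m(start);
    uint64_t tail = round_down_2m(end - 1);

    /* Head hugepage [0x200, 0x400) is only partially covered by the range... */
    if (head != start || head + PAGES_PER_2M > end)
        printf("invalidate head range [%#llx, %#llx)\n",
               (unsigned long long)head,
               (unsigned long long)(head + PAGES_PER_2M));

    /* ...and so is the tail hugepage [0x600, 0x800). */
    if (tail != head && tail + PAGES_PER_2M > end)
        printf("invalidate tail range [%#llx, %#llx)\n",
               (unsigned long long)tail,
               (unsigned long long)(tail + PAGES_PER_2M));
    return 0;
}
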
1 change: 1 addition & 0 deletions arch/x86/kvm/smm.c
@@ -131,6 +131,7 @@ void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm)

kvm_mmu_reset_context(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_smm_changed);

void process_smi(struct kvm_vcpu *vcpu)
{
32 changes: 19 additions & 13 deletions arch/x86/kvm/svm/sev.c
@@ -3173,9 +3173,14 @@ void sev_free_vcpu(struct kvm_vcpu *vcpu)
kvfree(svm->sev_es.ghcb_sa);
}

static u64 kvm_ghcb_get_sw_exit_code(struct vmcb_control_area *control)
{
return (((u64)control->exit_code_hi) << 32) | control->exit_code;
}

static void dump_ghcb(struct vcpu_svm *svm)
{
struct ghcb *ghcb = svm->sev_es.ghcb;
struct vmcb_control_area *control = &svm->vmcb->control;
unsigned int nbits;

/* Re-use the dump_invalid_vmcb module parameter */
@@ -3184,18 +3189,24 @@ static void dump_ghcb(struct vcpu_svm *svm)
return;
}

nbits = sizeof(ghcb->save.valid_bitmap) * 8;
nbits = sizeof(svm->sev_es.valid_bitmap) * 8;

pr_err("GHCB (GPA=%016llx):\n", svm->vmcb->control.ghcb_gpa);
/*
* Print KVM's snapshot of the GHCB values that were (unsuccessfully)
* used to handle the exit. If the guest has since modified the GHCB
* itself, dumping the raw GHCB won't help debug why KVM was unable to
* handle the VMGEXIT that KVM observed.
*/
pr_err("GHCB (GPA=%016llx) snapshot:\n", svm->vmcb->control.ghcb_gpa);
pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_code",
ghcb->save.sw_exit_code, ghcb_sw_exit_code_is_valid(ghcb));
kvm_ghcb_get_sw_exit_code(control), kvm_ghcb_sw_exit_code_is_valid(svm));
pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_1",
ghcb->save.sw_exit_info_1, ghcb_sw_exit_info_1_is_valid(ghcb));
control->exit_info_1, kvm_ghcb_sw_exit_info_1_is_valid(svm));
pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_2",
ghcb->save.sw_exit_info_2, ghcb_sw_exit_info_2_is_valid(ghcb));
control->exit_info_2, kvm_ghcb_sw_exit_info_2_is_valid(svm));
pr_err("%-20s%016llx is_valid: %u\n", "sw_scratch",
ghcb->save.sw_scratch, ghcb_sw_scratch_is_valid(ghcb));
pr_err("%-20s%*pb\n", "valid_bitmap", nbits, ghcb->save.valid_bitmap);
svm->sev_es.sw_scratch, kvm_ghcb_sw_scratch_is_valid(svm));
pr_err("%-20s%*pb\n", "valid_bitmap", nbits, svm->sev_es.valid_bitmap);
}

static void sev_es_sync_to_ghcb(struct vcpu_svm *svm)
@@ -3266,11 +3277,6 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm)
memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
}

static u64 kvm_ghcb_get_sw_exit_code(struct vmcb_control_area *control)
{
return (((u64)control->exit_code_hi) << 32) | control->exit_code;
}

static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
{
struct vmcb_control_area *control = &svm->vmcb->control;
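
A simplified, hypothetical sketch of the idea behind the sev.c changes above: KVM copies guest-written GHCB fields into its own snapshot together with a valid bitmap, and the debug dump now reads that snapshot rather than the guest-writable page, so what gets printed is what KVM actually acted on. The toy_* structures and helpers below are invented for the illustration; only the hi:lo exit-code composition mirrors the real kvm_ghcb_get_sw_exit_code().

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Toy stand-in for the VMCB control area's split 32-bit exit code. */
struct toy_control {
    uint32_t exit_code;
    uint32_t exit_code_hi;
};

/* Toy host-owned snapshot of guest-provided GHCB state. */
struct toy_snapshot {
    uint64_t sw_exit_info_1;
    uint64_t sw_scratch;
    uint8_t  valid_bitmap[16];
};

/* Same composition as kvm_ghcb_get_sw_exit_code(): hi:lo halves into a u64. */
static uint64_t get_sw_exit_code(const struct toy_control *c)
{
    return ((uint64_t)c->exit_code_hi << 32) | c->exit_code;
}

static bool field_is_valid(const struct toy_snapshot *s, unsigned int bit)
{
    return s->valid_bitmap[bit / 8] & (1u << (bit % 8));
}

int main(void)
{
    struct toy_control ctl = { .exit_code = 0x7b, .exit_code_hi = 0 };
    struct toy_snapshot snap = { .sw_exit_info_1 = 1, .valid_bitmap = { 0x03 } };

    /* Dump the host-owned snapshot; the guest may have scribbled on the
     * shared GHCB page since the copy was taken, so the raw page could lie. */
    printf("sw_exit_code   %016llx\n",
           (unsigned long long)get_sw_exit_code(&ctl));
    printf("sw_exit_info_1 %016llx is_valid: %u\n",
           (unsigned long long)snap.sw_exit_info_1, field_is_valid(&snap, 1));
    return 0;
}
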
75 changes: 69 additions & 6 deletions arch/x86/kvm/svm/svm.c
@@ -607,9 +607,6 @@ static void svm_disable_virtualization_cpu(void)
kvm_cpu_svm_disable();

amd_pmu_disable_virt();

if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
}

static int svm_enable_virtualization_cpu(void)
@@ -687,9 +684,6 @@ static int svm_enable_virtualization_cpu(void)
rdmsr(MSR_TSC_AUX, sev_es_host_save_area(sd)->tsc_aux, msr_hi);
}

if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);

return 0;
}

@@ -1518,6 +1512,63 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu)
__free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
}

#ifdef CONFIG_CPU_MITIGATIONS
static DEFINE_SPINLOCK(srso_lock);
static atomic_t srso_nr_vms;

static void svm_srso_clear_bp_spec_reduce(void *ign)
{
struct svm_cpu_data *sd = this_cpu_ptr(&svm_data);

if (!sd->bp_spec_reduce_set)
return;

msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
sd->bp_spec_reduce_set = false;
}

static void svm_srso_vm_destroy(void)
{
if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
return;

if (atomic_dec_return(&srso_nr_vms))
return;

guard(spinlock)(&srso_lock);

/*
* Verify a new VM didn't come along, acquire the lock, and increment
* the count before this task acquired the lock.
*/
if (atomic_read(&srso_nr_vms))
return;

on_each_cpu(svm_srso_clear_bp_spec_reduce, NULL, 1);
}

static void svm_srso_vm_init(void)
{
if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
return;

/*
* Acquire the lock on 0 => 1 transitions to ensure a potential 1 => 0
* transition, i.e. destroying the last VM, is fully complete, e.g. so
* that a delayed IPI doesn't clear BP_SPEC_REDUCE after a vCPU runs.
*/
if (atomic_inc_not_zero(&srso_nr_vms))
return;

guard(spinlock)(&srso_lock);

atomic_inc(&srso_nr_vms);
}
#else
static void svm_srso_vm_init(void) { }
static void svm_srso_vm_destroy(void) { }
#endif

static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1550,6 +1601,11 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
(!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm)))
kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);

if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE) &&
!sd->bp_spec_reduce_set) {
sd->bp_spec_reduce_set = true;
msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
}
svm->guest_state_loaded = true;
}

@@ -2231,6 +2287,10 @@ static int shutdown_interception(struct kvm_vcpu *vcpu)
*/
if (!sev_es_guest(vcpu->kvm)) {
clear_page(svm->vmcb);
#ifdef CONFIG_KVM_SMM
if (is_smm(vcpu))
kvm_smm_changed(vcpu, false);
#endif
kvm_vcpu_reset(vcpu, true);
}

@@ -5036,6 +5096,8 @@ static void svm_vm_destroy(struct kvm *kvm)
{
avic_vm_destroy(kvm);
sev_vm_destroy(kvm);

svm_srso_vm_destroy();
}

static int svm_vm_init(struct kvm *kvm)
@@ -5061,6 +5123,7 @@ static int svm_vm_init(struct kvm *kvm)
return ret;
}

svm_srso_vm_init();
return 0;
}

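
To make the new VM-count plumbing in svm.c easier to follow, here is a stripped-down userspace analogue of the pattern (pthreads and C11 atomics instead of KVM's spinlock and on_each_cpu() IPIs; every toy_* name is invented): a mitigation-style knob is turned on lazily when the first "VM" actually runs and broadcast off only after the last one goes away, with the lock taken only on the 0 <=> 1 transitions so a late teardown cannot race with a new user.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t toy_lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_int toy_nr_vms;
static bool toy_knob_enabled;   /* stands in for the per-CPU MSR bit */

static void toy_broadcast_disable(void)
{
    /* KVM does this with on_each_cpu(); a single flag suffices here. */
    toy_knob_enabled = false;
    printf("knob disabled\n");
}

/* C11 equivalent of atomic_inc_not_zero(): bump the count unless it is 0. */
static bool inc_not_zero(atomic_int *v)
{
    int old = atomic_load(v);

    while (old != 0) {
        if (atomic_compare_exchange_weak(v, &old, old + 1))
            return true;
    }
    return false;
}

static void toy_vcpu_run(void)
{
    /* Lazy enable on first real use, mirroring the switch-to-guest hook. */
    if (!toy_knob_enabled) {
        toy_knob_enabled = true;
        printf("knob enabled\n");
    }
}

static void toy_vm_init(void)
{
    /* Fast path: some VM already exists, no lock needed. */
    if (inc_not_zero(&toy_nr_vms))
        return;

    /* 0 => 1 transition: serialize against a concurrent last-VM teardown
     * that may still be broadcasting the disable. */
    pthread_mutex_lock(&toy_lock);
    atomic_fetch_add(&toy_nr_vms, 1);
    pthread_mutex_unlock(&toy_lock);
}

static void toy_vm_destroy(void)
{
    if (atomic_fetch_sub(&toy_nr_vms, 1) - 1)
        return;

    pthread_mutex_lock(&toy_lock);
    /* Re-check: a new VM may have bumped the count before this task got
     * the lock, in which case the knob must stay enabled. */
    if (atomic_load(&toy_nr_vms) == 0)
        toy_broadcast_disable();
    pthread_mutex_unlock(&toy_lock);
}

int main(void)
{
    toy_vm_init();      /* first VM */
    toy_vcpu_run();     /* knob turned on lazily, like BP_SPEC_REDUCE */
    toy_vm_init();      /* second VM: fast path, no lock */
    toy_vm_destroy();
    toy_vm_destroy();   /* last VM gone: knob broadcast off */
    return 0;
}
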
2 changes: 2 additions & 0 deletions arch/x86/kvm/svm/svm.h
@@ -335,6 +335,8 @@ struct svm_cpu_data {
u32 next_asid;
u32 min_asid;

bool bp_spec_reduce_set;

struct vmcb *save_area;
unsigned long save_area_pa;

4 changes: 2 additions & 2 deletions arch/x86/kvm/x86.c
@@ -4597,7 +4597,7 @@ static bool kvm_is_vm_type_supported(unsigned long type)
return type < 32 && (kvm_caps.supported_vm_types & BIT(type));
}

static inline u32 kvm_sync_valid_fields(struct kvm *kvm)
static inline u64 kvm_sync_valid_fields(struct kvm *kvm)
{
return kvm && kvm->arch.has_protected_state ? 0 : KVM_SYNC_X86_VALID_FIELDS;
}
@@ -11493,7 +11493,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
{
struct kvm_queued_exception *ex = &vcpu->arch.exception;
struct kvm_run *kvm_run = vcpu->run;
u32 sync_valid_fields;
u64 sync_valid_fields;
int r;

r = kvm_mmu_post_init_vm(vcpu->kvm);
