From 6c9567e0850be2f0f94ab64fa6512413fd1a1eb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 17 Feb 2025 14:13:56 +0100 Subject: [PATCH 01/26] KVM: s390: Don't use %pK through tracepoints MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Restricted pointers ("%pK") are not meant to be used through TP_format(). It can unintentionally expose security sensitive, raw pointer values. Use regular pointer formatting instead. Link: https://lore.kernel.org/lkml/20250113171731-dc10e3c1-da64-4af0-b767-7c7070468023@linutronix.de/ Signed-off-by: Thomas Weißschuh Reviewed-by: Michael Mueller Link: https://lore.kernel.org/r/20250217-restricted-pointers-s390-v1-1-0e4ace75d8aa@linutronix.de Signed-off-by: Janosch Frank Message-ID: <20250217-restricted-pointers-s390-v1-1-0e4ace75d8aa@linutronix.de> --- arch/s390/kvm/trace-s390.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/kvm/trace-s390.h b/arch/s390/kvm/trace-s390.h index 9ac92dbf680db..9e28f165c114c 100644 --- a/arch/s390/kvm/trace-s390.h +++ b/arch/s390/kvm/trace-s390.h @@ -56,7 +56,7 @@ TRACE_EVENT(kvm_s390_create_vcpu, __entry->sie_block = sie_block; ), - TP_printk("create cpu %d at 0x%pK, sie block at 0x%pK", + TP_printk("create cpu %d at 0x%p, sie block at 0x%p", __entry->id, __entry->vcpu, __entry->sie_block) ); @@ -255,7 +255,7 @@ TRACE_EVENT(kvm_s390_enable_css, __entry->kvm = kvm; ), - TP_printk("enabling channel I/O support (kvm @ %pK)\n", + TP_printk("enabling channel I/O support (kvm @ %p)\n", __entry->kvm) ); From 0c7fbae5bc782429c97d68dc40fb126748d7e352 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 17 Feb 2025 14:13:57 +0100 Subject: [PATCH 02/26] KVM: s390: Don't use %pK through debug printing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Restricted pointers ("%pK") are only meant to be used when directly printing to a file from task context. Otherwise it can unintentionally expose security sensitive, raw pointer values. Use regular pointer formatting instead. 
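For illustration only (not part of this patch): "%pK" consults kptr_restrict and the credentials of the task reading the output, so it is only meaningful when the string is formatted while that task reads a file, for example from a seq_file show callback. Debug and trace paths format messages in other contexts, where plain "%p", which prints a hashed pointer value, is the safe choice. A minimal sketch with hypothetical names (demo_show, demo_obj):

  /* OK: formatted in the context of the task reading the debugfs/procfs file */
  static int demo_show(struct seq_file *m, void *v)
  {
          seq_printf(m, "object at %pK\n", demo_obj);
          return 0;
  }

  /* Debug paths like the ones patched here: use plain %p instead */
  pr_debug("object at %p\n", demo_obj);
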
Link: https://lore.kernel.org/lkml/20250113171731-dc10e3c1-da64-4af0-b767-7c7070468023@linutronix.de/ Signed-off-by: Thomas Weißschuh Reviewed-by: Michael Mueller Tested-by: Michael Mueller Link: https://lore.kernel.org/r/20250217-restricted-pointers-s390-v1-2-0e4ace75d8aa@linutronix.de Signed-off-by: Janosch Frank Message-ID: <20250217-restricted-pointers-s390-v1-2-0e4ace75d8aa@linutronix.de> --- arch/s390/kvm/intercept.c | 2 +- arch/s390/kvm/interrupt.c | 8 ++++---- arch/s390/kvm/kvm-s390.c | 10 +++++----- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 610dd44a948b2..a06a000f196ce 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -95,7 +95,7 @@ static int handle_validity(struct kvm_vcpu *vcpu) vcpu->stat.exit_validity++; trace_kvm_s390_intercept_validity(vcpu, viwhy); - KVM_EVENT(3, "validity intercept 0x%x for pid %u (kvm 0x%pK)", viwhy, + KVM_EVENT(3, "validity intercept 0x%x for pid %u (kvm 0x%p)", viwhy, current->pid, vcpu->kvm); /* do not warn on invalid runtime instrumentation mode */ diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 07ff0e10cb7f5..c0558f0540073 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -3161,7 +3161,7 @@ void kvm_s390_gisa_clear(struct kvm *kvm) if (!gi->origin) return; gisa_clear_ipm(gi->origin); - VM_EVENT(kvm, 3, "gisa 0x%pK cleared", gi->origin); + VM_EVENT(kvm, 3, "gisa 0x%p cleared", gi->origin); } void kvm_s390_gisa_init(struct kvm *kvm) @@ -3178,7 +3178,7 @@ void kvm_s390_gisa_init(struct kvm *kvm) gi->timer.function = gisa_vcpu_kicker; memset(gi->origin, 0, sizeof(struct kvm_s390_gisa)); gi->origin->next_alert = (u32)virt_to_phys(gi->origin); - VM_EVENT(kvm, 3, "gisa 0x%pK initialized", gi->origin); + VM_EVENT(kvm, 3, "gisa 0x%p initialized", gi->origin); } void kvm_s390_gisa_enable(struct kvm *kvm) @@ -3219,7 +3219,7 @@ void kvm_s390_gisa_destroy(struct kvm *kvm) process_gib_alert_list(); hrtimer_cancel(&gi->timer); gi->origin = NULL; - VM_EVENT(kvm, 3, "gisa 0x%pK destroyed", gisa); + VM_EVENT(kvm, 3, "gisa 0x%p destroyed", gisa); } void kvm_s390_gisa_disable(struct kvm *kvm) @@ -3468,7 +3468,7 @@ int __init kvm_s390_gib_init(u8 nisc) } } - KVM_EVENT(3, "gib 0x%pK (nisc=%d) initialized", gib, gib->nisc); + KVM_EVENT(3, "gib 0x%p (nisc=%d) initialized", gib, gib->nisc); goto out; out_unreg_gal: diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index ebecb96bacce7..9e427ba3aed42 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1020,7 +1020,7 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att } mutex_unlock(&kvm->lock); VM_EVENT(kvm, 3, "SET: max guest address: %lu", new_limit); - VM_EVENT(kvm, 3, "New guest asce: 0x%pK", + VM_EVENT(kvm, 3, "New guest asce: 0x%p", (void *) kvm->arch.gmap->asce); break; } @@ -3464,7 +3464,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_s390_gisa_init(kvm); INIT_LIST_HEAD(&kvm->arch.pv.need_cleanup); kvm->arch.pv.set_aside = NULL; - KVM_EVENT(3, "vm 0x%pK created by pid %u", kvm, current->pid); + KVM_EVENT(3, "vm 0x%p created by pid %u", kvm, current->pid); return 0; out_err: @@ -3527,7 +3527,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvm_s390_destroy_adapters(kvm); kvm_s390_clear_float_irqs(kvm); kvm_s390_vsie_destroy(kvm); - KVM_EVENT(3, "vm 0x%pK destroyed", kvm); + KVM_EVENT(3, "vm 0x%p destroyed", kvm); } /* Section: vcpu related */ @@ -3648,7 +3648,7 @@ static int 
sca_switch_to_extended(struct kvm *kvm) free_page((unsigned long)old_sca); - VM_EVENT(kvm, 2, "Switched to ESCA (0x%pK -> 0x%pK)", + VM_EVENT(kvm, 2, "Switched to ESCA (0x%p -> 0x%p)", old_sca, kvm->arch.sca); return 0; } @@ -4025,7 +4025,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) goto out_free_sie_block; } - VM_EVENT(vcpu->kvm, 3, "create cpu %d at 0x%pK, sie block at 0x%pK", + VM_EVENT(vcpu->kvm, 3, "create cpu %d at 0x%p, sie block at 0x%p", vcpu->vcpu_id, vcpu, vcpu->arch.sie_block); trace_kvm_s390_create_vcpu(vcpu->vcpu_id, vcpu, vcpu->arch.sie_block); From acfcaf90db1fa833236d9f8249b6099cf638e5d1 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Thu, 27 Mar 2025 09:36:15 -0700 Subject: [PATCH 03/26] smccc: kvm_guest: Align with DISCOVER_IMPL_CPUS ABI The ABI of the hypercall requires that R2 and R3 are 0. Explicitly pass 0 for these parameters. Cc: Shameer Kolothum Fixes: 86edf6bdcf05 ("smccc/kvm_guest: Enable errata based on implementation CPUs") Reviewed-by: Shameer Kolothum Link: https://lore.kernel.org/r/20250327163613.2516073-1-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- drivers/firmware/smccc/kvm_guest.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/smccc/kvm_guest.c b/drivers/firmware/smccc/kvm_guest.c index 5767aed25cdc0..ac2d3cf8a776a 100644 --- a/drivers/firmware/smccc/kvm_guest.c +++ b/drivers/firmware/smccc/kvm_guest.c @@ -95,7 +95,7 @@ void __init kvm_arm_target_impl_cpu_init(void) for (i = 0; i < max_cpus; i++) { arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_DISCOVER_IMPL_CPUS_FUNC_ID, - i, &res); + i, 0, 0, &res); if (res.a0 != SMCCC_RET_SUCCESS) { pr_warn("Discovering target implementation CPUs failed\n"); goto mem_free; From 1f5bdd3b0c7000156d99faeed19bd522615b38e3 Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Wed, 26 Mar 2025 12:06:59 +0800 Subject: [PATCH 04/26] smccc: kvm_guest: Remove unneeded semicolon Remove unnecessary semicolons reported by Coccinelle/coccicheck and the semantic patch at scripts/coccinelle/misc/semicolon.cocci. Signed-off-by: Chen Ni Link: https://lore.kernel.org/r/20250326040659.1190696-1-nichen@iscas.ac.cn Signed-off-by: Oliver Upton --- drivers/firmware/smccc/kvm_guest.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/smccc/kvm_guest.c b/drivers/firmware/smccc/kvm_guest.c index ac2d3cf8a776a..a123c05cbc9e6 100644 --- a/drivers/firmware/smccc/kvm_guest.c +++ b/drivers/firmware/smccc/kvm_guest.c @@ -103,7 +103,7 @@ void __init kvm_arm_target_impl_cpu_init(void) target[i].midr = res.a1; target[i].revidr = res.a2; target[i].aidr = res.a3; - }; + } if (!cpu_errata_set_target_impl(max_cpus, target)) { pr_warn("Failed to set target implementation CPUs\n"); From fb8a3eba9c812b67f9cf5531e5b55d13a51e938e Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 2 Apr 2025 13:17:23 -0700 Subject: [PATCH 05/26] KVM: arm64: Only read HPFAR_EL2 when value is architecturally valid KVM's logic for deciding when HPFAR_EL2 is UNKNOWN doesn't align with the architecture. Most notably, KVM assumes HPFAR_EL2 contains the faulting IPA even in the case of an SEA. Align the logic with the architecture rather than attempting to paraphrase it. Additionally, take the opportunity to improve the language around ARM erratum #834220 such that it actually describes the bug. 
Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20250402201725.2963645-2-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/include/asm/esr.h | 22 ++++++++++-- arch/arm64/kvm/hyp/include/hyp/fault.h | 46 ++++++++++++++++---------- 2 files changed, 48 insertions(+), 20 deletions(-) diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index d1b1a33f9a8b0..92fb26e908405 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -121,6 +121,15 @@ #define ESR_ELx_FSC_SEA_TTW(n) (0x14 + (n)) #define ESR_ELx_FSC_SECC (0x18) #define ESR_ELx_FSC_SECC_TTW(n) (0x1c + (n)) +#define ESR_ELx_FSC_ADDRSZ (0x00) + +/* + * Annoyingly, the negative levels for Address size faults aren't laid out + * contiguously (or in the desired order) + */ +#define ESR_ELx_FSC_ADDRSZ_nL(n) ((n) == -1 ? 0x25 : 0x2C) +#define ESR_ELx_FSC_ADDRSZ_L(n) ((n) < 0 ? ESR_ELx_FSC_ADDRSZ_nL(n) : \ + (ESR_ELx_FSC_ADDRSZ + (n))) /* Status codes for individual page table levels */ #define ESR_ELx_FSC_ACCESS_L(n) (ESR_ELx_FSC_ACCESS + (n)) @@ -161,8 +170,6 @@ #define ESR_ELx_Xs_MASK (GENMASK_ULL(4, 0)) /* ISS field definitions for exceptions taken in to Hyp */ -#define ESR_ELx_FSC_ADDRSZ (0x00) -#define ESR_ELx_FSC_ADDRSZ_L(n) (ESR_ELx_FSC_ADDRSZ + (n)) #define ESR_ELx_CV (UL(1) << 24) #define ESR_ELx_COND_SHIFT (20) #define ESR_ELx_COND_MASK (UL(0xF) << ESR_ELx_COND_SHIFT) @@ -464,6 +471,17 @@ static inline bool esr_fsc_is_access_flag_fault(unsigned long esr) (esr == ESR_ELx_FSC_ACCESS_L(0)); } +static inline bool esr_fsc_is_addr_sz_fault(unsigned long esr) +{ + esr &= ESR_ELx_FSC; + + return (esr == ESR_ELx_FSC_ADDRSZ_L(3)) || + (esr == ESR_ELx_FSC_ADDRSZ_L(2)) || + (esr == ESR_ELx_FSC_ADDRSZ_L(1)) || + (esr == ESR_ELx_FSC_ADDRSZ_L(0)) || + (esr == ESR_ELx_FSC_ADDRSZ_L(-1)); +} + /* Indicate whether ESR.EC==0x1A is for an ERETAx instruction */ static inline bool esr_iss_is_eretax(unsigned long esr) { diff --git a/arch/arm64/kvm/hyp/include/hyp/fault.h b/arch/arm64/kvm/hyp/include/hyp/fault.h index 17df94570f03a..59409685c14f7 100644 --- a/arch/arm64/kvm/hyp/include/hyp/fault.h +++ b/arch/arm64/kvm/hyp/include/hyp/fault.h @@ -44,31 +44,41 @@ static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar) return true; } +/* + * Checks for the conditions when HPFAR_EL2 is written, per ARM ARM R_FKLWR. + */ +static inline bool __hpfar_valid(u64 esr) +{ + /* + * CPUs affected by ARM erratum #834220 may incorrectly report a + * stage-2 translation fault when a stage-1 permission fault occurs. + * + * Re-walk the page tables to determine if a stage-1 fault actually + * occurred. + */ + if (cpus_have_final_cap(ARM64_WORKAROUND_834220) && + esr_fsc_is_translation_fault(esr)) + return false; + + if (esr_fsc_is_translation_fault(esr) || esr_fsc_is_access_flag_fault(esr)) + return true; + + if ((esr & ESR_ELx_S1PTW) && esr_fsc_is_permission_fault(esr)) + return true; + + return esr_fsc_is_addr_sz_fault(esr); +} + static inline bool __get_fault_info(u64 esr, struct kvm_vcpu_fault_info *fault) { u64 hpfar, far; far = read_sysreg_el2(SYS_FAR); - /* - * The HPFAR can be invalid if the stage 2 fault did not - * happen during a stage 1 page table walk (the ESR_EL2.S1PTW - * bit is clear) and one of the two following cases are true: - * 1. The fault was due to a permission fault - * 2. 
The processor carries errata 834220 - * - * Therefore, for all non S1PTW faults where we either have a - * permission fault or the errata workaround is enabled, we - * resolve the IPA using the AT instruction. - */ - if (!(esr & ESR_ELx_S1PTW) && - (cpus_have_final_cap(ARM64_WORKAROUND_834220) || - esr_fsc_is_permission_fault(esr))) { - if (!__translate_far_to_hpfar(far, &hpfar)) - return false; - } else { + if (__hpfar_valid(esr)) hpfar = read_sysreg(hpfar_el2); - } + else if (!__translate_far_to_hpfar(far, &hpfar)) + return false; fault->far_el2 = far; fault->hpfar_el2 = hpfar; From 1cf3e126f1528cdcaf77524f48e54ccbcb029473 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 2 Apr 2025 13:17:24 -0700 Subject: [PATCH 06/26] arm64: Convert HPFAR_EL2 to sysreg table Switch over to the typical sysreg table for HPFAR_EL2 as we're about to start using more fields in the register. Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20250402201725.2963645-3-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_emulate.h | 4 +++- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 2 +- arch/arm64/tools/sysreg | 7 +++++++ 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index d7cf66573acaf..44e3fc6483c8d 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -305,7 +305,9 @@ static __always_inline unsigned long kvm_vcpu_get_hfar(const struct kvm_vcpu *vc static __always_inline phys_addr_t kvm_vcpu_get_fault_ipa(const struct kvm_vcpu *vcpu) { - return ((phys_addr_t)vcpu->arch.fault.hpfar_el2 & HPFAR_MASK) << 8; + u64 hpfar = vcpu->arch.fault.hpfar_el2; + + return FIELD_GET(HPFAR_EL2_FIPA, hpfar) << 12; } static inline u64 kvm_vcpu_get_disr(const struct kvm_vcpu *vcpu) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index f34f11c720d70..5ce2230054d98 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -578,7 +578,7 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt) return; } - addr = (fault.hpfar_el2 & HPFAR_MASK) << 8; + addr = FIELD_GET(HPFAR_EL2_FIPA, fault.hpfar_el2) << 12; ret = host_stage2_idmap(addr); BUG_ON(ret && ret != -EAGAIN); } diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 2c63662c1a489..31ad9ce2b91c5 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -3433,3 +3433,10 @@ Field 5 F Field 4 P Field 3:0 Align EndSysreg + +Sysreg HPFAR_EL2 3 4 6 0 4 +Field 63 NS +Res0 62:48 +Field 47:4 FIPA +Res0 3:0 +EndSysreg From 26fbdf36922711f285fd185ad644f0acdf15959f Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 2 Apr 2025 13:17:25 -0700 Subject: [PATCH 07/26] KVM: arm64: Don't translate FAR if invalid/unsafe Don't re-walk the page tables if an SEA occurred during the faulting page table walk to avoid taking a fatal exception in the hyp. Additionally, check that FAR_EL2 is valid for SEAs not taken on PTW as the architecture doesn't guarantee it contains the fault VA. Finally, fix up the rest of the abort path by checking for SEAs early and bugging the VM if we get further along with an UNKNOWN fault IPA. 
Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20250402201725.2963645-4-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/include/asm/esr.h | 22 ++++++++++++++++++ arch/arm64/include/asm/kvm_emulate.h | 3 +++ arch/arm64/include/asm/kvm_ras.h | 2 +- arch/arm64/kvm/hyp/include/hyp/fault.h | 26 ++++++++++++++++----- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 7 ++++++ arch/arm64/kvm/mmu.c | 31 ++++++++++++++++---------- 6 files changed, 73 insertions(+), 18 deletions(-) diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index 92fb26e908405..e4f77757937e6 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -482,6 +482,28 @@ static inline bool esr_fsc_is_addr_sz_fault(unsigned long esr) (esr == ESR_ELx_FSC_ADDRSZ_L(-1)); } +static inline bool esr_fsc_is_sea_ttw(unsigned long esr) +{ + esr = esr & ESR_ELx_FSC; + + return (esr == ESR_ELx_FSC_SEA_TTW(3)) || + (esr == ESR_ELx_FSC_SEA_TTW(2)) || + (esr == ESR_ELx_FSC_SEA_TTW(1)) || + (esr == ESR_ELx_FSC_SEA_TTW(0)) || + (esr == ESR_ELx_FSC_SEA_TTW(-1)); +} + +static inline bool esr_fsc_is_secc_ttw(unsigned long esr) +{ + esr = esr & ESR_ELx_FSC; + + return (esr == ESR_ELx_FSC_SECC_TTW(3)) || + (esr == ESR_ELx_FSC_SECC_TTW(2)) || + (esr == ESR_ELx_FSC_SECC_TTW(1)) || + (esr == ESR_ELx_FSC_SECC_TTW(0)) || + (esr == ESR_ELx_FSC_SECC_TTW(-1)); +} + /* Indicate whether ESR.EC==0x1A is for an ERETAx instruction */ static inline bool esr_iss_is_eretax(unsigned long esr) { diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 44e3fc6483c8d..bd020fc28aa9c 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -307,6 +307,9 @@ static __always_inline phys_addr_t kvm_vcpu_get_fault_ipa(const struct kvm_vcpu { u64 hpfar = vcpu->arch.fault.hpfar_el2; + if (unlikely(!(hpfar & HPFAR_EL2_NS))) + return INVALID_GPA; + return FIELD_GET(HPFAR_EL2_FIPA, hpfar) << 12; } diff --git a/arch/arm64/include/asm/kvm_ras.h b/arch/arm64/include/asm/kvm_ras.h index 87e10d9a635b5..9398ade632aaf 100644 --- a/arch/arm64/include/asm/kvm_ras.h +++ b/arch/arm64/include/asm/kvm_ras.h @@ -14,7 +14,7 @@ * Was this synchronous external abort a RAS notification? * Returns '0' for errors handled by some RAS subsystem, or -ENOENT. 
*/ -static inline int kvm_handle_guest_sea(phys_addr_t addr, u64 esr) +static inline int kvm_handle_guest_sea(void) { /* apei_claim_sea(NULL) expects to mask interrupts itself */ lockdep_assert_irqs_enabled(); diff --git a/arch/arm64/kvm/hyp/include/hyp/fault.h b/arch/arm64/kvm/hyp/include/hyp/fault.h index 59409685c14f7..fc573fc767b0e 100644 --- a/arch/arm64/kvm/hyp/include/hyp/fault.h +++ b/arch/arm64/kvm/hyp/include/hyp/fault.h @@ -12,6 +12,16 @@ #include #include +static inline bool __fault_safe_to_translate(u64 esr) +{ + u64 fsc = esr & ESR_ELx_FSC; + + if (esr_fsc_is_sea_ttw(esr) || esr_fsc_is_secc_ttw(esr)) + return false; + + return !(fsc == ESR_ELx_FSC_EXTABT && (esr & ESR_ELx_FnV)); +} + static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar) { int ret; @@ -71,17 +81,23 @@ static inline bool __hpfar_valid(u64 esr) static inline bool __get_fault_info(u64 esr, struct kvm_vcpu_fault_info *fault) { - u64 hpfar, far; + u64 hpfar; - far = read_sysreg_el2(SYS_FAR); + fault->far_el2 = read_sysreg_el2(SYS_FAR); + fault->hpfar_el2 = 0; if (__hpfar_valid(esr)) hpfar = read_sysreg(hpfar_el2); - else if (!__translate_far_to_hpfar(far, &hpfar)) + else if (unlikely(!__fault_safe_to_translate(esr))) + return true; + else if (!__translate_far_to_hpfar(fault->far_el2, &hpfar)) return false; - fault->far_el2 = far; - fault->hpfar_el2 = hpfar; + /* + * Hijack HPFAR_EL2.NS (RES0 in Non-secure) to indicate a valid + * HPFAR value. + */ + fault->hpfar_el2 = hpfar | HPFAR_EL2_NS; return true; } diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 5ce2230054d98..2a5284f749b42 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -578,7 +578,14 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt) return; } + + /* + * Yikes, we couldn't resolve the fault IPA. This should reinject an + * abort into the host when we figure out how to do that. + */ + BUG_ON(!(fault.hpfar_el2 & HPFAR_EL2_NS)); addr = FIELD_GET(HPFAR_EL2_FIPA, fault.hpfar_el2) << 12; + ret = host_stage2_idmap(addr); BUG_ON(ret && ret != -EAGAIN); } diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 2feb6c6b63af6..754f2fe0cc673 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1794,9 +1794,28 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) gfn_t gfn; int ret, idx; + /* Synchronous External Abort? */ + if (kvm_vcpu_abt_issea(vcpu)) { + /* + * For RAS the host kernel may handle this abort. + * There is no need to pass the error into the guest. + */ + if (kvm_handle_guest_sea()) + kvm_inject_vabt(vcpu); + + return 1; + } + esr = kvm_vcpu_get_esr(vcpu); + /* + * The fault IPA should be reliable at this point as we're not dealing + * with an SEA. + */ ipa = fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); + if (KVM_BUG_ON(ipa == INVALID_GPA, vcpu->kvm)) + return -EFAULT; + is_iabt = kvm_vcpu_trap_is_iabt(vcpu); if (esr_fsc_is_translation_fault(esr)) { @@ -1818,18 +1837,6 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) } } - /* Synchronous External Abort? */ - if (kvm_vcpu_abt_issea(vcpu)) { - /* - * For RAS the host kernel may handle this abort. - * There is no need to pass the error into the guest. 
- */ - if (kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_esr(vcpu))) - kvm_inject_vabt(vcpu); - - return 1; - } - trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu), kvm_vcpu_get_hfar(vcpu), fault_ipa); From 80fd663590cf4c6a7baaa405cd65060469c95eca Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 20 Mar 2025 11:42:55 -0400 Subject: [PATCH 08/26] selftests: kvm: revamp MONITOR/MWAIT tests Run each testcase in a separate VMs to cover more possibilities; move WRMSR close to MONITOR/MWAIT to test updating CPUID bits while in the VM. Signed-off-by: Paolo Bonzini --- .../selftests/kvm/x86/monitor_mwait_test.c | 108 +++++++++--------- 1 file changed, 57 insertions(+), 51 deletions(-) diff --git a/tools/testing/selftests/kvm/x86/monitor_mwait_test.c b/tools/testing/selftests/kvm/x86/monitor_mwait_test.c index 2b550eff35f1b..390ae2d874932 100644 --- a/tools/testing/selftests/kvm/x86/monitor_mwait_test.c +++ b/tools/testing/selftests/kvm/x86/monitor_mwait_test.c @@ -7,6 +7,7 @@ #include "kvm_util.h" #include "processor.h" +#include "kselftest.h" #define CPUID_MWAIT (1u << 3) @@ -14,6 +15,8 @@ enum monitor_mwait_testcases { MWAIT_QUIRK_DISABLED = BIT(0), MISC_ENABLES_QUIRK_DISABLED = BIT(1), MWAIT_DISABLED = BIT(2), + CPUID_DISABLED = BIT(3), + TEST_MAX = CPUID_DISABLED * 2 - 1, }; /* @@ -35,11 +38,19 @@ do { \ testcase, vector); \ } while (0) -static void guest_monitor_wait(int testcase) +static void guest_monitor_wait(void *arg) { + int testcase = (int) (long) arg; u8 vector; - GUEST_SYNC(testcase); + u64 val = rdmsr(MSR_IA32_MISC_ENABLE) & ~MSR_IA32_MISC_ENABLE_MWAIT; + if (!(testcase & MWAIT_DISABLED)) + val |= MSR_IA32_MISC_ENABLE_MWAIT; + wrmsr(MSR_IA32_MISC_ENABLE, val); + + __GUEST_ASSERT(this_cpu_has(X86_FEATURE_MWAIT) == !(testcase & MWAIT_DISABLED), + "Expected CPUID.MWAIT %s\n", + (testcase & MWAIT_DISABLED) ? 
"cleared" : "set"); /* * Arbitrarily MONITOR this function, SVM performs fault checks before @@ -50,19 +61,6 @@ static void guest_monitor_wait(int testcase) vector = kvm_asm_safe("mwait", "a"(guest_monitor_wait), "c"(0), "d"(0)); GUEST_ASSERT_MONITOR_MWAIT("MWAIT", testcase, vector); -} - -static void guest_code(void) -{ - guest_monitor_wait(MWAIT_DISABLED); - - guest_monitor_wait(MWAIT_QUIRK_DISABLED | MWAIT_DISABLED); - - guest_monitor_wait(MISC_ENABLES_QUIRK_DISABLED | MWAIT_DISABLED); - guest_monitor_wait(MISC_ENABLES_QUIRK_DISABLED); - - guest_monitor_wait(MISC_ENABLES_QUIRK_DISABLED | MWAIT_QUIRK_DISABLED | MWAIT_DISABLED); - guest_monitor_wait(MISC_ENABLES_QUIRK_DISABLED | MWAIT_QUIRK_DISABLED); GUEST_DONE(); } @@ -74,56 +72,64 @@ int main(int argc, char *argv[]) struct kvm_vm *vm; struct ucall uc; int testcase; + char test[80]; - TEST_REQUIRE(this_cpu_has(X86_FEATURE_MWAIT)); TEST_REQUIRE(kvm_has_cap(KVM_CAP_DISABLE_QUIRKS2)); - vm = vm_create_with_one_vcpu(&vcpu, guest_code); - vcpu_clear_cpuid_feature(vcpu, X86_FEATURE_MWAIT); + ksft_print_header(); + ksft_set_plan(12); + for (testcase = 0; testcase <= TEST_MAX; testcase++) { + vm = vm_create_with_one_vcpu(&vcpu, guest_monitor_wait); + vcpu_args_set(vcpu, 1, (void *)(long)testcase); + + disabled_quirks = 0; + if (testcase & MWAIT_QUIRK_DISABLED) { + disabled_quirks |= KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS; + strcpy(test, "MWAIT can fault"); + } else { + strcpy(test, "MWAIT never faults"); + } + if (testcase & MISC_ENABLES_QUIRK_DISABLED) { + disabled_quirks |= KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT; + strcat(test, ", MISC_ENABLE updates CPUID"); + } else { + strcat(test, ", no CPUID updates"); + } + + vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, disabled_quirks); + + if (!(testcase & MISC_ENABLES_QUIRK_DISABLED) && + (!!(testcase & CPUID_DISABLED) ^ !!(testcase & MWAIT_DISABLED))) + continue; + + if (testcase & CPUID_DISABLED) { + strcat(test, ", CPUID clear"); + vcpu_clear_cpuid_feature(vcpu, X86_FEATURE_MWAIT); + } else { + strcat(test, ", CPUID set"); + vcpu_set_cpuid_feature(vcpu, X86_FEATURE_MWAIT); + } + + if (testcase & MWAIT_DISABLED) + strcat(test, ", MWAIT disabled"); - while (1) { vcpu_run(vcpu); TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); switch (get_ucall(vcpu, &uc)) { - case UCALL_SYNC: - testcase = uc.args[1]; - break; case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - goto done; + /* Detected in vcpu_run */ + break; case UCALL_DONE: - goto done; + ksft_test_result_pass("%s\n", test); + break; default: TEST_FAIL("Unknown ucall %lu", uc.cmd); - goto done; - } - - disabled_quirks = 0; - if (testcase & MWAIT_QUIRK_DISABLED) - disabled_quirks |= KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS; - if (testcase & MISC_ENABLES_QUIRK_DISABLED) - disabled_quirks |= KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT; - vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, disabled_quirks); - - /* - * If the MISC_ENABLES quirk (KVM neglects to update CPUID to - * enable/disable MWAIT) is disabled, toggle the ENABLE_MWAIT - * bit in MISC_ENABLES accordingly. If the quirk is enabled, - * the only valid configuration is MWAIT disabled, as CPUID - * can't be manually changed after running the vCPU. - */ - if (!(testcase & MISC_ENABLES_QUIRK_DISABLED)) { - TEST_ASSERT(testcase & MWAIT_DISABLED, - "Can't toggle CPUID features after running vCPU"); - continue; + break; } - - vcpu_set_msr(vcpu, MSR_IA32_MISC_ENABLE, - (testcase & MWAIT_DISABLED) ? 
0 : MSR_IA32_MISC_ENABLE_MWAIT); + kvm_vm_free(vm); } + ksft_finished(); -done: - kvm_vm_free(vm); return 0; } From 11934771e7e79dcf4528803f9e3299b214c36f30 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 1 Apr 2025 00:18:51 +0200 Subject: [PATCH 09/26] selftests: kvm: bring list of exit reasons up to date Signed-off-by: Paolo Bonzini Message-ID: <20250331221851.614582-1-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/lib/kvm_util.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 279ad8946040c..815bc45dd8dc6 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -2019,9 +2019,8 @@ static struct exit_reason { KVM_EXIT_STRING(RISCV_SBI), KVM_EXIT_STRING(RISCV_CSR), KVM_EXIT_STRING(NOTIFY), -#ifdef KVM_EXIT_MEMORY_NOT_PRESENT - KVM_EXIT_STRING(MEMORY_NOT_PRESENT), -#endif + KVM_EXIT_STRING(LOONGARCH_IOCSR), + KVM_EXIT_STRING(MEMORY_FAULT), }; /* From c57047f6f37906cc4f6a4fec1683f87731f25248 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 1 Apr 2025 16:13:27 +0200 Subject: [PATCH 10/26] selftests: kvm: list once tests that are valid on all architectures Several tests cover infrastructure from virt/kvm/ and userspace APIs that have only minimal requirements from architecture-specific code. As such, they are available on all architectures that have libkvm support, and this presumably will apply also in the future (for example if loongarch gets selftests support). Put them in a separate variable and list them only once. Signed-off-by: Paolo Bonzini Message-ID: <20250401141327.785520-1-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/Makefile.kvm | 45 ++++++++---------------- 1 file changed, 15 insertions(+), 30 deletions(-) diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index f773f8f992494..f62b0a5aba35a 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -50,8 +50,18 @@ LIBKVM_riscv += lib/riscv/ucall.c # Non-compiled test targets TEST_PROGS_x86 += x86/nx_huge_pages_test.sh +# Compiled test targets valid on all architectures with libkvm support +TEST_GEN_PROGS_COMMON = demand_paging_test +TEST_GEN_PROGS_COMMON += dirty_log_test +TEST_GEN_PROGS_COMMON += guest_print_test +TEST_GEN_PROGS_COMMON += kvm_binary_stats_test +TEST_GEN_PROGS_COMMON += kvm_create_max_vcpus +TEST_GEN_PROGS_COMMON += kvm_page_table_test +TEST_GEN_PROGS_COMMON += set_memory_region_test + # Compiled test targets -TEST_GEN_PROGS_x86 = x86/cpuid_test +TEST_GEN_PROGS_x86 = $(TEST_GEN_PROGS_COMMON) +TEST_GEN_PROGS_x86 += x86/cpuid_test TEST_GEN_PROGS_x86 += x86/cr4_cpuid_sync_test TEST_GEN_PROGS_x86 += x86/dirty_log_page_splitting_test TEST_GEN_PROGS_x86 += x86/feature_msrs_test @@ -119,27 +129,21 @@ TEST_GEN_PROGS_x86 += x86/triple_fault_event_test TEST_GEN_PROGS_x86 += x86/recalc_apic_map_test TEST_GEN_PROGS_x86 += access_tracking_perf_test TEST_GEN_PROGS_x86 += coalesced_io_test -TEST_GEN_PROGS_x86 += demand_paging_test -TEST_GEN_PROGS_x86 += dirty_log_test TEST_GEN_PROGS_x86 += dirty_log_perf_test TEST_GEN_PROGS_x86 += guest_memfd_test -TEST_GEN_PROGS_x86 += guest_print_test TEST_GEN_PROGS_x86 += hardware_disable_test -TEST_GEN_PROGS_x86 += kvm_create_max_vcpus -TEST_GEN_PROGS_x86 += kvm_page_table_test TEST_GEN_PROGS_x86 += memslot_modification_stress_test TEST_GEN_PROGS_x86 
+= memslot_perf_test TEST_GEN_PROGS_x86 += mmu_stress_test TEST_GEN_PROGS_x86 += rseq_test -TEST_GEN_PROGS_x86 += set_memory_region_test TEST_GEN_PROGS_x86 += steal_time -TEST_GEN_PROGS_x86 += kvm_binary_stats_test TEST_GEN_PROGS_x86 += system_counter_offset_test TEST_GEN_PROGS_x86 += pre_fault_memory_test # Compiled outputs used by test targets TEST_GEN_PROGS_EXTENDED_x86 += x86/nx_huge_pages_test +TEST_GEN_PROGS_arm64 = $(TEST_GEN_PROGS_COMMON) TEST_GEN_PROGS_arm64 += arm64/aarch32_id_regs TEST_GEN_PROGS_arm64 += arm64/arch_timer_edge_cases TEST_GEN_PROGS_arm64 += arm64/debug-exceptions @@ -158,22 +162,16 @@ TEST_GEN_PROGS_arm64 += arm64/no-vgic-v3 TEST_GEN_PROGS_arm64 += access_tracking_perf_test TEST_GEN_PROGS_arm64 += arch_timer TEST_GEN_PROGS_arm64 += coalesced_io_test -TEST_GEN_PROGS_arm64 += demand_paging_test -TEST_GEN_PROGS_arm64 += dirty_log_test TEST_GEN_PROGS_arm64 += dirty_log_perf_test -TEST_GEN_PROGS_arm64 += guest_print_test TEST_GEN_PROGS_arm64 += get-reg-list -TEST_GEN_PROGS_arm64 += kvm_create_max_vcpus -TEST_GEN_PROGS_arm64 += kvm_page_table_test TEST_GEN_PROGS_arm64 += memslot_modification_stress_test TEST_GEN_PROGS_arm64 += memslot_perf_test TEST_GEN_PROGS_arm64 += mmu_stress_test TEST_GEN_PROGS_arm64 += rseq_test -TEST_GEN_PROGS_arm64 += set_memory_region_test TEST_GEN_PROGS_arm64 += steal_time -TEST_GEN_PROGS_arm64 += kvm_binary_stats_test -TEST_GEN_PROGS_s390 = s390/memop +TEST_GEN_PROGS_s390 = $(TEST_GEN_PROGS_COMMON) +TEST_GEN_PROGS_s390 += s390/memop TEST_GEN_PROGS_s390 += s390/resets TEST_GEN_PROGS_s390 += s390/sync_regs_test TEST_GEN_PROGS_s390 += s390/tprot @@ -182,27 +180,14 @@ TEST_GEN_PROGS_s390 += s390/debug_test TEST_GEN_PROGS_s390 += s390/cpumodel_subfuncs_test TEST_GEN_PROGS_s390 += s390/shared_zeropage_test TEST_GEN_PROGS_s390 += s390/ucontrol_test -TEST_GEN_PROGS_s390 += demand_paging_test -TEST_GEN_PROGS_s390 += dirty_log_test -TEST_GEN_PROGS_s390 += guest_print_test -TEST_GEN_PROGS_s390 += kvm_create_max_vcpus -TEST_GEN_PROGS_s390 += kvm_page_table_test TEST_GEN_PROGS_s390 += rseq_test -TEST_GEN_PROGS_s390 += set_memory_region_test -TEST_GEN_PROGS_s390 += kvm_binary_stats_test +TEST_GEN_PROGS_riscv = $(TEST_GEN_PROGS_COMMON) TEST_GEN_PROGS_riscv += riscv/sbi_pmu_test TEST_GEN_PROGS_riscv += riscv/ebreak_test TEST_GEN_PROGS_riscv += arch_timer TEST_GEN_PROGS_riscv += coalesced_io_test -TEST_GEN_PROGS_riscv += demand_paging_test -TEST_GEN_PROGS_riscv += dirty_log_test TEST_GEN_PROGS_riscv += get-reg-list -TEST_GEN_PROGS_riscv += guest_print_test -TEST_GEN_PROGS_riscv += kvm_binary_stats_test -TEST_GEN_PROGS_riscv += kvm_create_max_vcpus -TEST_GEN_PROGS_riscv += kvm_page_table_test -TEST_GEN_PROGS_riscv += set_memory_region_test TEST_GEN_PROGS_riscv += steal_time SPLIT_TESTS += arch_timer From f3e555ba45da361f5286b35921c7ca8afbef6384 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 31 Mar 2025 17:05:50 +0200 Subject: [PATCH 11/26] Documentation: KVM: KVM_GET_SUPPORTED_CPUID now exposes TSC_DEADLINE TSC_DEADLINE is now advertised unconditionally by KVM_GET_SUPPORTED_CPUID, since commit 9be4ec35d668 ("KVM: x86: Advertise TSC_DEADLINE_TIMER in KVM_GET_SUPPORTED_CPUID", 2024-12-18). Adjust the documentation to reflect the new behavior. 
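For illustration (not part of this patch), a userspace check that works on both older and newer kernels is to query the capability directly instead of relying on KVM_GET_SUPPORTED_CPUID; kvm_fd below is assumed to be an open /dev/kvm file descriptor:

  #include <linux/kvm.h>
  #include <stdbool.h>
  #include <sys/ioctl.h>

  static bool tsc_deadline_supported(int kvm_fd)
  {
          /*
           * Independent of whether CPUID.01H:ECX[24] is reported by
           * KVM_GET_SUPPORTED_CPUID on this kernel version.
           */
          return ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_TSC_DEADLINE_TIMER) > 0;
  }
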
Signed-off-by: Paolo Bonzini Reviewed-by: Sean Christopherson Message-ID: <20250331150550.510320-1-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 1f8625b7646a2..0ec40765ccd4e 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -9076,9 +9076,10 @@ the local APIC. The same is true for the ``KVM_FEATURE_PV_UNHALT`` paravirtualized feature. -CPU[EAX=1]:ECX[24] (TSC_DEADLINE) is not reported by ``KVM_GET_SUPPORTED_CPUID``. -It can be enabled if ``KVM_CAP_TSC_DEADLINE_TIMER`` is present and the kernel -has enabled in-kernel emulation of the local APIC. +On older versions of Linux, CPU[EAX=1]:ECX[24] (TSC_DEADLINE) is not reported by +``KVM_GET_SUPPORTED_CPUID``, but it can be enabled if ``KVM_CAP_TSC_DEADLINE_TIMER`` +is present and the kernel has enabled in-kernel emulation of the local APIC. +On newer versions, ``KVM_GET_SUPPORTED_CPUID`` does report the bit as available. CPU topology ~~~~~~~~~~~~ From 26cb30f22f9cb9d964f7ae4b3233c0b9d2cddb2f Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 1 Apr 2025 14:50:11 +0200 Subject: [PATCH 12/26] Documentation: kvm: give correct name for KVM_CAP_SPAPR_MULTITCE The capability is incorrectly called KVM_CAP_PPC_MULTITCE in the documentation. Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 0ec40765ccd4e..d2580cb9ab82f 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -8847,10 +8847,9 @@ clearing the PVCLOCK_TSC_STABLE_BIT flag in Xen pvclock sources. This will be done when the KVM_CAP_XEN_HVM ioctl sets the KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE flag. -8.31 KVM_CAP_PPC_MULTITCE -------------------------- +8.31 KVM_CAP_SPAPR_MULTITCE +--------------------------- -:Capability: KVM_CAP_PPC_MULTITCE :Architectures: ppc :Type: vm From 2f313018de0fce3f97f6cd49925c8c0cb1a37c67 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 1 Apr 2025 14:50:43 +0200 Subject: [PATCH 13/26] Documentation: kvm: drop "Capability" heading from capabilities It is redundant, and sometimes wrong. Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index d2580cb9ab82f..eec775f04df60 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -7966,7 +7966,6 @@ See Documentation/arch/x86/sgx.rst for more details. 7.26 KVM_CAP_PPC_RPT_INVALIDATE ------------------------------- -:Capability: KVM_CAP_PPC_RPT_INVALIDATE :Architectures: ppc :Type: vm @@ -8041,7 +8040,6 @@ upgrading the VMM process without interrupting the guest. 7.30 KVM_CAP_PPC_AIL_MODE_3 ------------------------------- -:Capability: KVM_CAP_PPC_AIL_MODE_3 :Architectures: ppc :Type: vm @@ -8055,7 +8053,6 @@ handling interrupts and system calls. 7.31 KVM_CAP_DISABLE_QUIRKS2 ---------------------------- -:Capability: KVM_CAP_DISABLE_QUIRKS2 :Parameters: args[0] - set of KVM quirks to disable :Architectures: x86 :Type: vm @@ -8895,7 +8892,6 @@ leaf. 8.34 KVM_CAP_EXIT_HYPERCALL --------------------------- -:Capability: KVM_CAP_EXIT_HYPERCALL :Architectures: x86 :Type: vm @@ -8914,7 +8910,6 @@ ENOSYS for the others. 
8.35 KVM_CAP_PMU_CAPABILITY --------------------------- -:Capability: KVM_CAP_PMU_CAPABILITY :Architectures: x86 :Type: vm :Parameters: arg[0] is bitmask of PMU virtualization capabilities. @@ -8936,7 +8931,6 @@ should adjust CPUID leaf 0xA to reflect that the PMU is disabled. 8.36 KVM_CAP_ARM_SYSTEM_SUSPEND ------------------------------- -:Capability: KVM_CAP_ARM_SYSTEM_SUSPEND :Architectures: arm64 :Type: vm @@ -8946,7 +8940,6 @@ type KVM_SYSTEM_EVENT_SUSPEND to process the guest suspend request. 8.37 KVM_CAP_S390_PROTECTED_DUMP -------------------------------- -:Capability: KVM_CAP_S390_PROTECTED_DUMP :Architectures: s390 :Type: vm @@ -8959,7 +8952,6 @@ available and supports the `KVM_PV_DUMP_CPU` subcommand. 8.38 KVM_CAP_VM_DISABLE_NX_HUGE_PAGES ------------------------------------- -:Capability: KVM_CAP_VM_DISABLE_NX_HUGE_PAGES :Architectures: x86 :Type: vm :Parameters: arg[0] must be 0. @@ -8976,7 +8968,6 @@ This capability may only be set before any vCPUs are created. 8.39 KVM_CAP_S390_CPU_TOPOLOGY ------------------------------ -:Capability: KVM_CAP_S390_CPU_TOPOLOGY :Architectures: s390 :Type: vm @@ -9001,7 +8992,6 @@ must point to a byte where the value will be stored or retrieved from. 8.40 KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE --------------------------------------- -:Capability: KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE :Architectures: arm64 :Type: vm :Parameters: arg[0] is the new split chunk size. @@ -9028,7 +9018,6 @@ block sizes is exposed in KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES as a 8.41 KVM_CAP_VM_TYPES --------------------- -:Capability: KVM_CAP_MEMORY_ATTRIBUTES :Architectures: x86 :Type: system ioctl From ed7974fd592bc0d3649a42725f5e0b13ea466010 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 1 Apr 2025 14:54:40 +0200 Subject: [PATCH 14/26] Documentation: kvm: fix some definition lists Ensure that they have a ":" in front of the defined item. Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index eec775f04df60..2d8920d1d594a 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -7927,10 +7927,10 @@ by POWER10 processor. 7.24 KVM_CAP_VM_COPY_ENC_CONTEXT_FROM ------------------------------------- -Architectures: x86 SEV enabled -Type: vm -Parameters: args[0] is the fd of the source vm -Returns: 0 on success; ENOTTY on error +:Architectures: x86 SEV enabled +:Type: vm +:Parameters: args[0] is the fd of the source vm +:Returns: 0 on success; ENOTTY on error This capability enables userspace to copy encryption context from the vm indicated by the fd to the vm this is called on. @@ -8647,7 +8647,7 @@ limit the attack surface on KVM's MSR emulation code. 8.28 KVM_CAP_ENFORCE_PV_FEATURE_CPUID ------------------------------------- -Architectures: x86 +:Architectures: x86 When enabled, KVM will disable paravirtual features provided to the guest according to the bits in the KVM_CPUID_FEATURES CPUID leaf @@ -8881,7 +8881,7 @@ available to the guest on migration. 8.33 KVM_CAP_HYPERV_ENFORCE_CPUID --------------------------------- -Architectures: x86 +:Architectures: x86 When enabled, KVM will disable emulated Hyper-V features provided to the guest according to the bits Hyper-V CPUID feature leaves. 
Otherwise, all From af339282e203fb3fa99790c6c48ced3c27cc7bd9 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 1 Apr 2025 14:57:39 +0200 Subject: [PATCH 15/26] Documentation: kvm: organize capabilities in the right section Categorize the capabilities correctly. Section 6 is for enabled vCPU capabilities; section 7 is for enabled VM capabilities; section 8 is for informational ones. Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 750 +++++++++++++++++---------------- 1 file changed, 376 insertions(+), 374 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 2d8920d1d594a..3a8605f88dc54 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -7447,6 +7447,75 @@ Unused bitfields in the bitarrays must be set to zero. This capability connects the vcpu to an in-kernel XIVE device. +6.76 KVM_CAP_HYPERV_SYNIC +------------------------- + +:Architectures: x86 +:Target: vcpu + +This capability, if KVM_CHECK_EXTENSION indicates that it is +available, means that the kernel has an implementation of the +Hyper-V Synthetic interrupt controller(SynIC). Hyper-V SynIC is +used to support Windows Hyper-V based guest paravirt drivers(VMBus). + +In order to use SynIC, it has to be activated by setting this +capability via KVM_ENABLE_CAP ioctl on the vcpu fd. Note that this +will disable the use of APIC hardware virtualization even if supported +by the CPU, as it's incompatible with SynIC auto-EOI behavior. + +6.77 KVM_CAP_HYPERV_SYNIC2 +-------------------------- + +:Architectures: x86 +:Target: vcpu + +This capability enables a newer version of Hyper-V Synthetic interrupt +controller (SynIC). The only difference with KVM_CAP_HYPERV_SYNIC is that KVM +doesn't clear SynIC message and event flags pages when they are enabled by +writing to the respective MSRs. + +6.78 KVM_CAP_HYPERV_DIRECT_TLBFLUSH +----------------------------------- + +:Architectures: x86 +:Target: vcpu + +This capability indicates that KVM running on top of Hyper-V hypervisor +enables Direct TLB flush for its guests meaning that TLB flush +hypercalls are handled by Level 0 hypervisor (Hyper-V) bypassing KVM. +Due to the different ABI for hypercall parameters between Hyper-V and +KVM, enabling this capability effectively disables all hypercall +handling by KVM (as some KVM hypercall may be mistakenly treated as TLB +flush hypercalls by Hyper-V) so userspace should disable KVM identification +in CPUID and only exposes Hyper-V identification. In this case, guest +thinks it's running on Hyper-V and only use Hyper-V hypercalls. + +6.79 KVM_CAP_HYPERV_ENFORCE_CPUID +--------------------------------- + +:Architectures: x86 +:Target: vcpu + +When enabled, KVM will disable emulated Hyper-V features provided to the +guest according to the bits Hyper-V CPUID feature leaves. Otherwise, all +currently implemented Hyper-V features are provided unconditionally when +Hyper-V identification is set in the HYPERV_CPUID_INTERFACE (0x40000001) +leaf. + +6.80 KVM_CAP_ENFORCE_PV_FEATURE_CPUID +------------------------------------- + +:Architectures: x86 +:Target: vcpu + +When enabled, KVM will disable paravirtual features provided to the +guest according to the bits in the KVM_CPUID_FEATURES CPUID leaf +(0x40000001). Otherwise, a guest may use the paravirtual features +regardless of what has actually been exposed through the CPUID leaf. + +.. _KVM_CAP_DIRTY_LOG_RING: + + .. _cap_enable_vm: 7. Capabilities that can be enabled on VMs @@ -7963,23 +8032,6 @@ default. 
See Documentation/arch/x86/sgx.rst for more details. -7.26 KVM_CAP_PPC_RPT_INVALIDATE -------------------------------- - -:Architectures: ppc -:Type: vm - -This capability indicates that the kernel is capable of handling -H_RPT_INVALIDATE hcall. - -In order to enable the use of H_RPT_INVALIDATE in the guest, -user space might have to advertise it for the guest. For example, -IBM pSeries (sPAPR) guest starts using it if "hcall-rpt-invalidate" is -present in the "ibm,hypertas-functions" device-tree property. - -This capability is enabled for hypervisors on platforms like POWER9 -that support radix MMU. - 7.27 KVM_CAP_EXIT_ON_EMULATION_FAILURE -------------------------------------- @@ -8037,19 +8089,6 @@ indicated by the fd to the VM this is called on. This is intended to support intra-host migration of VMs between userspace VMMs, upgrading the VMM process without interrupting the guest. -7.30 KVM_CAP_PPC_AIL_MODE_3 -------------------------------- - -:Architectures: ppc -:Type: vm - -This capability indicates that the kernel supports the mode 3 setting for the -"Address Translation Mode on Interrupt" aka "Alternate Interrupt Location" -resource that is controlled with the H_SET_MODE hypercall. - -This capability allows a guest kernel to use a better-performance mode for -handling interrupts and system calls. - 7.31 KVM_CAP_DISABLE_QUIRKS2 ---------------------------- @@ -8207,27 +8246,6 @@ This capability is aimed to mitigate the threat that malicious VMs can cause CPU stuck (due to event windows don't open up) and make the CPU unavailable to host or other VMs. -7.34 KVM_CAP_MEMORY_FAULT_INFO ------------------------------- - -:Architectures: x86 -:Returns: Informational only, -EINVAL on direct KVM_ENABLE_CAP. - -The presence of this capability indicates that KVM_RUN will fill -kvm_run.memory_fault if KVM cannot resolve a guest page fault VM-Exit, e.g. if -there is a valid memslot but no backing VMA for the corresponding host virtual -address. - -The information in kvm_run.memory_fault is valid if and only if KVM_RUN returns -an error with errno=EFAULT or errno=EHWPOISON *and* kvm_run.exit_reason is set -to KVM_EXIT_MEMORY_FAULT. - -Note: Userspaces which attempt to resolve memory faults so that they can retry -KVM_RUN are encouraged to guard against repeatedly receiving the same -error/annotated fault. - -See KVM_EXIT_MEMORY_FAULT for more information. - 7.35 KVM_CAP_X86_APIC_BUS_CYCLES_NS ----------------------------------- @@ -8245,86 +8263,272 @@ by KVM_CHECK_EXTENSION. Note: Userspace is responsible for correctly configuring CPUID 0x15, a.k.a. the core crystal clock frequency, if a non-zero CPUID 0x15 is exposed to the guest. -7.36 KVM_CAP_X86_GUEST_MODE ------------------------------- - -:Architectures: x86 -:Returns: Informational only, -EINVAL on direct KVM_ENABLE_CAP. - -The presence of this capability indicates that KVM_RUN will update the -KVM_RUN_X86_GUEST_MODE bit in kvm_run.flags to indicate whether the -vCPU was executing nested guest code when it exited. +7.36 KVM_CAP_DIRTY_LOG_RING/KVM_CAP_DIRTY_LOG_RING_ACQ_REL +---------------------------------------------------------- -KVM exits with the register state of either the L1 or L2 guest -depending on which executed at the time of an exit. Userspace must -take care to differentiate between these cases. 
+:Architectures: x86, arm64 +:Type: vm +:Parameters: args[0] - size of the dirty log ring -7.37 KVM_CAP_ARM_WRITABLE_IMP_ID_REGS -------------------------------------- +KVM is capable of tracking dirty memory using ring buffers that are +mmapped into userspace; there is one dirty ring per vcpu. -:Architectures: arm64 -:Target: VM -:Parameters: None -:Returns: 0 on success, -EINVAL if vCPUs have been created before enabling this - capability. +The dirty ring is available to userspace as an array of +``struct kvm_dirty_gfn``. Each dirty entry is defined as:: -This capability changes the behavior of the registers that identify a PE -implementation of the Arm architecture: MIDR_EL1, REVIDR_EL1, and AIDR_EL1. -By default, these registers are visible to userspace but treated as invariant. + struct kvm_dirty_gfn { + __u32 flags; + __u32 slot; /* as_id | slot_id */ + __u64 offset; + }; -When this capability is enabled, KVM allows userspace to change the -aforementioned registers before the first KVM_RUN. These registers are VM -scoped, meaning that the same set of values are presented on all vCPUs in a -given VM. +The following values are defined for the flags field to define the +current state of the entry:: -8. Other capabilities. -====================== + #define KVM_DIRTY_GFN_F_DIRTY BIT(0) + #define KVM_DIRTY_GFN_F_RESET BIT(1) + #define KVM_DIRTY_GFN_F_MASK 0x3 -This section lists capabilities that give information about other -features of the KVM implementation. +Userspace should call KVM_ENABLE_CAP ioctl right after KVM_CREATE_VM +ioctl to enable this capability for the new guest and set the size of +the rings. Enabling the capability is only allowed before creating any +vCPU, and the size of the ring must be a power of two. The larger the +ring buffer, the less likely the ring is full and the VM is forced to +exit to userspace. The optimal size depends on the workload, but it is +recommended that it be at least 64 KiB (4096 entries). -8.1 KVM_CAP_PPC_HWRNG ---------------------- +Just like for dirty page bitmaps, the buffer tracks writes to +all user memory regions for which the KVM_MEM_LOG_DIRTY_PAGES flag was +set in KVM_SET_USER_MEMORY_REGION. Once a memory region is registered +with the flag set, userspace can start harvesting dirty pages from the +ring buffer. -:Architectures: ppc +An entry in the ring buffer can be unused (flag bits ``00``), +dirty (flag bits ``01``) or harvested (flag bits ``1X``). The +state machine for the entry is as follows:: -This capability, if KVM_CHECK_EXTENSION indicates that it is -available, means that the kernel has an implementation of the -H_RANDOM hypercall backed by a hardware random-number generator. -If present, the kernel H_RANDOM handler can be enabled for guest use -with the KVM_CAP_PPC_ENABLE_HCALL capability. + dirtied harvested reset + 00 -----------> 01 -------------> 1X -------+ + ^ | + | | + +------------------------------------------+ -8.2 KVM_CAP_HYPERV_SYNIC ------------------------- +To harvest the dirty pages, userspace accesses the mmapped ring buffer +to read the dirty GFNs. If the flags has the DIRTY bit set (at this stage +the RESET bit must be cleared), then it means this GFN is a dirty GFN. +The userspace should harvest this GFN and mark the flags from state +``01b`` to ``1Xb`` (bit 0 will be ignored by KVM, but bit 1 must be set +to show that this GFN is harvested and waiting for a reset), and move +on to the next GFN. 
The userspace should continue to do this until the +flags of a GFN have the DIRTY bit cleared, meaning that it has harvested +all the dirty GFNs that were available. -:Architectures: x86 +Note that on weakly ordered architectures, userspace accesses to the +ring buffer (and more specifically the 'flags' field) must be ordered, +using load-acquire/store-release accessors when available, or any +other memory barrier that will ensure this ordering. -This capability, if KVM_CHECK_EXTENSION indicates that it is -available, means that the kernel has an implementation of the -Hyper-V Synthetic interrupt controller(SynIC). Hyper-V SynIC is -used to support Windows Hyper-V based guest paravirt drivers(VMBus). +It's not necessary for userspace to harvest the all dirty GFNs at once. +However it must collect the dirty GFNs in sequence, i.e., the userspace +program cannot skip one dirty GFN to collect the one next to it. -In order to use SynIC, it has to be activated by setting this -capability via KVM_ENABLE_CAP ioctl on the vcpu fd. Note that this -will disable the use of APIC hardware virtualization even if supported -by the CPU, as it's incompatible with SynIC auto-EOI behavior. +After processing one or more entries in the ring buffer, userspace +calls the VM ioctl KVM_RESET_DIRTY_RINGS to notify the kernel about +it, so that the kernel will reprotect those collected GFNs. +Therefore, the ioctl must be called *before* reading the content of +the dirty pages. -8.3 KVM_CAP_PPC_MMU_RADIX -------------------------- +The dirty ring can get full. When it happens, the KVM_RUN of the +vcpu will return with exit reason KVM_EXIT_DIRTY_LOG_FULL. -:Architectures: ppc +The dirty ring interface has a major difference comparing to the +KVM_GET_DIRTY_LOG interface in that, when reading the dirty ring from +userspace, it's still possible that the kernel has not yet flushed the +processor's dirty page buffers into the kernel buffer (with dirty bitmaps, the +flushing is done by the KVM_GET_DIRTY_LOG ioctl). To achieve that, one +needs to kick the vcpu out of KVM_RUN using a signal. The resulting +vmexit ensures that all dirty GFNs are flushed to the dirty rings. -This capability, if KVM_CHECK_EXTENSION indicates that it is -available, means that the kernel can support guests using the -radix MMU defined in Power ISA V3.00 (as implemented in the POWER9 -processor). +NOTE: KVM_CAP_DIRTY_LOG_RING_ACQ_REL is the only capability that +should be exposed by weakly ordered architecture, in order to indicate +the additional memory ordering requirements imposed on userspace when +reading the state of an entry and mutating it from DIRTY to HARVESTED. +Architecture with TSO-like ordering (such as x86) are allowed to +expose both KVM_CAP_DIRTY_LOG_RING and KVM_CAP_DIRTY_LOG_RING_ACQ_REL +to userspace. -8.4 KVM_CAP_PPC_MMU_HASH_V3 ---------------------------- +After enabling the dirty rings, the userspace needs to detect the +capability of KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP to see whether the +ring structures can be backed by per-slot bitmaps. With this capability +advertised, it means the architecture can dirty guest pages without +vcpu/ring context, so that some of the dirty information will still be +maintained in the bitmap structure. KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP +can't be enabled if the capability of KVM_CAP_DIRTY_LOG_RING_ACQ_REL +hasn't been enabled, or any memslot has been existing. -:Architectures: ppc +Note that the bitmap here is only a backup of the ring structure. 
The +use of the ring and bitmap combination is only beneficial if there is +only a very small amount of memory that is dirtied out of vcpu/ring +context. Otherwise, the stand-alone per-slot bitmap mechanism needs to +be considered. -This capability, if KVM_CHECK_EXTENSION indicates that it is +To collect dirty bits in the backup bitmap, userspace can use the same +KVM_GET_DIRTY_LOG ioctl. KVM_CLEAR_DIRTY_LOG isn't needed as long as all +the generation of the dirty bits is done in a single pass. Collecting +the dirty bitmap should be the very last thing that the VMM does before +considering the state as complete. VMM needs to ensure that the dirty +state is final and avoid missing dirty pages from another ioctl ordered +after the bitmap collection. + +NOTE: Multiple examples of using the backup bitmap: (1) save vgic/its +tables through command KVM_DEV_ARM_{VGIC_GRP_CTRL, ITS_SAVE_TABLES} on +KVM device "kvm-arm-vgic-its". (2) restore vgic/its tables through +command KVM_DEV_ARM_{VGIC_GRP_CTRL, ITS_RESTORE_TABLES} on KVM device +"kvm-arm-vgic-its". VGICv3 LPI pending status is restored. (3) save +vgic3 pending table through KVM_DEV_ARM_VGIC_{GRP_CTRL, SAVE_PENDING_TABLES} +command on KVM device "kvm-arm-vgic-v3". + +7.37 KVM_CAP_PMU_CAPABILITY +--------------------------- + +:Architectures: x86 +:Type: vm +:Parameters: arg[0] is bitmask of PMU virtualization capabilities. +:Returns: 0 on success, -EINVAL when arg[0] contains invalid bits + +This capability alters PMU virtualization in KVM. + +Calling KVM_CHECK_EXTENSION for this capability returns a bitmask of +PMU virtualization capabilities that can be adjusted on a VM. + +The argument to KVM_ENABLE_CAP is also a bitmask and selects specific +PMU virtualization capabilities to be applied to the VM. This can +only be invoked on a VM prior to the creation of VCPUs. + +At this time, KVM_PMU_CAP_DISABLE is the only capability. Setting +this capability will disable PMU virtualization for that VM. Usermode +should adjust CPUID leaf 0xA to reflect that the PMU is disabled. + +7.38 KVM_CAP_VM_DISABLE_NX_HUGE_PAGES +------------------------------------- + +:Architectures: x86 +:Type: vm +:Parameters: arg[0] must be 0. +:Returns: 0 on success, -EPERM if the userspace process does not + have CAP_SYS_BOOT, -EINVAL if args[0] is not 0 or any vCPUs have been + created. + +This capability disables the NX huge pages mitigation for iTLB MULTIHIT. + +The capability has no effect if the nx_huge_pages module parameter is not set. + +This capability may only be set before any vCPUs are created. + +7.39 KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE +--------------------------------------- + +:Architectures: arm64 +:Type: vm +:Parameters: arg[0] is the new split chunk size. +:Returns: 0 on success, -EINVAL if any memslot was already created. + +This capability sets the chunk size used in Eager Page Splitting. + +Eager Page Splitting improves the performance of dirty-logging (used +in live migrations) when guest memory is backed by huge-pages. It +avoids splitting huge-pages (into PAGE_SIZE pages) on fault, by doing +it eagerly when enabling dirty logging (with the +KVM_MEM_LOG_DIRTY_PAGES flag for a memory region), or when using +KVM_CLEAR_DIRTY_LOG. + +The chunk size specifies how many pages to break at a time, using a +single allocation for each chunk. Bigger the chunk size, more pages +need to be allocated ahead of time. + +The chunk size needs to be a valid block size. 
The list of acceptable +block sizes is exposed in KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES as a +64-bit bitmap (each bit describing a block size). The default value is +0, to disable the eager page splitting. + +7.40 KVM_CAP_EXIT_HYPERCALL +--------------------------- + +:Architectures: x86 +:Type: vm + +This capability, if enabled, will cause KVM to exit to userspace +with KVM_EXIT_HYPERCALL exit reason to process some hypercalls. + +Calling KVM_CHECK_EXTENSION for this capability will return a bitmask +of hypercalls that can be configured to exit to userspace. +Right now, the only such hypercall is KVM_HC_MAP_GPA_RANGE. + +The argument to KVM_ENABLE_CAP is also a bitmask, and must be a subset +of the result of KVM_CHECK_EXTENSION. KVM will forward to userspace +the hypercalls whose corresponding bit is in the argument, and return +ENOSYS for the others. + +7.41 KVM_CAP_ARM_SYSTEM_SUSPEND +------------------------------- + +:Architectures: arm64 +:Type: vm + +When enabled, KVM will exit to userspace with KVM_EXIT_SYSTEM_EVENT of +type KVM_SYSTEM_EVENT_SUSPEND to process the guest suspend request. + +7.37 KVM_CAP_ARM_WRITABLE_IMP_ID_REGS +------------------------------------- + +:Architectures: arm64 +:Target: VM +:Parameters: None +:Returns: 0 on success, -EINVAL if vCPUs have been created before enabling this + capability. + +This capability changes the behavior of the registers that identify a PE +implementation of the Arm architecture: MIDR_EL1, REVIDR_EL1, and AIDR_EL1. +By default, these registers are visible to userspace but treated as invariant. + +When this capability is enabled, KVM allows userspace to change the +aforementioned registers before the first KVM_RUN. These registers are VM +scoped, meaning that the same set of values are presented on all vCPUs in a +given VM. + +8. Other capabilities. +====================== + +This section lists capabilities that give information about other +features of the KVM implementation. + +8.1 KVM_CAP_PPC_HWRNG +--------------------- + +:Architectures: ppc + +This capability, if KVM_CHECK_EXTENSION indicates that it is +available, means that the kernel has an implementation of the +H_RANDOM hypercall backed by a hardware random-number generator. +If present, the kernel H_RANDOM handler can be enabled for guest use +with the KVM_CAP_PPC_ENABLE_HCALL capability. + +8.3 KVM_CAP_PPC_MMU_RADIX +------------------------- + +:Architectures: ppc + +This capability, if KVM_CHECK_EXTENSION indicates that it is +available, means that the kernel can support guests using the +radix MMU defined in Power ISA V3.00 (as implemented in the POWER9 +processor). + +8.4 KVM_CAP_PPC_MMU_HASH_V3 +--------------------------- + +:Architectures: ppc + +This capability, if KVM_CHECK_EXTENSION indicates that it is available, means that the kernel can support guests using the hashed page table MMU defined in Power ISA V3.00 (as implemented in the POWER9 processor), including in-memory segment tables. @@ -8454,16 +8658,6 @@ virtual SMT modes that can be set using KVM_CAP_PPC_SMT. If bit N (counting from the right) is set, then a virtual SMT mode of 2^N is available. -8.11 KVM_CAP_HYPERV_SYNIC2 --------------------------- - -:Architectures: x86 - -This capability enables a newer version of Hyper-V Synthetic interrupt -controller (SynIC). The only difference with KVM_CAP_HYPERV_SYNIC is that KVM -doesn't clear SynIC message and event flags pages when they are enabled by -writing to the respective MSRs. 
- 8.12 KVM_CAP_HYPERV_VP_INDEX ---------------------------- @@ -8478,7 +8672,6 @@ capability is absent, userspace can still query this msr's value. ------------------------------- :Architectures: s390 -:Parameters: none This capability indicates if the flic device will be able to get/set the AIS states for migration via the KVM_DEV_FLIC_AISM_ALL attribute and allows @@ -8552,21 +8745,6 @@ This capability indicates that KVM supports paravirtualized Hyper-V IPI send hypercalls: HvCallSendSyntheticClusterIpi, HvCallSendSyntheticClusterIpiEx. -8.21 KVM_CAP_HYPERV_DIRECT_TLBFLUSH ------------------------------------ - -:Architectures: x86 - -This capability indicates that KVM running on top of Hyper-V hypervisor -enables Direct TLB flush for its guests meaning that TLB flush -hypercalls are handled by Level 0 hypervisor (Hyper-V) bypassing KVM. -Due to the different ABI for hypercall parameters between Hyper-V and -KVM, enabling this capability effectively disables all hypercall -handling by KVM (as some KVM hypercall may be mistakenly treated as TLB -flush hypercalls by Hyper-V) so userspace should disable KVM identification -in CPUID and only exposes Hyper-V identification. In this case, guest -thinks it's running on Hyper-V and only use Hyper-V hypercalls. - 8.22 KVM_CAP_S390_VCPU_RESETS ----------------------------- @@ -8644,142 +8822,6 @@ In combination with KVM_CAP_X86_USER_SPACE_MSR, this allows user space to trap and emulate MSRs that are outside of the scope of KVM as well as limit the attack surface on KVM's MSR emulation code. -8.28 KVM_CAP_ENFORCE_PV_FEATURE_CPUID -------------------------------------- - -:Architectures: x86 - -When enabled, KVM will disable paravirtual features provided to the -guest according to the bits in the KVM_CPUID_FEATURES CPUID leaf -(0x40000001). Otherwise, a guest may use the paravirtual features -regardless of what has actually been exposed through the CPUID leaf. - -.. _KVM_CAP_DIRTY_LOG_RING: - -8.29 KVM_CAP_DIRTY_LOG_RING/KVM_CAP_DIRTY_LOG_RING_ACQ_REL ----------------------------------------------------------- - -:Architectures: x86, arm64 -:Parameters: args[0] - size of the dirty log ring - -KVM is capable of tracking dirty memory using ring buffers that are -mmapped into userspace; there is one dirty ring per vcpu. - -The dirty ring is available to userspace as an array of -``struct kvm_dirty_gfn``. Each dirty entry is defined as:: - - struct kvm_dirty_gfn { - __u32 flags; - __u32 slot; /* as_id | slot_id */ - __u64 offset; - }; - -The following values are defined for the flags field to define the -current state of the entry:: - - #define KVM_DIRTY_GFN_F_DIRTY BIT(0) - #define KVM_DIRTY_GFN_F_RESET BIT(1) - #define KVM_DIRTY_GFN_F_MASK 0x3 - -Userspace should call KVM_ENABLE_CAP ioctl right after KVM_CREATE_VM -ioctl to enable this capability for the new guest and set the size of -the rings. Enabling the capability is only allowed before creating any -vCPU, and the size of the ring must be a power of two. The larger the -ring buffer, the less likely the ring is full and the VM is forced to -exit to userspace. The optimal size depends on the workload, but it is -recommended that it be at least 64 KiB (4096 entries). - -Just like for dirty page bitmaps, the buffer tracks writes to -all user memory regions for which the KVM_MEM_LOG_DIRTY_PAGES flag was -set in KVM_SET_USER_MEMORY_REGION. Once a memory region is registered -with the flag set, userspace can start harvesting dirty pages from the -ring buffer. 
- -An entry in the ring buffer can be unused (flag bits ``00``), -dirty (flag bits ``01``) or harvested (flag bits ``1X``). The -state machine for the entry is as follows:: - - dirtied harvested reset - 00 -----------> 01 -------------> 1X -------+ - ^ | - | | - +------------------------------------------+ - -To harvest the dirty pages, userspace accesses the mmapped ring buffer -to read the dirty GFNs. If the flags has the DIRTY bit set (at this stage -the RESET bit must be cleared), then it means this GFN is a dirty GFN. -The userspace should harvest this GFN and mark the flags from state -``01b`` to ``1Xb`` (bit 0 will be ignored by KVM, but bit 1 must be set -to show that this GFN is harvested and waiting for a reset), and move -on to the next GFN. The userspace should continue to do this until the -flags of a GFN have the DIRTY bit cleared, meaning that it has harvested -all the dirty GFNs that were available. - -Note that on weakly ordered architectures, userspace accesses to the -ring buffer (and more specifically the 'flags' field) must be ordered, -using load-acquire/store-release accessors when available, or any -other memory barrier that will ensure this ordering. - -It's not necessary for userspace to harvest the all dirty GFNs at once. -However it must collect the dirty GFNs in sequence, i.e., the userspace -program cannot skip one dirty GFN to collect the one next to it. - -After processing one or more entries in the ring buffer, userspace -calls the VM ioctl KVM_RESET_DIRTY_RINGS to notify the kernel about -it, so that the kernel will reprotect those collected GFNs. -Therefore, the ioctl must be called *before* reading the content of -the dirty pages. - -The dirty ring can get full. When it happens, the KVM_RUN of the -vcpu will return with exit reason KVM_EXIT_DIRTY_LOG_FULL. - -The dirty ring interface has a major difference comparing to the -KVM_GET_DIRTY_LOG interface in that, when reading the dirty ring from -userspace, it's still possible that the kernel has not yet flushed the -processor's dirty page buffers into the kernel buffer (with dirty bitmaps, the -flushing is done by the KVM_GET_DIRTY_LOG ioctl). To achieve that, one -needs to kick the vcpu out of KVM_RUN using a signal. The resulting -vmexit ensures that all dirty GFNs are flushed to the dirty rings. - -NOTE: KVM_CAP_DIRTY_LOG_RING_ACQ_REL is the only capability that -should be exposed by weakly ordered architecture, in order to indicate -the additional memory ordering requirements imposed on userspace when -reading the state of an entry and mutating it from DIRTY to HARVESTED. -Architecture with TSO-like ordering (such as x86) are allowed to -expose both KVM_CAP_DIRTY_LOG_RING and KVM_CAP_DIRTY_LOG_RING_ACQ_REL -to userspace. - -After enabling the dirty rings, the userspace needs to detect the -capability of KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP to see whether the -ring structures can be backed by per-slot bitmaps. With this capability -advertised, it means the architecture can dirty guest pages without -vcpu/ring context, so that some of the dirty information will still be -maintained in the bitmap structure. KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP -can't be enabled if the capability of KVM_CAP_DIRTY_LOG_RING_ACQ_REL -hasn't been enabled, or any memslot has been existing. - -Note that the bitmap here is only a backup of the ring structure. The -use of the ring and bitmap combination is only beneficial if there is -only a very small amount of memory that is dirtied out of vcpu/ring -context. 
Otherwise, the stand-alone per-slot bitmap mechanism needs to -be considered. - -To collect dirty bits in the backup bitmap, userspace can use the same -KVM_GET_DIRTY_LOG ioctl. KVM_CLEAR_DIRTY_LOG isn't needed as long as all -the generation of the dirty bits is done in a single pass. Collecting -the dirty bitmap should be the very last thing that the VMM does before -considering the state as complete. VMM needs to ensure that the dirty -state is final and avoid missing dirty pages from another ioctl ordered -after the bitmap collection. - -NOTE: Multiple examples of using the backup bitmap: (1) save vgic/its -tables through command KVM_DEV_ARM_{VGIC_GRP_CTRL, ITS_SAVE_TABLES} on -KVM device "kvm-arm-vgic-its". (2) restore vgic/its tables through -command KVM_DEV_ARM_{VGIC_GRP_CTRL, ITS_RESTORE_TABLES} on KVM device -"kvm-arm-vgic-its". VGICv3 LPI pending status is restored. (3) save -vgic3 pending table through KVM_DEV_ARM_VGIC_{GRP_CTRL, SAVE_PENDING_TABLES} -command on KVM device "kvm-arm-vgic-v3". - 8.30 KVM_CAP_XEN_HVM -------------------- @@ -8878,65 +8920,6 @@ This capability indicates that the KVM virtual PTP service is supported in the host. A VMM can check whether the service is available to the guest on migration. -8.33 KVM_CAP_HYPERV_ENFORCE_CPUID ---------------------------------- - -:Architectures: x86 - -When enabled, KVM will disable emulated Hyper-V features provided to the -guest according to the bits Hyper-V CPUID feature leaves. Otherwise, all -currently implemented Hyper-V features are provided unconditionally when -Hyper-V identification is set in the HYPERV_CPUID_INTERFACE (0x40000001) -leaf. - -8.34 KVM_CAP_EXIT_HYPERCALL ---------------------------- - -:Architectures: x86 -:Type: vm - -This capability, if enabled, will cause KVM to exit to userspace -with KVM_EXIT_HYPERCALL exit reason to process some hypercalls. - -Calling KVM_CHECK_EXTENSION for this capability will return a bitmask -of hypercalls that can be configured to exit to userspace. -Right now, the only such hypercall is KVM_HC_MAP_GPA_RANGE. - -The argument to KVM_ENABLE_CAP is also a bitmask, and must be a subset -of the result of KVM_CHECK_EXTENSION. KVM will forward to userspace -the hypercalls whose corresponding bit is in the argument, and return -ENOSYS for the others. - -8.35 KVM_CAP_PMU_CAPABILITY ---------------------------- - -:Architectures: x86 -:Type: vm -:Parameters: arg[0] is bitmask of PMU virtualization capabilities. -:Returns: 0 on success, -EINVAL when arg[0] contains invalid bits - -This capability alters PMU virtualization in KVM. - -Calling KVM_CHECK_EXTENSION for this capability returns a bitmask of -PMU virtualization capabilities that can be adjusted on a VM. - -The argument to KVM_ENABLE_CAP is also a bitmask and selects specific -PMU virtualization capabilities to be applied to the VM. This can -only be invoked on a VM prior to the creation of VCPUs. - -At this time, KVM_PMU_CAP_DISABLE is the only capability. Setting -this capability will disable PMU virtualization for that VM. Usermode -should adjust CPUID leaf 0xA to reflect that the PMU is disabled. - -8.36 KVM_CAP_ARM_SYSTEM_SUSPEND -------------------------------- - -:Architectures: arm64 -:Type: vm - -When enabled, KVM will exit to userspace with KVM_EXIT_SYSTEM_EVENT of -type KVM_SYSTEM_EVENT_SUSPEND to process the guest suspend request. - 8.37 KVM_CAP_S390_PROTECTED_DUMP -------------------------------- @@ -8949,22 +8932,6 @@ PV guests. The `KVM_PV_DUMP` command is available for the dump related UV data. 
Also the vcpu ioctl `KVM_S390_PV_CPU_COMMAND` is available and supports the `KVM_PV_DUMP_CPU` subcommand. -8.38 KVM_CAP_VM_DISABLE_NX_HUGE_PAGES -------------------------------------- - -:Architectures: x86 -:Type: vm -:Parameters: arg[0] must be 0. -:Returns: 0 on success, -EPERM if the userspace process does not - have CAP_SYS_BOOT, -EINVAL if args[0] is not 0 or any vCPUs have been - created. - -This capability disables the NX huge pages mitigation for iTLB MULTIHIT. - -The capability has no effect if the nx_huge_pages module parameter is not set. - -This capability may only be set before any vCPUs are created. - 8.39 KVM_CAP_S390_CPU_TOPOLOGY ------------------------------ @@ -8989,32 +8956,6 @@ structure. When getting the Modified Change Topology Report value, the attr->addr must point to a byte where the value will be stored or retrieved from. -8.40 KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE ---------------------------------------- - -:Architectures: arm64 -:Type: vm -:Parameters: arg[0] is the new split chunk size. -:Returns: 0 on success, -EINVAL if any memslot was already created. - -This capability sets the chunk size used in Eager Page Splitting. - -Eager Page Splitting improves the performance of dirty-logging (used -in live migrations) when guest memory is backed by huge-pages. It -avoids splitting huge-pages (into PAGE_SIZE pages) on fault, by doing -it eagerly when enabling dirty logging (with the -KVM_MEM_LOG_DIRTY_PAGES flag for a memory region), or when using -KVM_CLEAR_DIRTY_LOG. - -The chunk size specifies how many pages to break at a time, using a -single allocation for each chunk. Bigger the chunk size, more pages -need to be allocated ahead of time. - -The chunk size needs to be a valid block size. The list of acceptable -block sizes is exposed in KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES as a -64-bit bitmap (each bit describing a block size). The default value is -0, to disable the eager page splitting. - 8.41 KVM_CAP_VM_TYPES --------------------- @@ -9034,6 +8975,67 @@ Do not use KVM_X86_SW_PROTECTED_VM for "real" VMs, and especially not in production. The behavior and effective ABI for software-protected VMs is unstable. +8.42 KVM_CAP_PPC_RPT_INVALIDATE +------------------------------- + +:Architectures: ppc + +This capability indicates that the kernel is capable of handling +H_RPT_INVALIDATE hcall. + +In order to enable the use of H_RPT_INVALIDATE in the guest, +user space might have to advertise it for the guest. For example, +IBM pSeries (sPAPR) guest starts using it if "hcall-rpt-invalidate" is +present in the "ibm,hypertas-functions" device-tree property. + +This capability is enabled for hypervisors on platforms like POWER9 +that support radix MMU. + +8.43 KVM_CAP_PPC_AIL_MODE_3 +--------------------------- + +:Architectures: ppc + +This capability indicates that the kernel supports the mode 3 setting for the +"Address Translation Mode on Interrupt" aka "Alternate Interrupt Location" +resource that is controlled with the H_SET_MODE hypercall. + +This capability allows a guest kernel to use a better-performance mode for +handling interrupts and system calls. + +8.44 KVM_CAP_MEMORY_FAULT_INFO +------------------------------ + +:Architectures: x86 + +The presence of this capability indicates that KVM_RUN will fill +kvm_run.memory_fault if KVM cannot resolve a guest page fault VM-Exit, e.g. if +there is a valid memslot but no backing VMA for the corresponding host virtual +address. 
+ +The information in kvm_run.memory_fault is valid if and only if KVM_RUN returns +an error with errno=EFAULT or errno=EHWPOISON *and* kvm_run.exit_reason is set +to KVM_EXIT_MEMORY_FAULT. + +Note: Userspaces which attempt to resolve memory faults so that they can retry +KVM_RUN are encouraged to guard against repeatedly receiving the same +error/annotated fault. + +See KVM_EXIT_MEMORY_FAULT for more information. + +8.45 KVM_CAP_X86_GUEST_MODE +--------------------------- + +:Architectures: x86 + +The presence of this capability indicates that KVM_RUN will update the +KVM_RUN_X86_GUEST_MODE bit in kvm_run.flags to indicate whether the +vCPU was executing nested guest code when it exited. + +KVM exits with the register state of either the L1 or L2 guest +depending on which executed at the time of an exit. Userspace must +take care to differentiate between these cases. + 9. Known KVM API problems ========================= From 269a2c3663c6d19526347e7c537b36e74e4df3e6 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 1 Apr 2025 15:04:06 +0200 Subject: [PATCH 16/26] Documentation: kvm: remove KVM_CAP_MIPS_TE Trap and emulate virtualization is not available anymore for MIPS. Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 3a8605f88dc54..47c7c3f92314e 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -8563,20 +8563,6 @@ may be incompatible with the MIPS VZ ASE. virtualization, including standard guest virtual memory segments. == ========================================================================== -8.6 KVM_CAP_MIPS_TE -------------------- - -:Architectures: mips - -This capability, if KVM_CHECK_EXTENSION on the main kvm handle indicates that -it is available, means that the trap & emulate implementation is available to -run guest code in user mode, even if KVM_CAP_MIPS_VZ indicates that hardware -assisted virtualisation is also available. KVM_VM_MIPS_TE (0) must be passed -to KVM_CREATE_VM to create a VM which utilises it. - -If KVM_CHECK_EXTENSION on a kvm VM handle indicates that this capability is -available, it means that the VM is using trap & emulate. - 8.7 KVM_CAP_MIPS_64BIT ---------------------- From ef01cac401f18647d62720cf773d7bb0541827da Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 1 Apr 2025 08:05:04 -0700 Subject: [PATCH 17/26] KVM: x86: Acquire SRCU in KVM_GET_MP_STATE to protect guest memory accesses Acquire a lock on kvm->srcu when userspace is getting MP state to handle a rather extreme edge case where "accepting" APIC events, i.e. processing pending INIT or SIPI, can trigger accesses to guest memory. If the vCPU is in L2 with INIT *and* a TRIPLE_FAULT request pending, then getting MP state will trigger a nested VM-Exit by way of ->check_nested_events(), and emuating the nested VM-Exit can access guest memory. The splat was originally hit by syzkaller on a Google-internal kernel, and reproduced on an upstream kernel by hacking the triple_fault_event_test selftest to stuff a pending INIT, store an MSR on VM-Exit (to generate a memory access on VMX), and do vcpu_mp_state_get() to trigger the scenario. ============================= WARNING: suspicious RCU usage 6.14.0-rc3-b112d356288b-vmx/pi_lockdep_false_pos-lock #3 Not tainted ----------------------------- include/linux/kvm_host.h:1058 suspicious rcu_dereference_check() usage! 
other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 1 lock held by triple_fault_ev/1256: #0: ffff88810df5a330 (&vcpu->mutex){+.+.}-{4:4}, at: kvm_vcpu_ioctl+0x8b/0x9a0 [kvm] stack backtrace: CPU: 11 UID: 1000 PID: 1256 Comm: triple_fault_ev Not tainted 6.14.0-rc3-b112d356288b-vmx #3 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 Call Trace: dump_stack_lvl+0x7f/0x90 lockdep_rcu_suspicious+0x144/0x190 kvm_vcpu_gfn_to_memslot+0x156/0x180 [kvm] kvm_vcpu_read_guest+0x3e/0x90 [kvm] read_and_check_msr_entry+0x2e/0x180 [kvm_intel] __nested_vmx_vmexit+0x550/0xde0 [kvm_intel] kvm_check_nested_events+0x1b/0x30 [kvm] kvm_apic_accept_events+0x33/0x100 [kvm] kvm_arch_vcpu_ioctl_get_mpstate+0x30/0x1d0 [kvm] kvm_vcpu_ioctl+0x33e/0x9a0 [kvm] __x64_sys_ioctl+0x8b/0xb0 do_syscall_64+0x6c/0x170 entry_SYSCALL_64_after_hwframe+0x4b/0x53 Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-ID: <20250401150504.829812-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c841817a914a3..3712dde0bf9d1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11786,6 +11786,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, if (kvm_mpx_supported()) kvm_load_guest_fpu(vcpu); + kvm_vcpu_srcu_read_lock(vcpu); + r = kvm_apic_accept_events(vcpu); if (r < 0) goto out; @@ -11799,6 +11801,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, mp_state->mp_state = vcpu->arch.mp_state; out: + kvm_vcpu_srcu_read_unlock(vcpu); + if (kvm_mpx_supported()) kvm_put_guest_fpu(vcpu); vcpu_put(vcpu); From 0297cdc12a87629ad904ac8c0630f7702f9a2d48 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 1 Apr 2025 07:22:38 -0700 Subject: [PATCH 18/26] KVM: selftests: Add option to rseq test to override /dev/cpu_dma_latency Add a "-l " param to the rseq test so that the user can override /dev/cpu_dma_latency, as described by the test's suggested workaround for not being able to complete enough migrations. cpu_dma_latency is not a normal file, even as far as procfs files go. Writes to cpu_dma_latency only persist so long as the file is open, e.g. so that the kernel automatically reverts back to a power-optimized state once the sensitive workload completes. Provide the necessary functionality instead of effectively forcing the user to write a non-obvious wrapper. 
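For reference, a minimal sketch of the /dev/cpu_dma_latency pattern described above (illustrative only, not part of the patch): the value written is a 32-bit latency bound in microseconds, and it is enforced only while the file descriptor remains open.

	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int32_t latency_us = 0;	/* 0 = forbid deep C-states while the fd is open */
		int fd = open("/dev/cpu_dma_latency", O_RDWR);

		if (fd < 0 || write(fd, &latency_us, sizeof(latency_us)) != sizeof(latency_us)) {
			perror("cpu_dma_latency");
			return 1;
		}

		pause();	/* run the wakeup-latency-sensitive workload here */

		close(fd);	/* the kernel reverts to its power-optimized policy */
		return 0;
	}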
Cc: Dongsheng Zhang Cc: Zide Chen Signed-off-by: Sean Christopherson Message-ID: <20250401142238.819487-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/rseq_test.c | 31 ++++++++++++++++++++----- 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/kvm/rseq_test.c b/tools/testing/selftests/kvm/rseq_test.c index e5898678bfab4..1375fca80bcdb 100644 --- a/tools/testing/selftests/kvm/rseq_test.c +++ b/tools/testing/selftests/kvm/rseq_test.c @@ -196,25 +196,27 @@ static void calc_min_max_cpu(void) static void help(const char *name) { puts(""); - printf("usage: %s [-h] [-u]\n", name); + printf("usage: %s [-h] [-u] [-l latency]\n", name); printf(" -u: Don't sanity check the number of successful KVM_RUNs\n"); + printf(" -l: Set /dev/cpu_dma_latency to suppress deep sleep states\n"); puts(""); exit(0); } int main(int argc, char *argv[]) { + int r, i, snapshot, opt, fd = -1, latency = -1; bool skip_sanity_check = false; - int r, i, snapshot; struct kvm_vm *vm; struct kvm_vcpu *vcpu; u32 cpu, rseq_cpu; - int opt; - while ((opt = getopt(argc, argv, "hu")) != -1) { + while ((opt = getopt(argc, argv, "hl:u")) != -1) { switch (opt) { case 'u': skip_sanity_check = true; + case 'l': + latency = atoi_paranoid(optarg); break; case 'h': default: @@ -243,6 +245,20 @@ int main(int argc, char *argv[]) pthread_create(&migration_thread, NULL, migration_worker, (void *)(unsigned long)syscall(SYS_gettid)); + if (latency >= 0) { + /* + * Writes to cpu_dma_latency persist only while the file is + * open, i.e. it allows userspace to provide guaranteed latency + * while running a workload. Keep the file open until the test + * completes, otherwise writing cpu_dma_latency is meaningless. + */ + fd = open("/dev/cpu_dma_latency", O_RDWR); + TEST_ASSERT(fd >= 0, __KVM_SYSCALL_ERROR("open() /dev/cpu_dma_latency", fd)); + + r = write(fd, &latency, 4); + TEST_ASSERT(r >= 1, "Error setting /dev/cpu_dma_latency"); + } + for (i = 0; !done; i++) { vcpu_run(vcpu); TEST_ASSERT(get_ucall(vcpu, NULL) == UCALL_SYNC, @@ -278,6 +294,9 @@ int main(int argc, char *argv[]) "rseq CPU = %d, sched CPU = %d", rseq_cpu, cpu); } + if (fd > 0) + close(fd); + /* * Sanity check that the test was able to enter the guest a reasonable * number of times, e.g. didn't get stalled too often/long waiting for @@ -293,8 +312,8 @@ int main(int argc, char *argv[]) TEST_ASSERT(skip_sanity_check || i > (NR_TASK_MIGRATIONS / 2), "Only performed %d KVM_RUNs, task stalled too much?\n\n" " Try disabling deep sleep states to reduce CPU wakeup latency,\n" - " e.g. via cpuidle.off=1 or setting /dev/cpu_dma_latency to '0',\n" - " or run with -u to disable this sanity check.", i); + " e.g. via cpuidle.off=1 or via -l , or run with -u to\n" + " disable this sanity check.", i); pthread_join(migration_thread, NULL); From 81d480fdf8b7d9b13fe87e1f0516f89794708094 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 14 Mar 2025 19:34:48 -0700 Subject: [PATCH 19/26] KVM: x86/mmu: Wrap sanity check on number of TDP MMU pages with KVM_PROVE_MMU Wrap the TDP MMU page counter in CONFIG_KVM_PROVE_MMU so that the sanity check is omitted from production builds, and more importantly to remove the atomic accesses to account pages. A one-off memory leak in production is relatively uninteresting, and a WARN_ON won't help mitigate a systemic issue; it's as much about helping triage memory leaks as it is about detecting them in the first place, and doesn't magically stop the leaks. I.e. 
production environments will be quite sad if a severe KVM bug escapes, regardless of whether or not KVM WARNs. Signed-off-by: Sean Christopherson Message-ID: <20250315023448.2358456-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 7 ++++++- arch/x86/kvm/mmu/tdp_mmu.c | 8 +++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a884ab544335e..3bdae454a9597 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1472,8 +1472,13 @@ struct kvm_arch { struct once nx_once; #ifdef CONFIG_X86_64 - /* The number of TDP MMU pages across all roots. */ +#ifdef CONFIG_KVM_PROVE_MMU + /* + * The number of TDP MMU pages across all roots. Used only to sanity + * check that KVM isn't leaking TDP MMU pages. + */ atomic64_t tdp_mmu_pages; +#endif /* * List of struct kvm_mmu_pages being used as roots. diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 7cc0564f5f97e..21a3b81662423 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -40,7 +40,9 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) kvm_tdp_mmu_invalidate_roots(kvm, KVM_VALID_ROOTS); kvm_tdp_mmu_zap_invalidated_roots(kvm, false); - WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages)); +#ifdef CONFIG_KVM_PROVE_MMU + KVM_MMU_WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages)); +#endif WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots)); /* @@ -325,13 +327,17 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { kvm_account_pgtable_pages((void *)sp->spt, +1); +#ifdef CONFIG_KVM_PROVE_MMU atomic64_inc(&kvm->arch.tdp_mmu_pages); +#endif } static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { kvm_account_pgtable_pages((void *)sp->spt, -1); +#ifdef CONFIG_KVM_PROVE_MMU atomic64_dec(&kvm->arch.tdp_mmu_pages); +#endif } /** From 459a35111b0a890172a78d51c01b204e13a34a18 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 14 Mar 2025 19:46:23 -0700 Subject: [PATCH 20/26] KVM: Allow building irqbypass.ko as a module when kvm.ko is a module Convert HAVE_KVM_IRQ_BYPASS into a tristate so that selecting IRQ_BYPASS_MANAGER follows KVM={m,y}, i.e. doesn't force irqbypass.ko to be built-in. Note, PPC allows building KVM as a module, but selects HAVE_KVM_IRQ_BYPASS from a boolean Kconfig, i.e. KVM PPC unnecessarily forces irqbypass.ko to be built-in. But that flaw is a longstanding PPC specific issue.
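For background, the preprocessor semantics relied on here can be sketched as follows (illustrative only, not taken from the patch): for a tristate symbol set to 'm', Kconfig defines CONFIG_<SYMBOL>_MODULE rather than CONFIG_<SYMBOL>, so a plain #ifdef only covers the built-in case, while IS_ENABLED() covers both.

	#include <linux/kconfig.h>

	#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
	/* Reached only for =y; silently compiled out when the symbol is =m. */
	#endif

	#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
	/* Reached for both =y and =m, which is what the irqfd code needs. */
	#endif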
Fixes: 61df71ee992d ("kvm: move "select IRQ_BYPASS_MANAGER" to common code") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-ID: <20250315024623.2363994-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 2 +- virt/kvm/Kconfig | 2 +- virt/kvm/eventfd.c | 10 +++++----- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5438a1b446a6b..291d49b9bf054 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2382,7 +2382,7 @@ static inline bool kvm_is_visible_memslot(struct kvm_memory_slot *memslot) struct kvm_vcpu *kvm_get_running_vcpu(void); struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void); -#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS +#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) bool kvm_arch_has_irq_bypass(void); int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *, struct irq_bypass_producer *); diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 746e1f466aa64..727b542074e7e 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -75,7 +75,7 @@ config KVM_COMPAT depends on KVM && COMPAT && !(S390 || ARM64 || RISCV) config HAVE_KVM_IRQ_BYPASS - bool + tristate select IRQ_BYPASS_MANAGER config HAVE_KVM_VCPU_ASYNC_IOCTL diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 249ba5b72e9b0..11e5d1e3f12ea 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -149,7 +149,7 @@ irqfd_shutdown(struct work_struct *work) /* * It is now safe to release the object's resources */ -#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS +#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) irq_bypass_unregister_consumer(&irqfd->consumer); #endif eventfd_ctx_put(irqfd->eventfd); @@ -274,7 +274,7 @@ static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd) write_seqcount_end(&irqfd->irq_entry_sc); } -#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS +#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) void __attribute__((weak)) kvm_arch_irq_bypass_stop( struct irq_bypass_consumer *cons) { @@ -424,7 +424,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) if (events & EPOLLIN) schedule_work(&irqfd->inject); -#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS +#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) if (kvm_arch_has_irq_bypass()) { irqfd->consumer.token = (void *)irqfd->eventfd; irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer; @@ -609,14 +609,14 @@ void kvm_irq_routing_update(struct kvm *kvm) spin_lock_irq(&kvm->irqfds.lock); list_for_each_entry(irqfd, &kvm->irqfds.items, list) { -#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS +#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) /* Under irqfds.lock, so can read irq_entry safely */ struct kvm_kernel_irq_routing_entry old = irqfd->irq_entry; #endif irqfd_update(kvm, irqfd); -#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS +#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) if (irqfd->producer && kvm_arch_irqfd_route_changed(&old, &irqfd->irq_entry)) { int ret = kvm_arch_update_irqfd_routing( From bc52ae0a708cb6fa3926d11c88e3c55e1171b4a1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 14 Mar 2025 19:41:02 -0700 Subject: [PATCH 21/26] KVM: x86: Explicitly zero-initialize on-stack CPUID unions Explicitly zero/empty-initialize the unions used for PMU related CPUID entries, instead of manually zeroing all fields (hopefully), or in the case of 0x80000022, relying on the compiler to clobber the uninitialized bitfields. 
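The difference can be illustrated with a hypothetical union (not the kernel's cpuid10_eax): an empty initializer zeroes every byte of the on-stack union, including bitfields that are never explicitly assigned.

	#include <stdint.h>

	union example_leaf {
		struct {
			uint32_t version_id   : 8;
			uint32_t num_counters : 8;
			uint32_t reserved     : 16;
		} split;
		uint32_t full;
	};

	uint32_t build_leaf(void)
	{
		union example_leaf eax = { };	/* 'reserved' is guaranteed to be 0 */

		eax.split.version_id   = 2;
		eax.split.num_counters = 8;
		return eax.full;		/* no uninitialized stack bits leak out */
	}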
Signed-off-by: Sean Christopherson Reviewed-by: Jim Mattson Message-ID: <20250315024102.2361628-1-seanjc@google.com> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 5e4d4934c0d3c..571c906ffcbfe 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1427,8 +1427,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) } break; case 0xa: { /* Architectural Performance Monitoring */ - union cpuid10_eax eax; - union cpuid10_edx edx; + union cpuid10_eax eax = { }; + union cpuid10_edx edx = { }; if (!enable_pmu || !static_cpu_has(X86_FEATURE_ARCH_PERFMON)) { entry->eax = entry->ebx = entry->ecx = entry->edx = 0; @@ -1444,8 +1444,6 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) if (kvm_pmu_cap.version) edx.split.anythread_deprecated = 1; - edx.split.reserved1 = 0; - edx.split.reserved2 = 0; entry->eax = eax.full; entry->ebx = kvm_pmu_cap.events_mask; @@ -1763,7 +1761,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) break; /* AMD Extended Performance Monitoring and Debug */ case 0x80000022: { - union cpuid_0x80000022_ebx ebx; + union cpuid_0x80000022_ebx ebx = { }; entry->ecx = entry->edx = 0; if (!enable_pmu || !kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) { From 6bad6ecc63b75af294ff3f56f54d6b857c8964a5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 1 Apr 2025 08:47:26 -0700 Subject: [PATCH 22/26] KVM: VMX: Assert that IRQs are disabled when putting vCPU on PI wakeup list Assert that IRQs are already disabled when putting a vCPU on a CPU's PI wakeup list, as opposed to saving/disabling+restoring IRQs. KVM relies on IRQs being disabled until the vCPU task is fully scheduled out, i.e. until the scheduler has dropped all of its per-CPU locks (e.g. for the runqueue), as attempting to wake the task while it's being scheduled out could lead to deadlock. Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Reviewed-by: Yan Zhao Message-ID: <20250401154727.835231-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/posted_intr.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index ec08fa3caf43c..840d435229a87 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -148,9 +148,8 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); struct pi_desc old, new; - unsigned long flags; - local_irq_save(flags); + lockdep_assert_irqs_disabled(); raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); list_add_tail(&vmx->pi_wakeup_list, @@ -176,8 +175,6 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) */ if (pi_test_on(&new)) __apic_send_IPI_self(POSTED_INTR_WAKEUP_VECTOR); - - local_irq_restore(flags); } static bool vmx_needs_pi_wakeup(struct kvm_vcpu *vcpu) From c0b8dcabb2cddc98c265548632c39e97422f61b6 Mon Sep 17 00:00:00 2001 From: Yan Zhao Date: Tue, 1 Apr 2025 08:47:27 -0700 Subject: [PATCH 23/26] KVM: VMX: Use separate subclasses for PI wakeup lock to squash false positive Use a separate subclass when acquiring KVM's per-CPU posted interrupts wakeup lock in the scheduled out path, i.e. when adding a vCPU on the list of vCPUs to wake, to workaround a false positive deadlock. 
Chain exists of: &p->pi_lock --> &rq->__lock --> &per_cpu(wakeup_vcpus_on_cpu_lock, cpu) Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu)); lock(&rq->__lock); lock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu)); lock(&p->pi_lock); *** DEADLOCK *** In the wakeup handler, the callchain is *always*: sysvec_kvm_posted_intr_wakeup_ipi() | --> pi_wakeup_handler() | --> kvm_vcpu_wake_up() | --> try_to_wake_up(), and the lock order is: &per_cpu(wakeup_vcpus_on_cpu_lock, cpu) --> &p->pi_lock. For the schedule out path, the callchain is always (for all intents and purposes; if the kernel is preemptible, kvm_sched_out() can be called from something other than schedule(), but the beginning of the callchain will be the same point in vcpu_block()): vcpu_block() | --> schedule() | --> kvm_sched_out() | --> vmx_vcpu_put() | --> vmx_vcpu_pi_put() | --> pi_enable_wakeup_handler() and the lock order is: &rq->__lock --> &per_cpu(wakeup_vcpus_on_cpu_lock, cpu) I.e. lockdep sees AB+BC ordering for schedule out, and CA ordering for wakeup, and complains about the A=>C versus C=>A inversion. In practice, deadlock can't occur between schedule out and the wakeup handler as they are mutually exclusive. The entirety of the schedule out code that runs with the problematic scheduler locks held does so with IRQs disabled, i.e. can't run concurrently with the wakeup handler. Use a subclass instead of disabling lockdep entirely, and tell lockdep that both subclasses are being acquired when loading a vCPU, as the sched_out and sched_in paths are NOT mutually exclusive, e.g. CPU 0 CPU 1 --------------- --------------- vCPU0 sched_out vCPU1 sched_in vCPU1 sched_out vCPU 0 sched_in where vCPU0's sched_in may race with vCPU1's sched_out, on CPU 0's wakeup list+lock. Signed-off-by: Yan Zhao Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-ID: <20250401154727.835231-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/posted_intr.c | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 840d435229a87..51116fe69a500 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -31,6 +31,8 @@ static DEFINE_PER_CPU(struct list_head, wakeup_vcpus_on_cpu); */ static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock); +#define PI_LOCK_SCHED_OUT SINGLE_DEPTH_NESTING + static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) { return &(to_vmx(vcpu)->pi_desc); @@ -89,9 +91,20 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) * current pCPU if the task was migrated. */ if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) { - raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); + raw_spinlock_t *spinlock = &per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu); + + /* + * In addition to taking the wakeup lock for the regular/IRQ + * context, tell lockdep it is being taken for the "sched out" + * context as well. vCPU loads happens in task context, and + * this is taking the lock of the *previous* CPU, i.e. can race + * with both the scheduler and the wakeup handler.
+ */ + raw_spin_lock(spinlock); + spin_acquire(&spinlock->dep_map, PI_LOCK_SCHED_OUT, 0, _RET_IP_); list_del(&vmx->pi_wakeup_list); - raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); + spin_release(&spinlock->dep_map, _RET_IP_); + raw_spin_unlock(spinlock); } dest = cpu_physical_id(cpu); @@ -151,7 +164,20 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) lockdep_assert_irqs_disabled(); - raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); + /* + * Acquire the wakeup lock using the "sched out" context to workaround + * a lockdep false positive. When this is called, schedule() holds + * various per-CPU scheduler locks. When the wakeup handler runs, it + * holds this CPU's wakeup lock while calling try_to_wake_up(), which + * can eventually take the aforementioned scheduler locks, which causes + * lockdep to assume there is deadlock. + * + * Deadlock can't actually occur because IRQs are disabled for the + * entirety of the sched_out critical section, i.e. the wakeup handler + * can't run while the scheduler locks are held. + */ + raw_spin_lock_nested(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu), + PI_LOCK_SCHED_OUT); list_add_tail(&vmx->pi_wakeup_list, &per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu)); raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); From d8d78398e550039295e0237eafb703e2d21f7d57 Mon Sep 17 00:00:00 2001 From: Raghavendra Rao Ananta Date: Sat, 5 Apr 2025 00:10:41 +0000 Subject: [PATCH 24/26] KVM: arm64: selftests: Introduce and use hardware-definition macros The kvm selftest library for arm64 currently configures the hardware fields, such as shift and mask in the page-table entries and registers, directly with numbers. While it adds comments in places, it's better to rewrite them with appropriate macros to improve the readability and reduce the risk of errors. Hence, introduce macros to define the hardware fields and use them in the arm64 processor library. Most of the definitions are primarily copied from Linux's header, arch/arm64/include/asm/pgtable-hwdef.h. No functional change intended.
Suggested-by: Oliver Upton Signed-off-by: Raghavendra Rao Ananta Link: https://lore.kernel.org/r/20250405001042.1470552-2-rananta@google.com Signed-off-by: Oliver Upton --- .../selftests/kvm/arm64/page_fault_test.c | 2 +- .../selftests/kvm/include/arm64/processor.h | 66 +++++++++++++++++-- .../selftests/kvm/lib/arm64/processor.c | 57 ++++++++-------- 3 files changed, 92 insertions(+), 33 deletions(-) diff --git a/tools/testing/selftests/kvm/arm64/page_fault_test.c b/tools/testing/selftests/kvm/arm64/page_fault_test.c index ec33a8f9c908c..dc6559dad9d86 100644 --- a/tools/testing/selftests/kvm/arm64/page_fault_test.c +++ b/tools/testing/selftests/kvm/arm64/page_fault_test.c @@ -199,7 +199,7 @@ static bool guest_set_ha(void) if (hadbs == 0) return false; - tcr = read_sysreg(tcr_el1) | TCR_EL1_HA; + tcr = read_sysreg(tcr_el1) | TCR_HA; write_sysreg(tcr, tcr_el1); isb(); diff --git a/tools/testing/selftests/kvm/include/arm64/processor.h b/tools/testing/selftests/kvm/include/arm64/processor.h index 1e8d0d531fbd3..7d88ff22013ab 100644 --- a/tools/testing/selftests/kvm/include/arm64/processor.h +++ b/tools/testing/selftests/kvm/include/arm64/processor.h @@ -62,6 +62,66 @@ MAIR_ATTRIDX(MAIR_ATTR_NORMAL, MT_NORMAL) | \ MAIR_ATTRIDX(MAIR_ATTR_NORMAL_WT, MT_NORMAL_WT)) +/* TCR_EL1 specific flags */ +#define TCR_T0SZ_OFFSET 0 +#define TCR_T0SZ(x) ((UL(64) - (x)) << TCR_T0SZ_OFFSET) + +#define TCR_IRGN0_SHIFT 8 +#define TCR_IRGN0_MASK (UL(3) << TCR_IRGN0_SHIFT) +#define TCR_IRGN0_NC (UL(0) << TCR_IRGN0_SHIFT) +#define TCR_IRGN0_WBWA (UL(1) << TCR_IRGN0_SHIFT) +#define TCR_IRGN0_WT (UL(2) << TCR_IRGN0_SHIFT) +#define TCR_IRGN0_WBnWA (UL(3) << TCR_IRGN0_SHIFT) + +#define TCR_ORGN0_SHIFT 10 +#define TCR_ORGN0_MASK (UL(3) << TCR_ORGN0_SHIFT) +#define TCR_ORGN0_NC (UL(0) << TCR_ORGN0_SHIFT) +#define TCR_ORGN0_WBWA (UL(1) << TCR_ORGN0_SHIFT) +#define TCR_ORGN0_WT (UL(2) << TCR_ORGN0_SHIFT) +#define TCR_ORGN0_WBnWA (UL(3) << TCR_ORGN0_SHIFT) + +#define TCR_SH0_SHIFT 12 +#define TCR_SH0_MASK (UL(3) << TCR_SH0_SHIFT) +#define TCR_SH0_INNER (UL(3) << TCR_SH0_SHIFT) + +#define TCR_TG0_SHIFT 14 +#define TCR_TG0_MASK (UL(3) << TCR_TG0_SHIFT) +#define TCR_TG0_4K (UL(0) << TCR_TG0_SHIFT) +#define TCR_TG0_64K (UL(1) << TCR_TG0_SHIFT) +#define TCR_TG0_16K (UL(2) << TCR_TG0_SHIFT) + +#define TCR_IPS_SHIFT 32 +#define TCR_IPS_MASK (UL(7) << TCR_IPS_SHIFT) +#define TCR_IPS_52_BITS (UL(6) << TCR_IPS_SHIFT) +#define TCR_IPS_48_BITS (UL(5) << TCR_IPS_SHIFT) +#define TCR_IPS_40_BITS (UL(2) << TCR_IPS_SHIFT) +#define TCR_IPS_36_BITS (UL(1) << TCR_IPS_SHIFT) + +#define TCR_HA (UL(1) << 39) +#define TCR_DS (UL(1) << 59) + +/* + * AttrIndx[2:0] encoding (mapping attributes defined in the MAIR* registers). 
+ */ +#define PTE_ATTRINDX(t) ((t) << 2) +#define PTE_ATTRINDX_MASK GENMASK(4, 2) +#define PTE_ATTRINDX_SHIFT 2 + +#define PTE_VALID BIT(0) +#define PGD_TYPE_TABLE BIT(1) +#define PUD_TYPE_TABLE BIT(1) +#define PMD_TYPE_TABLE BIT(1) +#define PTE_TYPE_PAGE BIT(1) + +#define PTE_AF BIT(10) + +#define PTE_ADDR_MASK(page_shift) GENMASK(47, (page_shift)) +#define PTE_ADDR_51_48 GENMASK(15, 12) +#define PTE_ADDR_51_48_SHIFT 12 +#define PTE_ADDR_MASK_LPA2(page_shift) GENMASK(49, (page_shift)) +#define PTE_ADDR_51_50_LPA2 GENMASK(9, 8) +#define PTE_ADDR_51_50_LPA2_SHIFT 8 + void aarch64_vcpu_setup(struct kvm_vcpu *vcpu, struct kvm_vcpu_init *init); struct kvm_vcpu *aarch64_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, struct kvm_vcpu_init *init, void *guest_code); @@ -102,12 +162,6 @@ enum { (v) == VECTOR_SYNC_LOWER_64 || \ (v) == VECTOR_SYNC_LOWER_32) -/* Access flag */ -#define PTE_AF (1ULL << 10) - -/* Access flag update enable/disable */ -#define TCR_EL1_HA (1ULL << 39) - void aarch64_get_supported_page_sizes(uint32_t ipa, uint32_t *ipa4k, uint32_t *ipa16k, uint32_t *ipa64k); diff --git a/tools/testing/selftests/kvm/lib/arm64/processor.c b/tools/testing/selftests/kvm/lib/arm64/processor.c index 7ba3aa3755f35..da5802c8a59c1 100644 --- a/tools/testing/selftests/kvm/lib/arm64/processor.c +++ b/tools/testing/selftests/kvm/lib/arm64/processor.c @@ -72,13 +72,13 @@ static uint64_t addr_pte(struct kvm_vm *vm, uint64_t pa, uint64_t attrs) uint64_t pte; if (use_lpa2_pte_format(vm)) { - pte = pa & GENMASK(49, vm->page_shift); - pte |= FIELD_GET(GENMASK(51, 50), pa) << 8; - attrs &= ~GENMASK(9, 8); + pte = pa & PTE_ADDR_MASK_LPA2(vm->page_shift); + pte |= FIELD_GET(GENMASK(51, 50), pa) << PTE_ADDR_51_50_LPA2_SHIFT; + attrs &= ~PTE_ADDR_51_50_LPA2; } else { - pte = pa & GENMASK(47, vm->page_shift); + pte = pa & PTE_ADDR_MASK(vm->page_shift); if (vm->page_shift == 16) - pte |= FIELD_GET(GENMASK(51, 48), pa) << 12; + pte |= FIELD_GET(GENMASK(51, 48), pa) << PTE_ADDR_51_48_SHIFT; } pte |= attrs; @@ -90,12 +90,12 @@ static uint64_t pte_addr(struct kvm_vm *vm, uint64_t pte) uint64_t pa; if (use_lpa2_pte_format(vm)) { - pa = pte & GENMASK(49, vm->page_shift); - pa |= FIELD_GET(GENMASK(9, 8), pte) << 50; + pa = pte & PTE_ADDR_MASK_LPA2(vm->page_shift); + pa |= FIELD_GET(PTE_ADDR_51_50_LPA2, pte) << 50; } else { - pa = pte & GENMASK(47, vm->page_shift); + pa = pte & PTE_ADDR_MASK(vm->page_shift); if (vm->page_shift == 16) - pa |= FIELD_GET(GENMASK(15, 12), pte) << 48; + pa |= FIELD_GET(PTE_ADDR_51_48, pte) << 48; } return pa; @@ -128,7 +128,8 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm) static void _virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, uint64_t flags) { - uint8_t attr_idx = flags & 7; + uint8_t attr_idx = flags & (PTE_ATTRINDX_MASK >> PTE_ATTRINDX_SHIFT); + uint64_t pg_attr; uint64_t *ptep; TEST_ASSERT((vaddr % vm->page_size) == 0, @@ -147,18 +148,21 @@ static void _virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, ptep = addr_gpa2hva(vm, vm->pgd) + pgd_index(vm, vaddr) * 8; if (!*ptep) - *ptep = addr_pte(vm, vm_alloc_page_table(vm), 3); + *ptep = addr_pte(vm, vm_alloc_page_table(vm), + PGD_TYPE_TABLE | PTE_VALID); switch (vm->pgtable_levels) { case 4: ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pud_index(vm, vaddr) * 8; if (!*ptep) - *ptep = addr_pte(vm, vm_alloc_page_table(vm), 3); + *ptep = addr_pte(vm, vm_alloc_page_table(vm), + PUD_TYPE_TABLE | PTE_VALID); /* fall through */ case 3: ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pmd_index(vm, vaddr) * 8; if 
(!*ptep) - *ptep = addr_pte(vm, vm_alloc_page_table(vm), 3); + *ptep = addr_pte(vm, vm_alloc_page_table(vm), + PMD_TYPE_TABLE | PTE_VALID); /* fall through */ case 2: ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pte_index(vm, vaddr) * 8; @@ -167,7 +171,8 @@ static void _virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, TEST_FAIL("Page table levels must be 2, 3, or 4"); } - *ptep = addr_pte(vm, paddr, (attr_idx << 2) | (1 << 10) | 3); /* AF */ + pg_attr = PTE_AF | PTE_ATTRINDX(attr_idx) | PTE_TYPE_PAGE | PTE_VALID; + *ptep = addr_pte(vm, paddr, pg_attr); } void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) @@ -293,20 +298,20 @@ void aarch64_vcpu_setup(struct kvm_vcpu *vcpu, struct kvm_vcpu_init *init) case VM_MODE_P48V48_64K: case VM_MODE_P40V48_64K: case VM_MODE_P36V48_64K: - tcr_el1 |= 1ul << 14; /* TG0 = 64KB */ + tcr_el1 |= TCR_TG0_64K; break; case VM_MODE_P52V48_16K: case VM_MODE_P48V48_16K: case VM_MODE_P40V48_16K: case VM_MODE_P36V48_16K: case VM_MODE_P36V47_16K: - tcr_el1 |= 2ul << 14; /* TG0 = 16KB */ + tcr_el1 |= TCR_TG0_16K; break; case VM_MODE_P52V48_4K: case VM_MODE_P48V48_4K: case VM_MODE_P40V48_4K: case VM_MODE_P36V48_4K: - tcr_el1 |= 0ul << 14; /* TG0 = 4KB */ + tcr_el1 |= TCR_TG0_4K; break; default: TEST_FAIL("Unknown guest mode, mode: 0x%x", vm->mode); @@ -319,35 +324,35 @@ void aarch64_vcpu_setup(struct kvm_vcpu *vcpu, struct kvm_vcpu_init *init) case VM_MODE_P52V48_4K: case VM_MODE_P52V48_16K: case VM_MODE_P52V48_64K: - tcr_el1 |= 6ul << 32; /* IPS = 52 bits */ + tcr_el1 |= TCR_IPS_52_BITS; ttbr0_el1 |= FIELD_GET(GENMASK(51, 48), vm->pgd) << 2; break; case VM_MODE_P48V48_4K: case VM_MODE_P48V48_16K: case VM_MODE_P48V48_64K: - tcr_el1 |= 5ul << 32; /* IPS = 48 bits */ + tcr_el1 |= TCR_IPS_48_BITS; break; case VM_MODE_P40V48_4K: case VM_MODE_P40V48_16K: case VM_MODE_P40V48_64K: - tcr_el1 |= 2ul << 32; /* IPS = 40 bits */ + tcr_el1 |= TCR_IPS_40_BITS; break; case VM_MODE_P36V48_4K: case VM_MODE_P36V48_16K: case VM_MODE_P36V48_64K: case VM_MODE_P36V47_16K: - tcr_el1 |= 1ul << 32; /* IPS = 36 bits */ + tcr_el1 |= TCR_IPS_36_BITS; break; default: TEST_FAIL("Unknown guest mode, mode: 0x%x", vm->mode); } - sctlr_el1 |= (1 << 0) | (1 << 2) | (1 << 12) /* M | C | I */; - /* TCR_EL1 |= IRGN0:WBWA | ORGN0:WBWA | SH0:Inner-Shareable */; - tcr_el1 |= (1 << 8) | (1 << 10) | (3 << 12); - tcr_el1 |= (64 - vm->va_bits) /* T0SZ */; + sctlr_el1 |= SCTLR_ELx_M | SCTLR_ELx_C | SCTLR_ELx_I; + + tcr_el1 |= TCR_IRGN0_WBWA | TCR_ORGN0_WBWA | TCR_SH0_INNER; + tcr_el1 |= TCR_T0SZ(vm->va_bits); if (use_lpa2_pte_format(vm)) - tcr_el1 |= (1ul << 59) /* DS */; + tcr_el1 |= TCR_DS; vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_SCTLR_EL1), sctlr_el1); vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_TCR_EL1), tcr_el1); From c8631ea59b6523035ffb607634eef7bacc8947fe Mon Sep 17 00:00:00 2001 From: Raghavendra Rao Ananta Date: Sat, 5 Apr 2025 00:10:42 +0000 Subject: [PATCH 25/26] KVM: arm64: selftests: Explicitly set the page attrs to Inner-Shareable Atomic instructions such as 'ldset' in the guest have been observed to cause an EL1 data abort with FSC 0x35 (IMPLEMENTATION DEFINED fault (Unsupported Exclusive or Atomic access)) on Neoverse-N3. Per DDI0487L.a B2.2.6, atomic instructions are only architecturally guaranteed for Inner/Outer Shareable Normal Write-Back memory. For anything else the behavior is IMPLEMENTATION DEFINED and can lose atomicity, or, in this case, generate an abort. 
It would appear that selftests sets up the stage-1 mappings as Non Shareable, leading to the observed abort. Explicitly set the Shareability field to Inner Shareable for non-LPA2 page tables. Note that for the LPA2 page table format, translations for cacheable memory inherit the shareability attribute of the PTW, i.e. TCR_ELx.SH{0,1}. Suggested-by: Oliver Upton Signed-off-by: Raghavendra Rao Ananta Link: https://lore.kernel.org/r/20250405001042.1470552-3-rananta@google.com [oliver: Rephrase changelog] Signed-off-by: Oliver Upton --- tools/testing/selftests/kvm/include/arm64/processor.h | 1 + tools/testing/selftests/kvm/lib/arm64/processor.c | 3 +++ 2 files changed, 4 insertions(+) diff --git a/tools/testing/selftests/kvm/include/arm64/processor.h b/tools/testing/selftests/kvm/include/arm64/processor.h index 7d88ff22013ab..b0fc0f945766f 100644 --- a/tools/testing/selftests/kvm/include/arm64/processor.h +++ b/tools/testing/selftests/kvm/include/arm64/processor.h @@ -113,6 +113,7 @@ #define PMD_TYPE_TABLE BIT(1) #define PTE_TYPE_PAGE BIT(1) +#define PTE_SHARED (UL(3) << 8) /* SH[1:0], inner shareable */ #define PTE_AF BIT(10) #define PTE_ADDR_MASK(page_shift) GENMASK(47, (page_shift)) diff --git a/tools/testing/selftests/kvm/lib/arm64/processor.c b/tools/testing/selftests/kvm/lib/arm64/processor.c index da5802c8a59c1..9d69904cb6084 100644 --- a/tools/testing/selftests/kvm/lib/arm64/processor.c +++ b/tools/testing/selftests/kvm/lib/arm64/processor.c @@ -172,6 +172,9 @@ static void _virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, } pg_attr = PTE_AF | PTE_ATTRINDX(attr_idx) | PTE_TYPE_PAGE | PTE_VALID; + if (!use_lpa2_pte_format(vm)) + pg_attr |= PTE_SHARED; + *ptep = addr_pte(vm, paddr, pg_attr); } From a344e258acb0a7f0e7ed10a795c52d1baf705164 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 7 Apr 2025 16:27:55 +0100 Subject: [PATCH 26/26] KVM: arm64: Use acquire/release to communicate FF-A version negotiation The pKVM FF-A proxy rejects FF-A requests other than FFA_VERSION until version negotiation is complete, which is signalled by setting the global 'has_version_negotiated' variable. To avoid excessive locking, this variable is checked directly from kvm_host_ffa_handler() in response to an FF-A call, but this can race against another CPU performing the negotiation and potentially lead to reading a torn value (incredibly unlikely for a 'bool') or problematic re-ordering of the accesses to 'has_version_negotiated' and 'hyp_ffa_version' whereby a stale version number could be read by __do_ffa_mem_xfer(). Use acquire/release primitives when writing 'has_version_negotiated' with the version lock held and when reading without the lock held. 
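The publish/consume pairing being relied on can be sketched as follows (illustrative only, not the pKVM code itself): the release store orders the data write before the flag, and the acquire load orders the flag check before the data read.

	static u32 negotiated_version;
	static bool has_negotiated;

	/* Writer: runs with the version lock held. */
	static void publish_version(u32 version)
	{
		negotiated_version = version;			/* 1: store the data... */
		smp_store_release(&has_negotiated, true);	/* 2: ...then publish the flag */
	}

	/* Lockless reader. */
	static bool read_version(u32 *version)
	{
		if (!smp_load_acquire(&has_negotiated))		/* pairs with the release above */
			return false;
		*version = negotiated_version;			/* guaranteed to observe the store */
		return true;
	}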
Cc: Sebastian Ene Cc: Sudeep Holla Cc: Quentin Perret Cc: Oliver Upton Cc: Marc Zyngier Fixes: c9c012625e12 ("KVM: arm64: Trap FFA_VERSION host call in pKVM") Signed-off-by: Will Deacon Link: https://lore.kernel.org/r/20250407152755.1041-1-will@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/nvhe/ffa.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c index e433dfab882aa..3369dd0c4009f 100644 --- a/arch/arm64/kvm/hyp/nvhe/ffa.c +++ b/arch/arm64/kvm/hyp/nvhe/ffa.c @@ -730,10 +730,10 @@ static void do_ffa_version(struct arm_smccc_res *res, hyp_ffa_version = ffa_req_version; } - if (hyp_ffa_post_init()) + if (hyp_ffa_post_init()) { res->a0 = FFA_RET_NOT_SUPPORTED; - else { - has_version_negotiated = true; + } else { + smp_store_release(&has_version_negotiated, true); res->a0 = hyp_ffa_version; } unlock: @@ -809,7 +809,8 @@ bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt, u32 func_id) if (!is_ffa_call(func_id)) return false; - if (!has_version_negotiated && func_id != FFA_VERSION) { + if (func_id != FFA_VERSION && + !smp_load_acquire(&has_version_negotiated)) { ffa_to_smccc_error(&res, FFA_RET_INVALID_PARAMETERS); goto out_handled; }