From 00c22013467069197dc006c943ca1f0395ca8aaa Mon Sep 17 00:00:00 2001 From: Peter Gonda Date: Wed, 30 Mar 2022 09:43:06 -0700 Subject: [PATCH 01/24] KVM: SEV: Add cond_resched() to loop in sev_clflush_pages() Add resched to avoid warning from sev_clflush_pages() with large number of pages. Signed-off-by: Peter Gonda Cc: Sean Christopherson Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Message-Id: <20220330164306.2376085-1-pgonda@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 75fa6dd268f05..c2fe89ecdb2dd 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -465,6 +465,7 @@ static void sev_clflush_pages(struct page *pages[], unsigned long npages) page_virtual = kmap_atomic(pages[i]); clflush_cache_range(page_virtual, PAGE_SIZE); kunmap_atomic(page_virtual); + cond_resched(); } } From 1d0e84806047f38027d7572adb4702ef7c09b317 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 31 Mar 2022 22:13:59 +0000 Subject: [PATCH 02/24] KVM: x86/mmu: Resolve nx_huge_pages when kvm.ko is loaded Resolve nx_huge_pages to true/false when kvm.ko is loaded, leaving it as -1 is technically undefined behavior when its value is read out by param_get_bool(), as boolean values are supposed to be '0' or '1'. Alternatively, KVM could define a custom getter for the param, but the auto value doesn't depend on the vendor module in any way, and printing "auto" would be unnecessarily unfriendly to the user. In addition to fixing the undefined behavior, resolving the auto value also fixes the scenario where the auto value resolves to N and no vendor module is loaded. Previously, -1 would result in Y being printed even though KVM would ultimately disable the mitigation. 
Rename the existing MMU module init/exit helpers to clarify that they're invoked with respect to the vendor module, and add comments to document why KVM has two separate "module init" flows. ========================================================================= UBSAN: invalid-load in kernel/params.c:320:33 load of value 255 is not a valid value for type '_Bool' CPU: 6 PID: 892 Comm: tail Not tainted 5.17.0-rc3+ #799 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 Call Trace: dump_stack_lvl+0x34/0x44 ubsan_epilogue+0x5/0x40 __ubsan_handle_load_invalid_value.cold+0x43/0x48 param_get_bool.cold+0xf/0x14 param_attr_show+0x55/0x80 module_attr_show+0x1c/0x30 sysfs_kf_seq_show+0x93/0xc0 seq_read_iter+0x11c/0x450 new_sync_read+0x11b/0x1a0 vfs_read+0xf0/0x190 ksys_read+0x5f/0xe0 do_syscall_64+0x3b/0xc0 entry_SYSCALL_64_after_hwframe+0x44/0xae ========================================================================= Fixes: b8e8c8303ff2 ("kvm: mmu: ITLB_MULTIHIT mitigation") Cc: stable@vger.kernel.org Reported-by: Bruno Goncalves Reported-by: Jan Stancek Signed-off-by: Sean Christopherson Message-Id: <20220331221359.3912754-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 5 +++-- arch/x86/kvm/mmu/mmu.c | 20 ++++++++++++++++---- arch/x86/kvm/x86.c | 20 ++++++++++++++++++-- 3 files changed, 37 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d23e80a56eb86..0d37ba442de34 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1585,8 +1585,9 @@ static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm) #define kvm_arch_pmi_in_guest(vcpu) \ ((vcpu) && (vcpu)->arch.handling_intr_from_guest) -int kvm_mmu_module_init(void); -void kvm_mmu_module_exit(void); +void kvm_mmu_x86_module_init(void); +int kvm_mmu_vendor_module_init(void); +void kvm_mmu_vendor_module_exit(void); void kvm_mmu_destroy(struct kvm_vcpu *vcpu); int 
kvm_mmu_create(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 8f19ea7527042..f9080ee50ffa0 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -6237,12 +6237,24 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) return 0; } -int kvm_mmu_module_init(void) +/* + * nx_huge_pages needs to be resolved to true/false when kvm.ko is loaded, as + * its default value of -1 is technically undefined behavior for a boolean. + */ +void kvm_mmu_x86_module_init(void) { - int ret = -ENOMEM; - if (nx_huge_pages == -1) __set_nx_huge_pages(get_nx_auto_mode()); +} + +/* + * The bulk of the MMU initialization is deferred until the vendor module is + * loaded as many of the masks/values may be modified by VMX or SVM, i.e. need + * to be reset when a potentially different vendor module is loaded. + */ +int kvm_mmu_vendor_module_init(void) +{ + int ret = -ENOMEM; /* * MMU roles use union aliasing which is, generally speaking, an @@ -6290,7 +6302,7 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu) mmu_free_memory_caches(vcpu); } -void kvm_mmu_module_exit(void) +void kvm_mmu_vendor_module_exit(void) { mmu_destroy_caches(); percpu_counter_destroy(&kvm_total_used_mmu_pages); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0c0ca599a353c..de49a88df1c2e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8926,7 +8926,7 @@ int kvm_arch_init(void *opaque) } kvm_nr_uret_msrs = 0; - r = kvm_mmu_module_init(); + r = kvm_mmu_vendor_module_init(); if (r) goto out_free_percpu; @@ -8974,7 +8974,7 @@ void kvm_arch_exit(void) cancel_work_sync(&pvclock_gtod_work); #endif kvm_x86_ops.hardware_enable = NULL; - kvm_mmu_module_exit(); + kvm_mmu_vendor_module_exit(); free_percpu(user_return_msrs); kmem_cache_destroy(x86_emulator_cache); #ifdef CONFIG_KVM_XEN @@ -12986,3 +12986,19 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit); 
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_enter); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_exit); + +static int __init kvm_x86_init(void) +{ + kvm_mmu_x86_module_init(); + return 0; +} +module_init(kvm_x86_init); + +static void __exit kvm_x86_exit(void) +{ + /* + * If module_init() is implemented, module_exit() must also be + * implemented to allow module unload. + */ +} +module_exit(kvm_x86_exit); From 3203a56a0f0eaaf4ea7fc01467378c4bce3841ff Mon Sep 17 00:00:00 2001 From: Lv Ruyi Date: Fri, 1 Apr 2022 08:35:30 +0000 Subject: [PATCH 03/24] KVM: x86/mmu: remove unnecessary flush_workqueue() All work currently pending will be done first by calling destroy_workqueue, so it is unnecessary to flush it explicitly. Reported-by: Zeal Robot Signed-off-by: Lv Ruyi Reviewed-by: Sean Christopherson Message-Id: <20220401083530.2407703-1-lv.ruyi@zte.com.cn> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/tdp_mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index d71d177ae6b87..c472769e03005 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -51,7 +51,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) if (!kvm->arch.tdp_mmu_enabled) return; - flush_workqueue(kvm->arch.tdp_mmu_zap_wq); + /* Also waits for any queued work items. */ destroy_workqueue(kvm->arch.tdp_mmu_zap_wq); WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages)); From c1be1ef1b4a7589878d63673b7b322856989064e Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Sun, 3 Apr 2022 13:57:36 +0700 Subject: [PATCH 04/24] Documentation: kvm: Add missing line break in api.rst Add missing line break separator between literal block and description of KVM_EXIT_RISCV_SBI. This fixes: /Documentation/virt/kvm/api.rst:6118: WARNING: Literal block ends without a blank line; unexpected unindent. 
Fixes: da40d85805937d (RISC-V: KVM: Document RISC-V specific parts of KVM API, 2021-09-27) Cc: Anup Patel Cc: Paolo Bonzini Cc: Jonathan Corbet Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-riscv@lists.infradead.org Signed-off-by: Bagas Sanjaya Message-Id: <20220403065735.23859-1-bagasdotme@gmail.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index d13fa66004672..85c7abc51af52 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6190,6 +6190,7 @@ Valid values for 'type' are: unsigned long args[6]; unsigned long ret[2]; } riscv_sbi; + If exit reason is KVM_EXIT_RISCV_SBI then it indicates that the VCPU has done a SBI call which is not handled by KVM RISC-V kernel module. The details of the SBI call are available in 'riscv_sbi' member of kvm_run structure. The From 2da0aebc74dba6a09ac90b88e38860fbc65d6c0a Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 22 Mar 2022 18:35:36 +0000 Subject: [PATCH 05/24] KVM: arm64: Generally disallow SMC64 for AArch32 guests The only valid SMC calling convention from an AArch32 state is SMC32. Disallow any PSCI function that sets the SMC64 function ID bit when called from AArch32 rather than comparing against known SMC64 PSCI functions. Note that without this change KVM advertises the SMC64 flavor of SYSTEM_RESET2 to AArch32 guests. 
Fixes: d43583b890e7 ("KVM: arm64: Expose PSCI SYSTEM_RESET2 call to the guest") Acked-by: Will Deacon Reviewed-by: Reiji Watanabe Reviewed-by: Andrew Jones Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220322183538.2757758-2-oupton@google.com --- arch/arm64/kvm/psci.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c index 372da09a2fab6..a76d03d506241 100644 --- a/arch/arm64/kvm/psci.c +++ b/arch/arm64/kvm/psci.c @@ -215,15 +215,11 @@ static void kvm_psci_narrow_to_32bit(struct kvm_vcpu *vcpu) static unsigned long kvm_psci_check_allowed_function(struct kvm_vcpu *vcpu, u32 fn) { - switch(fn) { - case PSCI_0_2_FN64_CPU_SUSPEND: - case PSCI_0_2_FN64_CPU_ON: - case PSCI_0_2_FN64_AFFINITY_INFO: - /* Disallow these functions for 32bit guests */ - if (vcpu_mode_is_32bit(vcpu)) - return PSCI_RET_NOT_SUPPORTED; - break; - } + /* + * Prevent 32 bit guests from calling 64 bit PSCI functions. + */ + if ((fn & PSCI_0_2_64BIT) && vcpu_mode_is_32bit(vcpu)) + return PSCI_RET_NOT_SUPPORTED; return 0; } From 827c2ab3314814e1c7d873372c0fe0cad50ba1c5 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 22 Mar 2022 18:35:37 +0000 Subject: [PATCH 06/24] KVM: arm64: Actually prevent SMC64 SYSTEM_RESET2 from AArch32 The SMCCC does not allow the SMC64 calling convention to be used from AArch32. While KVM checks to see if the calling convention is allowed in PSCI_1_0_FN_PSCI_FEATURES, it does not actually prevent calls to unadvertised PSCI v1.0+ functions. Hoist the check to see if the requested function is allowed into kvm_psci_call(), thereby preventing SMC64 calls from AArch32 for all PSCI versions. 
Fixes: d43583b890e7 ("KVM: arm64: Expose PSCI SYSTEM_RESET2 call to the guest") Acked-by: Will Deacon Reviewed-by: Reiji Watanabe Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220322183538.2757758-3-oupton@google.com --- arch/arm64/kvm/psci.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c index a76d03d506241..faf403a72fdfd 100644 --- a/arch/arm64/kvm/psci.c +++ b/arch/arm64/kvm/psci.c @@ -231,10 +231,6 @@ static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu) unsigned long val; int ret = 1; - val = kvm_psci_check_allowed_function(vcpu, psci_fn); - if (val) - goto out; - switch (psci_fn) { case PSCI_0_2_FN_PSCI_VERSION: /* @@ -302,7 +298,6 @@ static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu) break; } -out: smccc_set_retval(vcpu, val, 0, 0, 0); return ret; } @@ -422,6 +417,15 @@ static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu) */ int kvm_psci_call(struct kvm_vcpu *vcpu) { + u32 psci_fn = smccc_get_function(vcpu); + unsigned long val; + + val = kvm_psci_check_allowed_function(vcpu, psci_fn); + if (val) { + smccc_set_retval(vcpu, val, 0, 0, 0); + return 1; + } + switch (kvm_psci_version(vcpu)) { case KVM_ARM_PSCI_1_1: return kvm_psci_1_x_call(vcpu, 1); From 73b725c7a6c82eee10fa2d6752babefff795ca9a Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 22 Mar 2022 18:35:38 +0000 Subject: [PATCH 07/24] KVM: arm64: Drop unneeded minor version check from PSCI v1.x handler We already sanitize the guest's PSCI version when it is being written by userspace, rejecting unsupported version numbers. Additionally, the 'minor' parameter to kvm_psci_1_x_call() is a constant known at compile time for all callsites. Though it is benign, the additional check against the PSCI minor version in kvm_psci_1_x_call() is unnecessary and likely to be missed the next time KVM raises its maximum PSCI version. 
Drop the check altogether and rely on sanitization when the PSCI version is set by userspace. No functional change intended. Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220322183538.2757758-4-oupton@google.com --- arch/arm64/kvm/psci.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c index faf403a72fdfd..baac2b405f235 100644 --- a/arch/arm64/kvm/psci.c +++ b/arch/arm64/kvm/psci.c @@ -309,9 +309,6 @@ static int kvm_psci_1_x_call(struct kvm_vcpu *vcpu, u32 minor) unsigned long val; int ret = 1; - if (minor > 1) - return -EINVAL; - switch(psci_fn) { case PSCI_0_2_FN_PSCI_VERSION: val = minor == 0 ? KVM_ARM_PSCI_1_0 : KVM_ARM_PSCI_1_1; From f587661f21eb9a38af52488bbe54ce61a64dfae8 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 1 Apr 2022 19:46:52 +0000 Subject: [PATCH 08/24] KVM: arm64: Don't split hugepages outside of MMU write lock It is possible to take a stage-2 permission fault on a page larger than PAGE_SIZE. For example, when running a guest backed by 2M HugeTLB, KVM eagerly maps at the largest possible block size. When dirty logging is enabled on a memslot, KVM does *not* eagerly split these 2M stage-2 mappings and instead clears the write bit on the pte. Since dirty logging is always performed at PAGE_SIZE granularity, KVM lazily splits these 2M block mappings down to PAGE_SIZE in the stage-2 fault handler. This operation must be done under the write lock. Since commit f783ef1c0e82 ("KVM: arm64: Add fast path to handle permission relaxation during dirty logging"), the stage-2 fault handler conditionally takes the read lock on permission faults with dirty logging enabled. To that end, it is possible to split a 2M block mapping while only holding the read lock. 
The problem is demonstrated by running kvm_page_table_test with 2M anonymous HugeTLB, which splats like so: WARNING: CPU: 5 PID: 15276 at arch/arm64/kvm/hyp/pgtable.c:153 stage2_map_walk_leaf+0x124/0x158 [...] Call trace: stage2_map_walk_leaf+0x124/0x158 stage2_map_walker+0x5c/0xf0 __kvm_pgtable_walk+0x100/0x1d4 __kvm_pgtable_walk+0x140/0x1d4 __kvm_pgtable_walk+0x140/0x1d4 kvm_pgtable_walk+0xa0/0xf8 kvm_pgtable_stage2_map+0x15c/0x198 user_mem_abort+0x56c/0x838 kvm_handle_guest_abort+0x1fc/0x2a4 handle_exit+0xa4/0x120 kvm_arch_vcpu_ioctl_run+0x200/0x448 kvm_vcpu_ioctl+0x588/0x664 __arm64_sys_ioctl+0x9c/0xd4 invoke_syscall+0x4c/0x144 el0_svc_common+0xc4/0x190 do_el0_svc+0x30/0x8c el0_svc+0x28/0xcc el0t_64_sync_handler+0x84/0xe4 el0t_64_sync+0x1a4/0x1a8 Fix the issue by only acquiring the read lock if the guest faulted on a PAGE_SIZE granule w/ dirty logging enabled. Add a WARN to catch locking bugs in future changes. Fixes: f783ef1c0e82 ("KVM: arm64: Add fast path to handle permission relaxation during dirty logging") Cc: Jing Zhang Signed-off-by: Oliver Upton Reviewed-by: Reiji Watanabe Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220401194652.950240-1-oupton@google.com --- arch/arm64/kvm/mmu.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 0d19259454d8c..53ae2c0640bc2 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1079,7 +1079,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, gfn_t gfn; kvm_pfn_t pfn; bool logging_active = memslot_is_logging(memslot); - bool logging_perm_fault = false; + bool use_read_lock = false; unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu); unsigned long vma_pagesize, fault_granule; enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; @@ -1114,7 +1114,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (logging_active) { force_pte = true; vma_shift = PAGE_SHIFT; 
- logging_perm_fault = (fault_status == FSC_PERM && write_fault); + use_read_lock = (fault_status == FSC_PERM && write_fault && + fault_granule == PAGE_SIZE); } else { vma_shift = get_vma_page_shift(vma, hva); } @@ -1218,7 +1219,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * logging dirty logging, only acquire read lock for permission * relaxation. */ - if (logging_perm_fault) + if (use_read_lock) read_lock(&kvm->mmu_lock); else write_lock(&kvm->mmu_lock); @@ -1268,6 +1269,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (fault_status == FSC_PERM && vma_pagesize == fault_granule) { ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot); } else { + WARN_ONCE(use_read_lock, "Attempted stage-2 map outside of write lock\n"); + ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize, __pfn_to_phys(pfn), prot, memcache); @@ -1280,7 +1283,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } out_unlock: - if (logging_perm_fault) + if (use_read_lock) read_unlock(&kvm->mmu_lock); else write_unlock(&kvm->mmu_lock); From c707663e81ef48d279719e97fd86acef835a2671 Mon Sep 17 00:00:00 2001 From: Yu Zhe Date: Tue, 29 Mar 2022 03:20:59 -0700 Subject: [PATCH 09/24] KVM: arm64: vgic: Remove unnecessary type castings Remove unnecessary casts. 
Signed-off-by: Yu Zhe Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220329102059.268983-1-yuzhe@nfschina.com --- arch/arm64/kvm/vgic/vgic-debug.c | 10 +++++----- arch/arm64/kvm/vgic/vgic-its.c | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c index f38c40a762519..78cde687383ca 100644 --- a/arch/arm64/kvm/vgic/vgic-debug.c +++ b/arch/arm64/kvm/vgic/vgic-debug.c @@ -82,7 +82,7 @@ static bool end_of_vgic(struct vgic_state_iter *iter) static void *vgic_debug_start(struct seq_file *s, loff_t *pos) { - struct kvm *kvm = (struct kvm *)s->private; + struct kvm *kvm = s->private; struct vgic_state_iter *iter; mutex_lock(&kvm->lock); @@ -110,7 +110,7 @@ static void *vgic_debug_start(struct seq_file *s, loff_t *pos) static void *vgic_debug_next(struct seq_file *s, void *v, loff_t *pos) { - struct kvm *kvm = (struct kvm *)s->private; + struct kvm *kvm = s->private; struct vgic_state_iter *iter = kvm->arch.vgic.iter; ++*pos; @@ -122,7 +122,7 @@ static void *vgic_debug_next(struct seq_file *s, void *v, loff_t *pos) static void vgic_debug_stop(struct seq_file *s, void *v) { - struct kvm *kvm = (struct kvm *)s->private; + struct kvm *kvm = s->private; struct vgic_state_iter *iter; /* @@ -229,8 +229,8 @@ static void print_irq_state(struct seq_file *s, struct vgic_irq *irq, static int vgic_debug_show(struct seq_file *s, void *v) { - struct kvm *kvm = (struct kvm *)s->private; - struct vgic_state_iter *iter = (struct vgic_state_iter *)v; + struct kvm *kvm = s->private; + struct vgic_state_iter *iter = v; struct vgic_irq *irq; struct kvm_vcpu *vcpu = NULL; unsigned long flags; diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index 089fc2ffcb43d..2e13402be3bd2 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -2143,7 +2143,7 @@ static int vgic_its_save_ite(struct vgic_its *its, struct its_device *dev, static int 
vgic_its_restore_ite(struct vgic_its *its, u32 event_id, void *ptr, void *opaque) { - struct its_device *dev = (struct its_device *)opaque; + struct its_device *dev = opaque; struct its_collection *collection; struct kvm *kvm = its->dev->kvm; struct kvm_vcpu *vcpu = NULL; From 26bf74bd9f6ff0f1545b4f0c92a37c232d076014 Mon Sep 17 00:00:00 2001 From: Reiji Watanabe Date: Mon, 28 Mar 2022 20:19:23 -0700 Subject: [PATCH 10/24] KVM: arm64: mixed-width check should be skipped for uninitialized vCPUs KVM allows userspace to configure either all EL1 32bit or 64bit vCPUs for a guest. At vCPU reset, vcpu_allowed_register_width() checks if the vcpu's register width is consistent with all other vCPUs'. Since the checking is done even against vCPUs that are not initialized (KVM_ARM_VCPU_INIT has not been done) yet, the uninitialized vCPUs are erroneously treated as 64bit vCPU, which causes the function to incorrectly detect a mixed-width VM. Introduce KVM_ARCH_FLAG_EL1_32BIT and KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED bits for kvm->arch.flags. A value of the EL1_32BIT bit indicates that the guest needs to be configured with all 32bit or 64bit vCPUs, and a value of the REG_WIDTH_CONFIGURED bit indicates if a value of the EL1_32BIT bit is valid (already set up). Values in those bits are set at the first KVM_ARM_VCPU_INIT for the guest based on KVM_ARM_VCPU_EL1_32BIT configuration for the vCPU. Check vcpu's register width against those new bits at the vcpu's KVM_ARM_VCPU_INIT (instead of against other vCPUs' register width). 
Fixes: 66e94d5cafd4 ("KVM: arm64: Prevent mixed-width VM creation") Signed-off-by: Reiji Watanabe Reviewed-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220329031924.619453-2-reijiw@google.com --- arch/arm64/include/asm/kvm_emulate.h | 27 ++++++++---- arch/arm64/include/asm/kvm_host.h | 10 +++++ arch/arm64/kvm/reset.c | 65 +++++++++++++++++++--------- 3 files changed, 74 insertions(+), 28 deletions(-) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index d62405ce3e6de..7496deab025ad 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -43,10 +43,22 @@ void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr); void kvm_vcpu_wfi(struct kvm_vcpu *vcpu); +#if defined(__KVM_VHE_HYPERVISOR__) || defined(__KVM_NVHE_HYPERVISOR__) static __always_inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu) { return !(vcpu->arch.hcr_el2 & HCR_RW); } +#else +static __always_inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + + WARN_ON_ONCE(!test_bit(KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED, + &kvm->arch.flags)); + + return test_bit(KVM_ARCH_FLAG_EL1_32BIT, &kvm->arch.flags); +} +#endif static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu) { @@ -72,15 +84,14 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu) vcpu->arch.hcr_el2 |= HCR_TVM; } - if (test_bit(KVM_ARM_VCPU_EL1_32BIT, vcpu->arch.features)) + if (vcpu_el1_is_32bit(vcpu)) vcpu->arch.hcr_el2 &= ~HCR_RW; - - /* - * TID3: trap feature register accesses that we virtualise. - * For now this is conditional, since no AArch32 feature regs - * are currently virtualised. - */ - if (!vcpu_el1_is_32bit(vcpu)) + else + /* + * TID3: trap feature register accesses that we virtualise. + * For now this is conditional, since no AArch32 feature regs + * are currently virtualised. 
+ */ vcpu->arch.hcr_el2 |= HCR_TID3; if (cpus_have_const_cap(ARM64_MISMATCHED_CACHE_TYPE) || diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index e3b25dc6c367a..94a27a7520f47 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -127,6 +127,16 @@ struct kvm_arch { #define KVM_ARCH_FLAG_MTE_ENABLED 1 /* At least one vCPU has ran in the VM */ #define KVM_ARCH_FLAG_HAS_RAN_ONCE 2 + /* + * The following two bits are used to indicate the guest's EL1 + * register width configuration. A value of KVM_ARCH_FLAG_EL1_32BIT + * bit is valid only when KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED is set. + * Otherwise, the guest's EL1 register width has not yet been + * determined yet. + */ +#define KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED 3 +#define KVM_ARCH_FLAG_EL1_32BIT 4 + unsigned long flags; /* diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index ecc40c8cd6f64..6c70c6f61c703 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -181,27 +181,51 @@ static int kvm_vcpu_enable_ptrauth(struct kvm_vcpu *vcpu) return 0; } -static bool vcpu_allowed_register_width(struct kvm_vcpu *vcpu) +/** + * kvm_set_vm_width() - set the register width for the guest + * @vcpu: Pointer to the vcpu being configured + * + * Set both KVM_ARCH_FLAG_EL1_32BIT and KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED + * in the VM flags based on the vcpu's requested register width, the HW + * capabilities and other options (such as MTE). + * When REG_WIDTH_CONFIGURED is already set, the vcpu settings must be + * consistent with the value of the FLAG_EL1_32BIT bit in the flags. + * + * Return: 0 on success, negative error code on failure. 
+ */ +static int kvm_set_vm_width(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *tmp; + struct kvm *kvm = vcpu->kvm; bool is32bit; - unsigned long i; is32bit = vcpu_has_feature(vcpu, KVM_ARM_VCPU_EL1_32BIT); + + lockdep_assert_held(&kvm->lock); + + if (test_bit(KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED, &kvm->arch.flags)) { + /* + * The guest's register width is already configured. + * Make sure that the vcpu is consistent with it. + */ + if (is32bit == test_bit(KVM_ARCH_FLAG_EL1_32BIT, &kvm->arch.flags)) + return 0; + + return -EINVAL; + } + if (!cpus_have_const_cap(ARM64_HAS_32BIT_EL1) && is32bit) - return false; + return -EINVAL; /* MTE is incompatible with AArch32 */ - if (kvm_has_mte(vcpu->kvm) && is32bit) - return false; + if (kvm_has_mte(kvm) && is32bit) + return -EINVAL; - /* Check that the vcpus are either all 32bit or all 64bit */ - kvm_for_each_vcpu(i, tmp, vcpu->kvm) { - if (vcpu_has_feature(tmp, KVM_ARM_VCPU_EL1_32BIT) != is32bit) - return false; - } + if (is32bit) + set_bit(KVM_ARCH_FLAG_EL1_32BIT, &kvm->arch.flags); - return true; + set_bit(KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED, &kvm->arch.flags); + + return 0; } /** @@ -230,10 +254,16 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) u32 pstate; mutex_lock(&vcpu->kvm->lock); - reset_state = vcpu->arch.reset_state; - WRITE_ONCE(vcpu->arch.reset_state.reset, false); + ret = kvm_set_vm_width(vcpu); + if (!ret) { + reset_state = vcpu->arch.reset_state; + WRITE_ONCE(vcpu->arch.reset_state.reset, false); + } mutex_unlock(&vcpu->kvm->lock); + if (ret) + return ret; + /* Reset PMU outside of the non-preemptible section */ kvm_pmu_vcpu_reset(vcpu); @@ -260,14 +290,9 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) } } - if (!vcpu_allowed_register_width(vcpu)) { - ret = -EINVAL; - goto out; - } - switch (vcpu->arch.target) { default: - if (test_bit(KVM_ARM_VCPU_EL1_32BIT, vcpu->arch.features)) { + if (vcpu_el1_is_32bit(vcpu)) { pstate = VCPU_RESET_PSTATE_SVC; } else { pstate = VCPU_RESET_PSTATE_EL1; From 
2f5d27e6cf14efe652748bad89ee529ed5a5d577 Mon Sep 17 00:00:00 2001 From: Reiji Watanabe Date: Mon, 28 Mar 2022 20:19:24 -0700 Subject: [PATCH 11/24] KVM: arm64: selftests: Introduce vcpu_width_config Introduce a test for aarch64 that ensures non-mixed-width vCPUs (all 64bit vCPUs or all 32bit vCPUs) can be configured, and mixed-width vCPUs cannot be configured. Reviewed-by: Andrew Jones Signed-off-by: Reiji Watanabe Reviewed-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220329031924.619453-3-reijiw@google.com --- tools/testing/selftests/kvm/.gitignore | 1 + tools/testing/selftests/kvm/Makefile | 1 + .../selftests/kvm/aarch64/vcpu_width_config.c | 122 ++++++++++++++++++ 3 files changed, 124 insertions(+) create mode 100644 tools/testing/selftests/kvm/aarch64/vcpu_width_config.c diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index d1e8f52374697..573d93a1d61f0 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -3,6 +3,7 @@ /aarch64/debug-exceptions /aarch64/get-reg-list /aarch64/psci_cpu_on_test +/aarch64/vcpu_width_config /aarch64/vgic_init /aarch64/vgic_irq /s390x/memop diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 21c2dbd21a81c..681b173aa87c1 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -106,6 +106,7 @@ TEST_GEN_PROGS_aarch64 += aarch64/arch_timer TEST_GEN_PROGS_aarch64 += aarch64/debug-exceptions TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list TEST_GEN_PROGS_aarch64 += aarch64/psci_cpu_on_test +TEST_GEN_PROGS_aarch64 += aarch64/vcpu_width_config TEST_GEN_PROGS_aarch64 += aarch64/vgic_init TEST_GEN_PROGS_aarch64 += aarch64/vgic_irq TEST_GEN_PROGS_aarch64 += demand_paging_test diff --git a/tools/testing/selftests/kvm/aarch64/vcpu_width_config.c b/tools/testing/selftests/kvm/aarch64/vcpu_width_config.c new file mode 100644 index 
0000000000000..6e9402679229e --- /dev/null +++ b/tools/testing/selftests/kvm/aarch64/vcpu_width_config.c @@ -0,0 +1,122 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * vcpu_width_config - Test KVM_ARM_VCPU_INIT() with KVM_ARM_VCPU_EL1_32BIT. + * + * Copyright (c) 2022 Google LLC. + * + * This is a test that ensures that non-mixed-width vCPUs (all 64bit vCPUs + * or all 32bit vcPUs) can be configured and mixed-width vCPUs cannot be + * configured. + */ + +#include "kvm_util.h" +#include "processor.h" +#include "test_util.h" + + +/* + * Add a vCPU, run KVM_ARM_VCPU_INIT with @init1, and then + * add another vCPU, and run KVM_ARM_VCPU_INIT with @init2. + */ +static int add_init_2vcpus(struct kvm_vcpu_init *init1, + struct kvm_vcpu_init *init2) +{ + struct kvm_vm *vm; + int ret; + + vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR); + + vm_vcpu_add(vm, 0); + ret = _vcpu_ioctl(vm, 0, KVM_ARM_VCPU_INIT, init1); + if (ret) + goto free_exit; + + vm_vcpu_add(vm, 1); + ret = _vcpu_ioctl(vm, 1, KVM_ARM_VCPU_INIT, init2); + +free_exit: + kvm_vm_free(vm); + return ret; +} + +/* + * Add two vCPUs, then run KVM_ARM_VCPU_INIT for one vCPU with @init1, + * and run KVM_ARM_VCPU_INIT for another vCPU with @init2. + */ +static int add_2vcpus_init_2vcpus(struct kvm_vcpu_init *init1, + struct kvm_vcpu_init *init2) +{ + struct kvm_vm *vm; + int ret; + + vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR); + + vm_vcpu_add(vm, 0); + vm_vcpu_add(vm, 1); + + ret = _vcpu_ioctl(vm, 0, KVM_ARM_VCPU_INIT, init1); + if (ret) + goto free_exit; + + ret = _vcpu_ioctl(vm, 1, KVM_ARM_VCPU_INIT, init2); + +free_exit: + kvm_vm_free(vm); + return ret; +} + +/* + * Tests that two 64bit vCPUs can be configured, two 32bit vCPUs can be + * configured, and two mixed-width vCPUs cannot be configured. + * Each of those three cases, configure vCPUs in two different orders. + * The one is running KVM_CREATE_VCPU for 2 vCPUs, and then running + * KVM_ARM_VCPU_INIT for them. 
+ * The other is running KVM_CREATE_VCPU and KVM_ARM_VCPU_INIT for a vCPU, + * and then run those commands for another vCPU. + */ +int main(void) +{ + struct kvm_vcpu_init init1, init2; + struct kvm_vm *vm; + int ret; + + if (!kvm_check_cap(KVM_CAP_ARM_EL1_32BIT)) { + print_skip("KVM_CAP_ARM_EL1_32BIT is not supported"); + exit(KSFT_SKIP); + } + + /* Get the preferred target type and copy that to init2 for later use */ + vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR); + vm_ioctl(vm, KVM_ARM_PREFERRED_TARGET, &init1); + kvm_vm_free(vm); + init2 = init1; + + /* Test with 64bit vCPUs */ + ret = add_init_2vcpus(&init1, &init1); + TEST_ASSERT(ret == 0, + "Configuring 64bit EL1 vCPUs failed unexpectedly"); + ret = add_2vcpus_init_2vcpus(&init1, &init1); + TEST_ASSERT(ret == 0, + "Configuring 64bit EL1 vCPUs failed unexpectedly"); + + /* Test with 32bit vCPUs */ + init1.features[0] = (1 << KVM_ARM_VCPU_EL1_32BIT); + ret = add_init_2vcpus(&init1, &init1); + TEST_ASSERT(ret == 0, + "Configuring 32bit EL1 vCPUs failed unexpectedly"); + ret = add_2vcpus_init_2vcpus(&init1, &init1); + TEST_ASSERT(ret == 0, + "Configuring 32bit EL1 vCPUs failed unexpectedly"); + + /* Test with mixed-width vCPUs */ + init1.features[0] = 0; + init2.features[0] = (1 << KVM_ARM_VCPU_EL1_32BIT); + ret = add_init_2vcpus(&init1, &init2); + TEST_ASSERT(ret != 0, + "Configuring mixed-width vCPUs worked unexpectedly"); + ret = add_2vcpus_init_2vcpus(&init1, &init2); + TEST_ASSERT(ret != 0, + "Configuring mixed-width vCPUs worked unexpectedly"); + + return 0; +} From 5593473a1e6c743764b08e3b6071cb43b5cfa6c4 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 6 Apr 2022 13:13:42 -0400 Subject: [PATCH 12/24] KVM: avoid NULL pointer dereference in kvm_dirty_ring_push kvm_vcpu_release() will call kvm_dirty_ring_free(), freeing ring->dirty_gfns and setting it to NULL. Afterwards, it calls kvm_arch_vcpu_destroy(). 
However, if closing the file descriptor races with KVM_RUN in such a way that vcpu->arch.st.preempted == 0, the following call stack leads to a NULL pointer dereference in kvm_dirty_ring_push(): mark_page_dirty_in_slot+0x192/0x270 arch/x86/kvm/../../../virt/kvm/kvm_main.c:3171 kvm_steal_time_set_preempted arch/x86/kvm/x86.c:4600 [inline] kvm_arch_vcpu_put+0x34e/0x5b0 arch/x86/kvm/x86.c:4618 vcpu_put+0x1b/0x70 arch/x86/kvm/../../../virt/kvm/kvm_main.c:211 vmx_free_vcpu+0xcb/0x130 arch/x86/kvm/vmx/vmx.c:6985 kvm_arch_vcpu_destroy+0x76/0x290 arch/x86/kvm/x86.c:11219 kvm_vcpu_destroy arch/x86/kvm/../../../virt/kvm/kvm_main.c:441 [inline] The fix is to release the dirty page ring after kvm_arch_vcpu_destroy has run. Reported-by: Qiuhao Li Reported-by: Gaoning Pan Reported-by: Yongkang Jia Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 70e05af5ebead..b22f380e3347d 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -434,8 +434,8 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu) { - kvm_dirty_ring_free(&vcpu->dirty_ring); kvm_arch_vcpu_destroy(vcpu); + kvm_dirty_ring_free(&vcpu->dirty_ring); /* * No need for rcu_read_lock as VCPU_RUN is the only place that changes From 02de9331c4d0c6bddac9c5fa66d91f70adf8612b Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Wed, 16 Mar 2022 13:51:29 +0100 Subject: [PATCH 13/24] KVM: selftests: get-reg-list: Add KVM_REG_ARM_FW_REG(3) When testing a kernel with commit a5905d6af492 ("KVM: arm64: Allow SMCCC_ARCH_WORKAROUND_3 to be discovered and migrated") get-reg-list output vregs: Number blessed registers: 234 vregs: Number registers: 238 vregs: There are 1 new registers. Consider adding them to the blessed reg list with the following lines: KVM_REG_ARM_FW_REG(3), vregs: PASS ...
That output inspired two changes: 1) add the new register to the blessed list and 2) explain why "Number registers" is actually four larger than "Number blessed registers" (on the system used for testing), even though only one register is being stated as new. The reason is that some registers are host dependent and they get filtered out when comparing with the blessed list. The system used for the test apparently had three filtered registers. Signed-off-by: Andrew Jones Acked-by: Marc Zyngier Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220316125129.392128-1-drjones@redhat.com --- tools/testing/selftests/kvm/aarch64/get-reg-list.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list.c b/tools/testing/selftests/kvm/aarch64/get-reg-list.c index f12147c43464e..0b571f3fe64ce 100644 --- a/tools/testing/selftests/kvm/aarch64/get-reg-list.c +++ b/tools/testing/selftests/kvm/aarch64/get-reg-list.c @@ -503,8 +503,13 @@ static void run_test(struct vcpu_config *c) ++missing_regs; if (new_regs || missing_regs) { + n = 0; + for_each_reg_filtered(i) + ++n; + printf("%s: Number blessed registers: %5lld\n", config_name(c), blessed_n); - printf("%s: Number registers: %5lld\n", config_name(c), reg_list->n); + printf("%s: Number registers: %5lld (includes %lld filtered registers)\n", + config_name(c), reg_list->n, reg_list->n - n); } if (new_regs) { @@ -683,9 +688,10 @@ static __u64 base_regs[] = { KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[4]), KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.fpsr), KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.fpcr), - KVM_REG_ARM_FW_REG(0), - KVM_REG_ARM_FW_REG(1), - KVM_REG_ARM_FW_REG(2), + KVM_REG_ARM_FW_REG(0), /* KVM_REG_ARM_PSCI_VERSION */ + KVM_REG_ARM_FW_REG(1), /* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1 */ + KVM_REG_ARM_FW_REG(2), /* 
KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2 */ + KVM_REG_ARM_FW_REG(3), /* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3 */ ARM64_SYS_REG(3, 3, 14, 3, 1), /* CNTV_CTL_EL0 */ ARM64_SYS_REG(3, 3, 14, 3, 2), /* CNTV_CVAL_EL0 */ ARM64_SYS_REG(3, 3, 14, 0, 2), From a44a4cc1c969afec97dbb2aedaf6f38eaa6253bb Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 6 Apr 2022 23:56:13 +0000 Subject: [PATCH 14/24] KVM: Don't create VM debugfs files outside of the VM directory Unfortunately, there is no guarantee that KVM was able to instantiate a debugfs directory for a particular VM. To that end, KVM shouldn't even attempt to create new debugfs files in this case. If the specified parent dentry is NULL, debugfs_create_file() will instantiate files at the root of debugfs. For arm64, it is possible to create the vgic-state file outside of a VM directory, the file is not cleaned up when a VM is destroyed. Nonetheless, the corresponding struct kvm is freed when the VM is destroyed. Nip the problem in the bud for all possible errant debugfs file creations by initializing kvm->debugfs_dentry to -ENOENT. In so doing, debugfs_create_file() will fail instead of creating the file in the root directory. 
Cc: stable@kernel.org Fixes: 929f45e32499 ("kvm: no need to check return value of debugfs_create functions") Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220406235615.1447180-2-oupton@google.com --- virt/kvm/kvm_main.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 70e05af5ebead..e39a6f56fc47e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -932,7 +932,7 @@ static void kvm_destroy_vm_debugfs(struct kvm *kvm) int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc + kvm_vcpu_stats_header.num_desc; - if (!kvm->debugfs_dentry) + if (IS_ERR(kvm->debugfs_dentry)) return; debugfs_remove_recursive(kvm->debugfs_dentry); @@ -955,6 +955,12 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc + kvm_vcpu_stats_header.num_desc; + /* + * Force subsequent debugfs file creations to fail if the VM directory + * is not created. + */ + kvm->debugfs_dentry = ERR_PTR(-ENOENT); + if (!debugfs_initialized()) return 0; @@ -5479,7 +5485,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) } add_uevent_var(env, "PID=%d", kvm->userspace_pid); - if (kvm->debugfs_dentry) { + if (!IS_ERR(kvm->debugfs_dentry)) { char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT); if (p) { From 386ba265a8197716076a88853244f4437b92b167 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 6 Apr 2022 23:56:14 +0000 Subject: [PATCH 15/24] selftests: KVM: Don't leak GIC FD across dirty log test iterations dirty_log_perf_test instantiates a VGICv3 for the guest (if supported by hardware) to reduce the overhead of guest exits. However, the test does not actually close the GIC fd when cleaning up the VM between test iterations, meaning that the VM is never actually destroyed in the kernel. 
While this is generally a bad idea, the bug was detected from the kernel spewing about duplicate debugfs entries as subsequent VMs happen to reuse the same FD even though the debugfs directory is still present. Abstract away the notion of setup/cleanup of the GIC FD from the test by creating arch-specific helpers for test setup/cleanup. Close the GIC FD on VM cleanup and do nothing for the other architectures. Fixes: c340f7899af6 ("KVM: selftests: Add vgic initialization for dirty log perf test for ARM") Reviewed-by: Jing Zhang Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220406235615.1447180-3-oupton@google.com --- .../selftests/kvm/dirty_log_perf_test.c | 34 +++++++++++++++++-- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c index c9d9e513ca044..7b47ae4f952e6 100644 --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -18,11 +18,40 @@ #include "test_util.h" #include "perf_test_util.h" #include "guest_modes.h" + #ifdef __aarch64__ #include "aarch64/vgic.h" #define GICD_BASE_GPA 0x8000000ULL #define GICR_BASE_GPA 0x80A0000ULL + +static int gic_fd; + +static void arch_setup_vm(struct kvm_vm *vm, unsigned int nr_vcpus) +{ + /* + * The test can still run even if hardware does not support GICv3, as it + * is only an optimization to reduce guest exits. 
+ */ + gic_fd = vgic_v3_setup(vm, nr_vcpus, 64, GICD_BASE_GPA, GICR_BASE_GPA); +} + +static void arch_cleanup_vm(struct kvm_vm *vm) +{ + if (gic_fd > 0) + close(gic_fd); +} + +#else /* __aarch64__ */ + +static void arch_setup_vm(struct kvm_vm *vm, unsigned int nr_vcpus) +{ +} + +static void arch_cleanup_vm(struct kvm_vm *vm) +{ +} + #endif /* How many host loops to run by default (one KVM_GET_DIRTY_LOG for each loop)*/ @@ -206,9 +235,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) vm_enable_cap(vm, &cap); } -#ifdef __aarch64__ - vgic_v3_setup(vm, nr_vcpus, 64, GICD_BASE_GPA, GICR_BASE_GPA); -#endif + arch_setup_vm(vm, nr_vcpus); /* Start the iterations */ iteration = 0; @@ -302,6 +329,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) } free_bitmaps(bitmaps, p->slots); + arch_cleanup_vm(vm); perf_test_destroy_vm(vm); } From 21db83846683d3987666505a3ec38f367708199a Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 6 Apr 2022 23:56:15 +0000 Subject: [PATCH 16/24] selftests: KVM: Free the GIC FD when cleaning up in arch_timer In order to correctly destroy a VM, all references to the VM must be freed. The arch_timer selftest creates a VGIC for the guest, which itself holds a reference to the VM. Close the GIC FD when cleaning up a VM. 
Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220406235615.1447180-4-oupton@google.com --- tools/testing/selftests/kvm/aarch64/arch_timer.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/kvm/aarch64/arch_timer.c b/tools/testing/selftests/kvm/aarch64/arch_timer.c index b08d30bf71c51..3b940a101bc07 100644 --- a/tools/testing/selftests/kvm/aarch64/arch_timer.c +++ b/tools/testing/selftests/kvm/aarch64/arch_timer.c @@ -362,11 +362,12 @@ static void test_init_timer_irq(struct kvm_vm *vm) pr_debug("ptimer_irq: %d; vtimer_irq: %d\n", ptimer_irq, vtimer_irq); } +static int gic_fd; + static struct kvm_vm *test_vm_create(void) { struct kvm_vm *vm; unsigned int i; - int ret; int nr_vcpus = test_args.nr_vcpus; vm = vm_create_default_with_vcpus(nr_vcpus, 0, 0, guest_code, NULL); @@ -383,8 +384,8 @@ static struct kvm_vm *test_vm_create(void) ucall_init(vm, NULL); test_init_timer_irq(vm); - ret = vgic_v3_setup(vm, nr_vcpus, 64, GICD_BASE_GPA, GICR_BASE_GPA); - if (ret < 0) { + gic_fd = vgic_v3_setup(vm, nr_vcpus, 64, GICD_BASE_GPA, GICR_BASE_GPA); + if (gic_fd < 0) { print_skip("Failed to create vgic-v3"); exit(KSFT_SKIP); } @@ -395,6 +396,12 @@ static struct kvm_vm *test_vm_create(void) return vm; } +static void test_vm_cleanup(struct kvm_vm *vm) +{ + close(gic_fd); + kvm_vm_free(vm); +} + static void test_print_help(char *name) { pr_info("Usage: %s [-h] [-n nr_vcpus] [-i iterations] [-p timer_period_ms]\n", @@ -478,7 +485,7 @@ int main(int argc, char *argv[]) vm = test_vm_create(); test_run(vm); - kvm_vm_free(vm); + test_vm_cleanup(vm); return 0; } From 8c3ce496bd612bd21679e445f75fcabb6be997b2 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Sat, 9 Apr 2022 09:15:33 +0530 Subject: [PATCH 17/24] RISC-V: KVM: Don't clear hgatp CSR in kvm_arch_vcpu_put() We might have RISC-V systems (such as QEMU) where VMID is not part of the TLB entry tag so these systems will have to flush all 
TLB entries upon any change in hgatp.VMID. Currently, we zero-out hgatp CSR in kvm_arch_vcpu_put() and we re-program hgatp CSR in kvm_arch_vcpu_load(). For above described systems, this will flush all TLB entries whenever VCPU exits to user-space hence reducing performance. This patch fixes above described performance issue by not clearing hgatp CSR in kvm_arch_vcpu_put(). Fixes: 34bde9d8b9e6 ("RISC-V: KVM: Implement VCPU world-switch") Cc: stable@vger.kernel.org Signed-off-by: Anup Patel Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index 624166004e36c..6785aef4cbd46 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -653,8 +653,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) vcpu->arch.isa); kvm_riscv_vcpu_host_fp_restore(&vcpu->arch.host_context); - csr_write(CSR_HGATP, 0); - csr->vsstatus = csr_read(CSR_VSSTATUS); csr->vsie = csr_read(CSR_VSIE); csr->vstvec = csr_read(CSR_VSTVEC); From fac3725364397f9a40a101f089b86ea655a58d06 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Sat, 9 Apr 2022 09:15:44 +0530 Subject: [PATCH 18/24] KVM: selftests: riscv: Set PTE A and D bits in VS-stage page table Supporting hardware updates of PTE A and D bits is optional for any RISC-V implementation so current software strategy is to always set these bits in both G-stage (hypervisor) and VS-stage (guest kernel). If PTE A and D bits are not set by software (hypervisor or guest) then RISC-V implementations not supporting hardware updates of these bits will cause traps even for perfectly valid PTEs. Based on above explanation, the VS-stage page table created by various KVM selftest applications is not correct because PTE A and D bits are not set. This patch fixes VS-stage page table programming of PTE A and D bits for KVM selftests. 
Fixes: 3e06cdf10520 ("KVM: selftests: Add initial support for RISC-V 64-bit") Signed-off-by: Anup Patel Tested-by: Mayuresh Chitale Signed-off-by: Anup Patel --- tools/testing/selftests/kvm/include/riscv/processor.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/include/riscv/processor.h b/tools/testing/selftests/kvm/include/riscv/processor.h index dc284c6bdbc37..eca5c622efd25 100644 --- a/tools/testing/selftests/kvm/include/riscv/processor.h +++ b/tools/testing/selftests/kvm/include/riscv/processor.h @@ -101,7 +101,9 @@ static inline void set_reg(struct kvm_vm *vm, uint32_t vcpuid, uint64_t id, #define PGTBL_PTE_WRITE_SHIFT 2 #define PGTBL_PTE_READ_MASK 0x0000000000000002ULL #define PGTBL_PTE_READ_SHIFT 1 -#define PGTBL_PTE_PERM_MASK (PGTBL_PTE_EXECUTE_MASK | \ +#define PGTBL_PTE_PERM_MASK (PGTBL_PTE_ACCESSED_MASK | \ + PGTBL_PTE_DIRTY_MASK | \ + PGTBL_PTE_EXECUTE_MASK | \ PGTBL_PTE_WRITE_MASK | \ PGTBL_PTE_READ_MASK) #define PGTBL_PTE_VALID_MASK 0x0000000000000001ULL From ebdef0de2dbc40e697adaa6b3408130f7a7b8351 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Sat, 9 Apr 2022 09:15:51 +0530 Subject: [PATCH 19/24] KVM: selftests: riscv: Fix alignment of the guest_hang() function The guest_hang() function is used as the default exception handler for various KVM selftests applications by setting its address in the vstvec CSR. The vstvec CSR requires exception handler base address to be at least 4-byte aligned so this patch fixes alignment of the guest_hang() function.
Fixes: 3e06cdf10520 ("KVM: selftests: Add initial support for RISC-V 64-bit") Signed-off-by: Anup Patel Tested-by: Mayuresh Chitale Signed-off-by: Anup Patel --- tools/testing/selftests/kvm/lib/riscv/processor.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/lib/riscv/processor.c b/tools/testing/selftests/kvm/lib/riscv/processor.c index d377f2603d98a..3961487a4870d 100644 --- a/tools/testing/selftests/kvm/lib/riscv/processor.c +++ b/tools/testing/selftests/kvm/lib/riscv/processor.c @@ -268,7 +268,7 @@ void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent) core.regs.t3, core.regs.t4, core.regs.t5, core.regs.t6); } -static void guest_hang(void) +static void __aligned(16) guest_hang(void) { while (1) ; From 4054eee9290248bf66c5eacb58879c9aaad37f71 Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Sat, 9 Apr 2022 09:16:00 +0530 Subject: [PATCH 20/24] RISC-V: KVM: include missing hwcap.h into vcpu_fp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vcpu_fp uses the riscv_isa_extension mechanism which gets defined in hwcap.h but doesn't include that header file. While it seems to work in most cases, in certain conditions this can lead to build failures like ../arch/riscv/kvm/vcpu_fp.c: In function ‘kvm_riscv_vcpu_fp_reset’: ../arch/riscv/kvm/vcpu_fp.c:22:13: error: implicit declaration of function ‘riscv_isa_extension_available’ [-Werror=implicit-function-declaration] 22 | if (riscv_isa_extension_available(&isa, f) || | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ../arch/riscv/kvm/vcpu_fp.c:22:49: error: ‘f’ undeclared (first use in this function) 22 | if (riscv_isa_extension_available(&isa, f) || Fix this by simply including the necessary header.
Fixes: 0a86512dc113 ("RISC-V: KVM: Factor-out FP virtualization into separate sources") Signed-off-by: Heiko Stuebner Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu_fp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/riscv/kvm/vcpu_fp.c b/arch/riscv/kvm/vcpu_fp.c index 4449a976e5a6b..d4308c5120078 100644 --- a/arch/riscv/kvm/vcpu_fp.c +++ b/arch/riscv/kvm/vcpu_fp.c @@ -11,6 +11,7 @@ #include #include #include +#include #ifdef CONFIG_FPU void kvm_riscv_vcpu_fp_reset(struct kvm_vcpu *vcpu) From 0c8b6641c8410930e2a2f4a437ac3f987fbf9404 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 6 Apr 2022 14:37:13 +0800 Subject: [PATCH 21/24] selftests: kvm: add tsc_scaling_sync to .gitignore The tsc_scaling_sync's binary should be present in the .gitignore file for the git to ignore it. Signed-off-by: Like Xu Message-Id: <20220406063715.55625-3-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 573d93a1d61f0..0b0e4402bba6a 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -34,6 +34,7 @@ /x86_64/state_test /x86_64/svm_vmcall_test /x86_64/svm_int_ctl_test +/x86_64/tsc_scaling_sync /x86_64/sync_regs_test /x86_64/tsc_msrs_test /x86_64/userspace_io_test From af105c9cc9ec8fdc087827a98d4b9dc10d61c358 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 6 Apr 2022 14:37:15 +0800 Subject: [PATCH 22/24] Documentation: KVM: Add SPDX-License-Identifier tag +new file mode 100644 +WARNING: Missing or malformed SPDX-License-Identifier tag in line 1 +#27: FILE: Documentation/virt/kvm/x86/errata.rst:1: Opportunistically update all other non-added KVM documents and remove a new extra blank line at EOF for x86/errata.rst. 
Signed-off-by: Like Xu Message-Id: <20220406063715.55625-5-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/vcpu-requests.rst | 2 ++ Documentation/virt/kvm/x86/amd-memory-encryption.rst | 2 ++ Documentation/virt/kvm/x86/errata.rst | 2 +- Documentation/virt/kvm/x86/running-nested-guests.rst | 2 ++ 4 files changed, 7 insertions(+), 1 deletion(-) diff --git a/Documentation/virt/kvm/vcpu-requests.rst b/Documentation/virt/kvm/vcpu-requests.rst index db43ee571f5aa..31f62b64e07b9 100644 --- a/Documentation/virt/kvm/vcpu-requests.rst +++ b/Documentation/virt/kvm/vcpu-requests.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + ================= KVM VCPU Requests ================= diff --git a/Documentation/virt/kvm/x86/amd-memory-encryption.rst b/Documentation/virt/kvm/x86/amd-memory-encryption.rst index 1c6847fff3049..2d307811978c4 100644 --- a/Documentation/virt/kvm/x86/amd-memory-encryption.rst +++ b/Documentation/virt/kvm/x86/amd-memory-encryption.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + ====================================== Secure Encrypted Virtualization (SEV) ====================================== diff --git a/Documentation/virt/kvm/x86/errata.rst b/Documentation/virt/kvm/x86/errata.rst index 806f049b69755..410e0aa634939 100644 --- a/Documentation/virt/kvm/x86/errata.rst +++ b/Documentation/virt/kvm/x86/errata.rst @@ -1,3 +1,4 @@ +.. SPDX-License-Identifier: GPL-2.0 ======================================= Known limitations of CPU virtualization @@ -36,4 +37,3 @@ Nested virtualization features ------------------------------ TBD - diff --git a/Documentation/virt/kvm/x86/running-nested-guests.rst b/Documentation/virt/kvm/x86/running-nested-guests.rst index bd70c69468aeb..a27e6768d9008 100644 --- a/Documentation/virt/kvm/x86/running-nested-guests.rst +++ b/Documentation/virt/kvm/x86/running-nested-guests.rst @@ -1,3 +1,5 @@ +.. 
SPDX-License-Identifier: GPL-2.0 + ============================== Running nested guests with KVM ============================== From c538dc792ff7e456d777f585fdf96aa4e781ed66 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Fri, 8 Apr 2022 08:37:10 -0500 Subject: [PATCH 23/24] KVM: SVM: Do not activate AVIC for SEV-enabled guest Since current AVIC implementation cannot support encrypted memory, inhibit AVIC for SEV-enabled guest. Signed-off-by: Suravee Suthikulpanit Message-Id: <20220408133710.54275-1-suravee.suthikulpanit@amd.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm/avic.c | 3 ++- arch/x86/kvm/svm/sev.c | 2 ++ 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0d37ba442de34..92843fcdc1cfa 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1052,6 +1052,7 @@ enum kvm_apicv_inhibit { APICV_INHIBIT_REASON_X2APIC, APICV_INHIBIT_REASON_BLOCKIRQ, APICV_INHIBIT_REASON_ABSENT, + APICV_INHIBIT_REASON_SEV, }; struct kvm_arch { diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index a1cf9c31273b7..421619540ff9d 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -837,7 +837,8 @@ bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason) BIT(APICV_INHIBIT_REASON_IRQWIN) | BIT(APICV_INHIBIT_REASON_PIT_REINJ) | BIT(APICV_INHIBIT_REASON_X2APIC) | - BIT(APICV_INHIBIT_REASON_BLOCKIRQ); + BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | + BIT(APICV_INHIBIT_REASON_SEV); return supported & BIT(reason); } diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index c2fe89ecdb2dd..537aaddc852fc 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -260,6 +260,8 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) INIT_LIST_HEAD(&sev->regions_list); INIT_LIST_HEAD(&sev->mirror_vms); + kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_SEV); + 
return 0; e_free: From 42dcbe7d8bac997eef4c379e61d9121a15ed4e36 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Thu, 7 Apr 2022 22:10:13 +0200 Subject: [PATCH 24/24] KVM: x86: hyper-v: Avoid writing to TSC page without an active vCPU The following WARN is triggered from kvm_vm_ioctl_set_clock(): WARNING: CPU: 10 PID: 579353 at arch/x86/kvm/../../../virt/kvm/kvm_main.c:3161 mark_page_dirty_in_slot+0x6c/0x80 [kvm] ... CPU: 10 PID: 579353 Comm: qemu-system-x86 Tainted: G W O 5.16.0.stable #20 Hardware name: LENOVO 20UF001CUS/20UF001CUS, BIOS R1CET65W(1.34 ) 06/17/2021 RIP: 0010:mark_page_dirty_in_slot+0x6c/0x80 [kvm] ... Call Trace: ? kvm_write_guest+0x114/0x120 [kvm] kvm_hv_invalidate_tsc_page+0x9e/0xf0 [kvm] kvm_arch_vm_ioctl+0xa26/0xc50 [kvm] ? schedule+0x4e/0xc0 ? __cond_resched+0x1a/0x50 ? futex_wait+0x166/0x250 ? __send_signal+0x1f1/0x3d0 kvm_vm_ioctl+0x747/0xda0 [kvm] ... The WARN was introduced by commit 03c0304a86bc ("KVM: Warn if mark_page_dirty() is called without an active vCPU") but the change seems to be correct (unlike Hyper-V TSC page update mechanism). In fact, there's no real need to actually write to guest memory to invalidate TSC page, this can be done by the first vCPU which goes through kvm_guest_time_update(). 
Reported-by: Maxim Levitsky Reported-by: Naresh Kamboju Suggested-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Message-Id: <20220407201013.963226-1-vkuznets@redhat.com> --- arch/x86/include/asm/kvm_host.h | 4 +--- arch/x86/kvm/hyperv.c | 40 +++++++-------------------------- arch/x86/kvm/hyperv.h | 2 +- arch/x86/kvm/x86.c | 7 +++--- 4 files changed, 13 insertions(+), 40 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 92843fcdc1cfa..e0c0f0e1f754c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -974,12 +974,10 @@ enum hv_tsc_page_status { HV_TSC_PAGE_UNSET = 0, /* TSC page MSR was written by the guest, update pending */ HV_TSC_PAGE_GUEST_CHANGED, - /* TSC page MSR was written by KVM userspace, update pending */ + /* TSC page update was triggered from the host side */ HV_TSC_PAGE_HOST_CHANGED, /* TSC page was properly set up and is currently active */ HV_TSC_PAGE_SET, - /* TSC page is currently being updated and therefore is inactive */ - HV_TSC_PAGE_UPDATING, /* TSC page was set up with an inaccessible GPA */ HV_TSC_PAGE_BROKEN, }; diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 123b677111c58..46f9dfb604694 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1135,11 +1135,13 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence)); BUILD_BUG_ON(offsetof(struct ms_hyperv_tsc_page, tsc_sequence) != 0); + mutex_lock(&hv->hv_lock); + if (hv->hv_tsc_page_status == HV_TSC_PAGE_BROKEN || + hv->hv_tsc_page_status == HV_TSC_PAGE_SET || hv->hv_tsc_page_status == HV_TSC_PAGE_UNSET) - return; + goto out_unlock; - mutex_lock(&hv->hv_lock); if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)) goto out_unlock; @@ -1201,45 +1203,19 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, mutex_unlock(&hv->hv_lock); } -void kvm_hv_invalidate_tsc_page(struct kvm *kvm) +void 
kvm_hv_request_tsc_page_update(struct kvm *kvm) { struct kvm_hv *hv = to_kvm_hv(kvm); - u64 gfn; - int idx; - - if (hv->hv_tsc_page_status == HV_TSC_PAGE_BROKEN || - hv->hv_tsc_page_status == HV_TSC_PAGE_UNSET || - tsc_page_update_unsafe(hv)) - return; mutex_lock(&hv->hv_lock); - if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)) - goto out_unlock; - - /* Preserve HV_TSC_PAGE_GUEST_CHANGED/HV_TSC_PAGE_HOST_CHANGED states */ - if (hv->hv_tsc_page_status == HV_TSC_PAGE_SET) - hv->hv_tsc_page_status = HV_TSC_PAGE_UPDATING; + if (hv->hv_tsc_page_status == HV_TSC_PAGE_SET && + !tsc_page_update_unsafe(hv)) + hv->hv_tsc_page_status = HV_TSC_PAGE_HOST_CHANGED; - gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT; - - hv->tsc_ref.tsc_sequence = 0; - - /* - * Take the srcu lock as memslots will be accessed to check the gfn - * cache generation against the memslots generation. - */ - idx = srcu_read_lock(&kvm->srcu); - if (kvm_write_guest(kvm, gfn_to_gpa(gfn), - &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence))) - hv->hv_tsc_page_status = HV_TSC_PAGE_BROKEN; - srcu_read_unlock(&kvm->srcu, idx); - -out_unlock: mutex_unlock(&hv->hv_lock); } - static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr) { if (!hv_vcpu->enforce_cpuid) diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index e19c00ee9ab33..da2737f2a956c 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -137,7 +137,7 @@ void kvm_hv_process_stimers(struct kvm_vcpu *vcpu); void kvm_hv_setup_tsc_page(struct kvm *kvm, struct pvclock_vcpu_time_info *hv_clock); -void kvm_hv_invalidate_tsc_page(struct kvm *kvm); +void kvm_hv_request_tsc_page_update(struct kvm *kvm); void kvm_hv_init_vm(struct kvm *kvm); void kvm_hv_destroy_vm(struct kvm *kvm); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index de49a88df1c2e..547ba00ef64fc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2901,7 +2901,7 @@ static void kvm_end_pvclock_update(struct kvm *kvm) static void 
kvm_update_masterclock(struct kvm *kvm) { - kvm_hv_invalidate_tsc_page(kvm); + kvm_hv_request_tsc_page_update(kvm); kvm_start_pvclock_update(kvm); pvclock_update_vm_gtod_copy(kvm); kvm_end_pvclock_update(kvm); @@ -3113,8 +3113,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) offsetof(struct compat_vcpu_info, time)); if (vcpu->xen.vcpu_time_info_set) kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_time_info_cache, 0); - if (!v->vcpu_idx) - kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock); + kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock); return 0; } @@ -6241,7 +6240,7 @@ static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp) if (data.flags & ~KVM_CLOCK_VALID_FLAGS) return -EINVAL; - kvm_hv_invalidate_tsc_page(kvm); + kvm_hv_request_tsc_page_update(kvm); kvm_start_pvclock_update(kvm); pvclock_update_vm_gtod_copy(kvm);