From 46c49372e10ea161beaa78936a64d2b49531dffb Mon Sep 17 00:00:00 2001
From: Paolo Bonzini
Date: Tue, 25 Feb 2025 13:01:41 -0500
Subject: [PATCH 1/4] KVM: x86: move vm_destroy callback at end of kvm_arch_destroy_vm

TDX needs to free the TDR control structures last, after all paging
structures have been torn down; move the vm_destroy callback to a
suitable place.  The new place is also okay for AMD; the main difference
is that the MMU has been torn down and, if anything, that is better done
before the SNP ASID is released.

Extracted from a patch by Yan Zhao.

Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4b64ab350bcd..bd2b71d4e64f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -12880,7 +12880,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 	}
 	kvm_unload_vcpu_mmus(kvm);
 	kvm_destroy_vcpus(kvm);
-	kvm_x86_call(vm_destroy)(kvm);
 	kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
 	kvm_pic_destroy(kvm);
 	kvm_ioapic_destroy(kvm);
@@ -12890,6 +12889,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 	kvm_page_track_cleanup(kvm);
 	kvm_xen_destroy_vm(kvm);
 	kvm_hv_destroy_vm(kvm);
+	kvm_x86_call(vm_destroy)(kvm);
 }
 
 static void memslot_rmap_free(struct kvm_memory_slot *slot)

From 5f3b30b2b0d92e5a5050959a0994b2f3c4e1e6d0 Mon Sep 17 00:00:00 2001
From: Isaku Yamahata
Date: Sat, 12 Oct 2024 00:55:55 -0700
Subject: [PATCH 2/4] KVM: x86: Push down setting vcpu.arch.user_set_tsc

Push down setting vcpu.arch.user_set_tsc to true from
kvm_synchronize_tsc() into __kvm_synchronize_tsc(), so that the two
callers don't have to modify user_set_tsc directly.  This is
preparation: a later patch will prohibit changing TSC synchronization
for TDX guests by modifying __kvm_synchronize_tsc(), and the caller
sites should not need to be touched or to change user_set_tsc again.

Signed-off-by: Isaku Yamahata
Message-ID: <62b1a7a35d6961844786b6e47e8ecb774af7a228.1728719037.git.isaku.yamahata@intel.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/x86.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bd2b71d4e64f..2da75bbf7f94 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2626,12 +2626,15 @@ static inline bool kvm_check_tsc_unstable(void)
  * participates in.
  */
 static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc,
-				  u64 ns, bool matched)
+				  u64 ns, bool matched, bool user_set_tsc)
 {
 	struct kvm *kvm = vcpu->kvm;
 
 	lockdep_assert_held(&kvm->arch.tsc_write_lock);
 
+	if (user_set_tsc)
+		vcpu->kvm->arch.user_set_tsc = true;
+
 	/*
 	 * We also track th most recent recorded KHZ, write and time to
 	 * allow the matching interval to be extended at each write.
@@ -2717,8 +2720,6 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 *user_value)
 		}
 	}
 
-	if (user_value)
-		kvm->arch.user_set_tsc = true;
 
 	/*
 	 * For a reliable TSC, we can match TSC offsets, and for an unstable
@@ -2738,7 +2739,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 *user_value)
 		matched = true;
 	}
 
-	__kvm_synchronize_tsc(vcpu, offset, data, ns, matched);
+	__kvm_synchronize_tsc(vcpu, offset, data, ns, matched, !!user_value);
 
 	raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
 }
@@ -5725,8 +5726,7 @@ static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu,
 		tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset;
 		ns = get_kvmclock_base_ns();
 
-		kvm->arch.user_set_tsc = true;
-		__kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched);
+		__kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched, true);
 		raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
 
 		r = 0;

From adafea1106004dba26ea12d3193f4f132b1f0065 Mon Sep 17 00:00:00 2001
From: Isaku Yamahata
Date: Sat, 12 Oct 2024 00:55:56 -0700
Subject: [PATCH 3/4] KVM: x86: Add infrastructure for secure TSC

Add a guest_tsc_protected member to struct kvm_vcpu_arch and prohibit
changing the TSC offset/multiplier when guest_tsc_protected is true.

X86 confidential computing technology defines a protected guest TSC so
that the VMM can't change the TSC offset/multiplier once the vCPU is
initialized.  SEV-SNP defines Secure TSC as optional, whereas TDX
mandates it.

KVM has common x86 logic that tries to guess or adjust the TSC
offset/multiplier for better guest TSC and TSC interrupt latency at KVM
vCPU creation (kvm_arch_vcpu_postcreate()), vCPU migration over pCPU
(kvm_arch_vcpu_load()), vCPU TSC device attributes
(kvm_arch_tsc_set_attr()) and guest/host writes to the TSC or TSC adjust
MSR (kvm_set_msr_common()).

This logic conflicts with a protected TSC because the VMM can't change
the TSC offset/multiplier.  Because KVM emulates the TSC timer or the
TSC deadline timer with the TSC offset/multiplier, the TSC timer
interrupt is injected into the guest at the wrong time if the KVM TSC
offset differs from what the TDX module determined.

This issue was originally found with the cyclictest of rt-tests [1], as
the latency in the TDX case was worse than the VMX value plus the TDX
SEAMCALL overhead.  It turned out that the KVM TSC offset differed from
what the TDX module determines.

Disable or ignore the KVM logic that changes or adjusts the TSC
offset/multiplier, thus keeping the KVM TSC offset/multiplier the same
as the value of the TDX module.  Writes to MSR_IA32_TSC are also blocked
as they amount to a change in the TSC offset.

[1] https://git.kernel.org/pub/scm/utils/rt-tests/rt-tests.git
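For readability only (the hunks in the diff below are authoritative),
the prologue of __kvm_synchronize_tsc() reads roughly as follows once
this patch and the previous one are applied; the rest of the function is
unchanged and elided:

static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc,
				  u64 ns, bool matched, bool user_set_tsc)
{
	struct kvm *kvm = vcpu->kvm;

	lockdep_assert_held(&kvm->arch.tsc_write_lock);

	/* A protected guest TSC (e.g. TDX) must never be rewritten by KVM. */
	if (vcpu->arch.guest_tsc_protected)
		return;

	/* Track userspace-initiated writes here rather than in the callers. */
	if (user_set_tsc)
		vcpu->kvm->arch.user_set_tsc = true;

	/* ... */
}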
Reported-by: Marcelo Tosatti
Signed-off-by: Isaku Yamahata
Message-ID: <3a7444aec08042fe205666864b6858910e86aa98.1728719037.git.isaku.yamahata@intel.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/x86.c              | 11 +++++++++--
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 32ae3aa50c7e..ee55d1f753e8 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1053,6 +1053,7 @@ struct kvm_vcpu_arch {
 
 	/* Protected Guests */
 	bool guest_state_protected;
+	bool guest_tsc_protected;
 
 	/*
 	 * Set when PDPTS were loaded directly by the userspace without
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2da75bbf7f94..053547fc6f89 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2569,6 +2569,9 @@ EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_multiplier);
 
 static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset)
 {
+	if (vcpu->arch.guest_tsc_protected)
+		return;
+
 	trace_kvm_write_tsc_offset(vcpu->vcpu_id,
 				   vcpu->arch.l1_tsc_offset,
 				   l1_offset);
@@ -2632,6 +2635,9 @@ static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc,
 
 	lockdep_assert_held(&kvm->arch.tsc_write_lock);
 
+	if (vcpu->arch.guest_tsc_protected)
+		return;
+
 	if (user_set_tsc)
 		vcpu->kvm->arch.user_set_tsc = true;
 
@@ -3907,7 +3913,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_IA32_TSC:
 		if (msr_info->host_initiated) {
 			kvm_synchronize_tsc(vcpu, &data);
-		} else {
+		} else if (!vcpu->arch.guest_tsc_protected) {
 			u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
 			adjust_tsc_offset_guest(vcpu, adj);
 			vcpu->arch.ia32_tsc_adjust_msr += adj;
@@ -4987,7 +4993,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 			u64 offset = kvm_compute_l1_tsc_offset(vcpu,
 						vcpu->arch.last_guest_tsc);
 			kvm_vcpu_write_tsc_offset(vcpu, offset);
-			vcpu->arch.tsc_catchup = 1;
+			if (!vcpu->arch.guest_tsc_protected)
+				vcpu->arch.tsc_catchup = 1;
 		}
 
 		if (kvm_lapic_hv_timer_in_use(vcpu))

From 74c1807f6c4feddb3c3cb1056c54531d4adbaea6 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini
Date: Thu, 6 Mar 2025 21:29:22 +0100
Subject: [PATCH 4/4] KVM: x86: block KVM_CAP_SYNC_REGS if guest state is protected

KVM_CAP_SYNC_REGS does not make sense for VMs with protected guest
state, since the register values cannot actually be written.  Return 0
when using the VM-level KVM_CHECK_EXTENSION ioctl, and accordingly
return -EINVAL from KVM_RUN if the valid/dirty fields are nonzero.

However, on exit from KVM_RUN userspace could have placed a nonzero
value into kvm_run->kvm_valid_regs, so check guest_state_protected again
and skip store_regs() in that case.
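As an illustration only (not part of this patch), a minimal userspace
sketch of how a VMM might honour the per-VM capability result before
using the sync_regs interface; it assumes an x86 host with <linux/kvm.h>,
an already created VM file descriptor and a mapped kvm_run area, and the
helper name is made up:

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Request synced registers only if this particular VM supports them. */
static void setup_sync_regs(int vm_fd, struct kvm_run *run)
{
	int fields = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_SYNC_REGS);

	if (fields <= 0) {
		/*
		 * Capability absent or guest state protected: leave the
		 * fields at zero, otherwise KVM_RUN would fail with -EINVAL.
		 */
		run->kvm_valid_regs = 0;
		run->kvm_dirty_regs = 0;
		return;
	}

	/* Only ask for fields the kernel advertises for this VM. */
	run->kvm_valid_regs = fields & (KVM_SYNC_X86_REGS | KVM_SYNC_X86_SREGS);
}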
Cc: stable@vger.kernel.org
Fixes: 517987e3fb19 ("KVM: x86: add fields to struct kvm_arch for CoCo features")
Signed-off-by: Paolo Bonzini
Message-ID: <20250306202923.646075-1-pbonzini@redhat.com>
Reviewed-by: Pankaj Gupta
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/x86.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 053547fc6f89..eceb97734646 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4580,6 +4580,11 @@ static bool kvm_is_vm_type_supported(unsigned long type)
 	return type < 32 && (kvm_caps.supported_vm_types & BIT(type));
 }
 
+static inline u32 kvm_sync_valid_fields(struct kvm *kvm)
+{
+	return kvm && kvm->arch.has_protected_state ? 0 : KVM_SYNC_X86_VALID_FIELDS;
+}
+
 int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 {
 	int r = 0;
@@ -4688,7 +4693,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 		break;
 #endif
 	case KVM_CAP_SYNC_REGS:
-		r = KVM_SYNC_X86_VALID_FIELDS;
+		r = kvm_sync_valid_fields(kvm);
 		break;
 	case KVM_CAP_ADJUST_CLOCK:
 		r = KVM_CLOCK_VALID_FLAGS;
@@ -11481,6 +11486,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 {
 	struct kvm_queued_exception *ex = &vcpu->arch.exception;
 	struct kvm_run *kvm_run = vcpu->run;
+	u32 sync_valid_fields;
 	int r;
 
 	r = kvm_mmu_post_init_vm(vcpu->kvm);
@@ -11526,8 +11532,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 		goto out;
 	}
 
-	if ((kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) ||
-	    (kvm_run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)) {
+	sync_valid_fields = kvm_sync_valid_fields(vcpu->kvm);
+	if ((kvm_run->kvm_valid_regs & ~sync_valid_fields) ||
+	    (kvm_run->kvm_dirty_regs & ~sync_valid_fields)) {
 		r = -EINVAL;
 		goto out;
 	}
@@ -11585,7 +11592,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 
 out:
 	kvm_put_guest_fpu(vcpu);
-	if (kvm_run->kvm_valid_regs)
+	if (kvm_run->kvm_valid_regs && likely(!vcpu->arch.guest_state_protected))
 		store_regs(vcpu);
 	post_kvm_run_save(vcpu);
 	kvm_vcpu_srcu_read_unlock(vcpu);
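Finally, a hypothetical vendor-side sketch (not part of this series; the
function names are made up and the actual TDX/SEV-SNP enabling patches
are separate) of how a CoCo backend would be expected to opt in to the
checks added by patches 3 and 4:

/* Hypothetical example: real backends set these flags from their own init paths. */
static int coco_example_vm_init(struct kvm *kvm)
{
	kvm->arch.has_protected_state = true;	/* KVM_CAP_SYNC_REGS now reports 0 */
	return 0;
}

static int coco_example_vcpu_create(struct kvm_vcpu *vcpu)
{
	vcpu->arch.guest_state_protected = true;	/* KVM_RUN skips store_regs() */
	vcpu->arch.guest_tsc_protected = true;		/* TSC offset/multiplier is frozen */
	return 0;
}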