From d9c5ed0a9b529ee1103a4b68cec86112b26380e0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 31 Jan 2025 17:38:17 -0800 Subject: [PATCH 01/11] KVM: x86: Don't take kvm->lock when iterating over vCPUs in suspend notifier When queueing vCPU PVCLOCK updates in response to SUSPEND or HIBERNATE, don't take kvm->lock as doing so can trigger a largely theoretical deadlock, it is perfectly safe to iterate over the xarray of vCPUs without holding kvm->lock, and kvm->lock doesn't protect kvm_set_guest_paused() in any way (pv_time.active and pvclock_set_guest_stopped_request are protected by vcpu->mutex, not kvm->lock). Reported-by: syzbot+352e553a86e0d75f5120@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/677c0f36.050a0220.3b3668.0014.GAE@google.com Fixes: 7d62874f69d7 ("kvm: x86: implement KVM PM-notifier") Reviewed-by: Paul Durrant Link: https://lore.kernel.org/r/20250201013827.680235-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8e77e61d4fbd..b83d58f9f0b4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6907,7 +6907,6 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm) unsigned long i; int ret = 0; - mutex_lock(&kvm->lock); kvm_for_each_vcpu(i, vcpu, kvm) { if (!vcpu->arch.pv_time.active) continue; @@ -6919,7 +6918,6 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm) break; } } - mutex_unlock(&kvm->lock); return ret ? NOTIFY_BAD : NOTIFY_DONE; } From 4198f38aed24f8f4634867f640cc60c65789b4ec Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 31 Jan 2025 17:38:18 -0800 Subject: [PATCH 02/11] KVM: x86: Eliminate "handling" of impossible errors during SUSPEND Drop KVM's handling of kvm_set_guest_paused() failure when reacting to a SUSPEND notification, as kvm_set_guest_paused() only "fails" if the vCPU isn't using kvmclock, and KVM's notifier callback pre-checks that kvmclock is active. I.e. barring some bizarre edge case that shouldn't be treated as an error in the first place, kvm_arch_suspend_notifier() can't fail. Reviewed-by: Paul Durrant Link: https://lore.kernel.org/r/20250201013827.680235-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b83d58f9f0b4..fa409d540d83 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6905,21 +6905,15 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm) { struct kvm_vcpu *vcpu; unsigned long i; - int ret = 0; - - kvm_for_each_vcpu(i, vcpu, kvm) { - if (!vcpu->arch.pv_time.active) - continue; - ret = kvm_set_guest_paused(vcpu); - if (ret) { - kvm_err("Failed to pause guest VCPU%d: %d\n", - vcpu->vcpu_id, ret); - break; - } - } + /* + * Ignore the return, marking the guest paused only "fails" if the vCPU + * isn't using kvmclock; continuing on is correct and desirable. + */ + kvm_for_each_vcpu(i, vcpu, kvm) + (void)kvm_set_guest_paused(vcpu); - return ret ? NOTIFY_BAD : NOTIFY_DONE; + return NOTIFY_DONE; } int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state) From aceb04f571e9fb85d627ec854ea9a012c734fbc2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 31 Jan 2025 17:38:19 -0800 Subject: [PATCH 03/11] KVM: x86: Drop local pvclock_flags variable in kvm_guest_time_update() Drop the local pvclock_flags in kvm_guest_time_update(), the local variable is immediately shoved into the per-vCPU "cache", i.e. the local variable serves no purpose. No functional change intended. Reviewed-by: Paul Durrant Link: https://lore.kernel.org/r/20250201013827.680235-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fa409d540d83..08e9ea87994b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3178,7 +3178,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) struct kvm_arch *ka = &v->kvm->arch; s64 kernel_ns; u64 tsc_timestamp, host_tsc; - u8 pvclock_flags; bool use_master_clock; #ifdef CONFIG_KVM_XEN /* @@ -3261,11 +3260,9 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) vcpu->last_guest_tsc = tsc_timestamp; /* If the host uses TSC clocksource, then it is stable */ - pvclock_flags = 0; + vcpu->hv_clock.flags = 0; if (use_master_clock) - pvclock_flags |= PVCLOCK_TSC_STABLE_BIT; - - vcpu->hv_clock.flags = pvclock_flags; + vcpu->hv_clock.flags |= PVCLOCK_TSC_STABLE_BIT; if (vcpu->pv_time.active) kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0, false); From 6c4927a4b7b8a2b308f6183cf3d743370419c6b3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 31 Jan 2025 17:38:20 -0800 Subject: [PATCH 04/11] KVM: x86: Process "guest stopped request" once per guest time update Handle "guest stopped" requests once per guest time update in preparation of restoring KVM's historical behavior of setting PVCLOCK_GUEST_STOPPED for kvmclock and only kvmclock. For now, simply move the code to minimize the probability of an unintentional change in functionally. Note, in practice, all clocks are guaranteed to see the request (or not) even though each PV clock processes the request individual, as KVM holds vcpu->mutex (blocks KVM_KVMCLOCK_CTRL) and it should be impossible for KVM's suspend notifier to run while KVM is handling requests. And because the helper updates the reference flags, all subsequent PV clock updates will pick up PVCLOCK_GUEST_STOPPED. Note #2, once PVCLOCK_GUEST_STOPPED is restricted to kvmclock, the horrific #ifdef will go away. Cc: Paul Durrant Cc: David Woodhouse Reviewed-by: Paul Durrant Link: https://lore.kernel.org/r/20250201013827.680235-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 08e9ea87994b..5b220e999dd8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3150,11 +3150,6 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v, /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ vcpu->hv_clock.flags |= (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED); - if (vcpu->pvclock_set_guest_stopped_request) { - vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED; - vcpu->pvclock_set_guest_stopped_request = false; - } - memcpy(guest_hv_clock, &vcpu->hv_clock, sizeof(*guest_hv_clock)); if (force_tsc_unstable) @@ -3264,6 +3259,18 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) if (use_master_clock) vcpu->hv_clock.flags |= PVCLOCK_TSC_STABLE_BIT; + if (vcpu->pv_time.active +#ifdef CONFIG_KVM_XEN + || vcpu->xen.vcpu_info_cache.active + || vcpu->xen.vcpu_time_info_cache.active +#endif + ) { + if (vcpu->pvclock_set_guest_stopped_request) { + vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED; + vcpu->pvclock_set_guest_stopped_request = false; + } + } + if (vcpu->pv_time.active) kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0, false); #ifdef CONFIG_KVM_XEN From ca28aa63918b304d77803d5ae952ed875fdad56b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 31 Jan 2025 17:38:21 -0800 Subject: [PATCH 05/11] KVM: x86/xen: Use guest's copy of pvclock when starting timer Use the guest's copy of its pvclock when starting a Xen timer, as KVM's reference copy may not be up-to-date, i.e. may yield a false positive of sorts. In the unlikely scenario that the guest is starting a Xen timer and has used a Xen pvclock in the past, but has since but turned it "off", then vcpu->arch.hv_clock may be stale, as KVM's reference copy is updated if and only if at least one pvclock is enabled. Furthermore, vcpu->arch.hv_clock is currently used by three different pvclocks: kvmclock, Xen, and Xen compat. While it's extremely unlikely a guest would ever enable multiple pvclocks, effectively sharing KVM's reference clock could yield very weird behavior. Using the guest's active Xen pvclock instead of KVM's reference will allow dropping KVM's reference copy. Fixes: 451a707813ae ("KVM: x86/xen: improve accuracy of Xen timers") Cc: Paul Durrant Cc: David Woodhouse Reviewed-by: Paul Durrant Link: https://lore.kernel.org/r/20250201013827.680235-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/xen.c | 65 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 60 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index a909b817b9c0..300a79f1fae5 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -150,11 +150,46 @@ static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer) return HRTIMER_NORESTART; } +static int xen_get_guest_pvclock(struct kvm_vcpu *vcpu, + struct pvclock_vcpu_time_info *hv_clock, + struct gfn_to_pfn_cache *gpc, + unsigned int offset) +{ + unsigned long flags; + int r; + + read_lock_irqsave(&gpc->lock, flags); + while (!kvm_gpc_check(gpc, offset + sizeof(*hv_clock))) { + read_unlock_irqrestore(&gpc->lock, flags); + + r = kvm_gpc_refresh(gpc, offset + sizeof(*hv_clock)); + if (r) + return r; + + read_lock_irqsave(&gpc->lock, flags); + } + + memcpy(hv_clock, gpc->khva + offset, sizeof(*hv_clock)); + read_unlock_irqrestore(&gpc->lock, flags); + + /* + * Sanity check TSC shift+multiplier to verify the guest's view of time + * is more or less consistent. + */ + if (hv_clock->tsc_shift != vcpu->arch.hv_clock.tsc_shift || + hv_clock->tsc_to_system_mul != vcpu->arch.hv_clock.tsc_to_system_mul) + return -EINVAL; + + return 0; +} + static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, bool linux_wa) { + struct kvm_vcpu_xen *xen = &vcpu->arch.xen; int64_t kernel_now, delta; uint64_t guest_now; + int r = -EOPNOTSUPP; /* * The guest provides the requested timeout in absolute nanoseconds @@ -173,10 +208,29 @@ static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, * the absolute CLOCK_MONOTONIC time at which the timer should * fire. */ - if (vcpu->arch.hv_clock.version && vcpu->kvm->arch.use_master_clock && - static_cpu_has(X86_FEATURE_CONSTANT_TSC)) { + do { + struct pvclock_vcpu_time_info hv_clock; uint64_t host_tsc, guest_tsc; + if (!static_cpu_has(X86_FEATURE_CONSTANT_TSC) || + !vcpu->kvm->arch.use_master_clock) + break; + + /* + * If both Xen PV clocks are active, arbitrarily try to use the + * compat clock first, but also try to use the non-compat clock + * if the compat clock is unusable. The two PV clocks hold the + * same information, but it's possible one (or both) is stale + * and/or currently unreachable. + */ + if (xen->vcpu_info_cache.active) + r = xen_get_guest_pvclock(vcpu, &hv_clock, &xen->vcpu_info_cache, + offsetof(struct compat_vcpu_info, time)); + if (r && xen->vcpu_time_info_cache.active) + r = xen_get_guest_pvclock(vcpu, &hv_clock, &xen->vcpu_time_info_cache, 0); + if (r) + break; + if (!IS_ENABLED(CONFIG_64BIT) || !kvm_get_monotonic_and_clockread(&kernel_now, &host_tsc)) { /* @@ -197,9 +251,10 @@ static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, /* Calculate the guest kvmclock as the guest would do it. */ guest_tsc = kvm_read_l1_tsc(vcpu, host_tsc); - guest_now = __pvclock_read_cycles(&vcpu->arch.hv_clock, - guest_tsc); - } else { + guest_now = __pvclock_read_cycles(&hv_clock, guest_tsc); + } while (0); + + if (r) { /* * Without CONSTANT_TSC, get_kvmclock_ns() is the only option. * From 24c16637802695c1721db6ec16a7acd11f9f9167 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 31 Jan 2025 17:38:22 -0800 Subject: [PATCH 06/11] KVM: x86: Don't bleed PVCLOCK_GUEST_STOPPED across PV clocks When updating a specific PV clock, make a full copy of KVM's reference copy/cache so that PVCLOCK_GUEST_STOPPED doesn't bleed across clocks. E.g. in the unlikely scenario the guest has enabled both kvmclock and Xen PV clock, a dangling GUEST_STOPPED in kvmclock would bleed into Xen PV clock. Using a local copy of the pvclock structure also sets the stage for eliminating the per-vCPU copy/cache (only the TSC frequency information actually "needs" to be cached/persisted). Fixes: aa096aa0a05f ("KVM: x86/xen: setup pvclock updates") Reviewed-by: Paul Durrant Link: https://lore.kernel.org/r/20250201013827.680235-7-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5b220e999dd8..906c60cc811a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3123,8 +3123,11 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v, { struct kvm_vcpu_arch *vcpu = &v->arch; struct pvclock_vcpu_time_info *guest_hv_clock; + struct pvclock_vcpu_time_info hv_clock; unsigned long flags; + memcpy(&hv_clock, &vcpu->hv_clock, sizeof(hv_clock)); + read_lock_irqsave(&gpc->lock, flags); while (!kvm_gpc_check(gpc, offset + sizeof(*guest_hv_clock))) { read_unlock_irqrestore(&gpc->lock, flags); @@ -3144,25 +3147,25 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v, * it is consistent. */ - guest_hv_clock->version = vcpu->hv_clock.version = (guest_hv_clock->version + 1) | 1; + guest_hv_clock->version = hv_clock.version = (guest_hv_clock->version + 1) | 1; smp_wmb(); /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ - vcpu->hv_clock.flags |= (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED); + hv_clock.flags |= (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED); - memcpy(guest_hv_clock, &vcpu->hv_clock, sizeof(*guest_hv_clock)); + memcpy(guest_hv_clock, &hv_clock, sizeof(*guest_hv_clock)); if (force_tsc_unstable) guest_hv_clock->flags &= ~PVCLOCK_TSC_STABLE_BIT; smp_wmb(); - guest_hv_clock->version = ++vcpu->hv_clock.version; + guest_hv_clock->version = ++hv_clock.version; kvm_gpc_mark_dirty_in_slot(gpc); read_unlock_irqrestore(&gpc->lock, flags); - trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock); + trace_kvm_pvclock_update(v->vcpu_id, &hv_clock); } static int kvm_guest_time_update(struct kvm_vcpu *v) From 93fb0b10e7121f3be86c4cdd564d7e1c0974deca Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 31 Jan 2025 17:38:23 -0800 Subject: [PATCH 07/11] KVM: x86: Set PVCLOCK_GUEST_STOPPED only for kvmclock, not for Xen PV clock Handle "guest stopped" propagation only for kvmclock, as the flag is set if and only if kvmclock is "active", i.e. can only be set for Xen PV clock if kvmclock *and* Xen PV clock are in-use by the guest, which creates very bizarre behavior for the guest. Simply restrict the flag to kvmclock, e.g. instead of trying to handle Xen PV clock, as propagation of PVCLOCK_GUEST_STOPPED was unintentionally added during a refactoring, and while Xen proper defines XEN_PVCLOCK_GUEST_STOPPED, there's no evidence that Xen guests actually support the flag. Check and clear pvclock_set_guest_stopped_request if and only if kvmclock is active to preserve the original behavior, i.e. keep the flag pending if kvmclock happens to be disabled when KVM processes the initial request. Fixes: aa096aa0a05f ("KVM: x86/xen: setup pvclock updates") Cc: Paul Durrant Cc: David Woodhouse Reviewed-by: Paul Durrant Link: https://lore.kernel.org/r/20250201013827.680235-8-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 906c60cc811a..8fd5922f22a6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3262,20 +3262,21 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) if (use_master_clock) vcpu->hv_clock.flags |= PVCLOCK_TSC_STABLE_BIT; - if (vcpu->pv_time.active -#ifdef CONFIG_KVM_XEN - || vcpu->xen.vcpu_info_cache.active - || vcpu->xen.vcpu_time_info_cache.active -#endif - ) { + if (vcpu->pv_time.active) { + /* + * GUEST_STOPPED is only supported by kvmclock, and KVM's + * historic behavior is to only process the request if kvmclock + * is active/enabled. + */ if (vcpu->pvclock_set_guest_stopped_request) { vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED; vcpu->pvclock_set_guest_stopped_request = false; } + kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0, false); + + vcpu->hv_clock.flags &= ~PVCLOCK_GUEST_STOPPED; } - if (vcpu->pv_time.active) - kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0, false); #ifdef CONFIG_KVM_XEN if (vcpu->xen.vcpu_info_cache.active) kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache, From 46aed4d4a7db828e0019d16c575e93cc318571db Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 31 Jan 2025 17:38:24 -0800 Subject: [PATCH 08/11] KVM: x86: Pass reference pvclock as a param to kvm_setup_guest_pvclock() Pass the reference pvclock structure that's used to setup each individual pvclock as a parameter to kvm_setup_guest_pvclock() as a preparatory step toward removing kvm_vcpu_arch.hv_clock. No functional change intended. Reviewed-by: Paul Durrant Link: https://lore.kernel.org/r/20250201013827.680235-9-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8fd5922f22a6..597abd5429af 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3116,17 +3116,17 @@ u64 get_kvmclock_ns(struct kvm *kvm) return data.clock; } -static void kvm_setup_guest_pvclock(struct kvm_vcpu *v, +static void kvm_setup_guest_pvclock(struct pvclock_vcpu_time_info *ref_hv_clock, + struct kvm_vcpu *vcpu, struct gfn_to_pfn_cache *gpc, unsigned int offset, bool force_tsc_unstable) { - struct kvm_vcpu_arch *vcpu = &v->arch; struct pvclock_vcpu_time_info *guest_hv_clock; struct pvclock_vcpu_time_info hv_clock; unsigned long flags; - memcpy(&hv_clock, &vcpu->hv_clock, sizeof(hv_clock)); + memcpy(&hv_clock, ref_hv_clock, sizeof(hv_clock)); read_lock_irqsave(&gpc->lock, flags); while (!kvm_gpc_check(gpc, offset + sizeof(*guest_hv_clock))) { @@ -3165,7 +3165,7 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v, kvm_gpc_mark_dirty_in_slot(gpc); read_unlock_irqrestore(&gpc->lock, flags); - trace_kvm_pvclock_update(v->vcpu_id, &hv_clock); + trace_kvm_pvclock_update(vcpu->vcpu_id, &hv_clock); } static int kvm_guest_time_update(struct kvm_vcpu *v) @@ -3272,18 +3272,18 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED; vcpu->pvclock_set_guest_stopped_request = false; } - kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0, false); + kvm_setup_guest_pvclock(&vcpu->hv_clock, v, &vcpu->pv_time, 0, false); vcpu->hv_clock.flags &= ~PVCLOCK_GUEST_STOPPED; } #ifdef CONFIG_KVM_XEN if (vcpu->xen.vcpu_info_cache.active) - kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache, + kvm_setup_guest_pvclock(&vcpu->hv_clock, v, &vcpu->xen.vcpu_info_cache, offsetof(struct compat_vcpu_info, time), xen_pvclock_tsc_unstable); if (vcpu->xen.vcpu_time_info_cache.active) - kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0, + kvm_setup_guest_pvclock(&vcpu->hv_clock, v, &vcpu->xen.vcpu_time_info_cache, 0, xen_pvclock_tsc_unstable); #endif kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock); From 39d61b46adfd784f601534a6d488697bf53f2633 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 31 Jan 2025 17:38:25 -0800 Subject: [PATCH 09/11] KVM: x86: Remove per-vCPU "cache" of its reference pvclock Remove the per-vCPU "cache" of the reference pvclock and instead cache only the TSC shift+multiplier. All other fields in pvclock are fully recomputed by kvm_guest_time_update(), i.e. aren't actually persisted. In addition to shaving a few bytes, explicitly tracking the TSC shift/mul fields makes it easier to see that those fields are tied to hw_tsc_khz (they exist to avoid having to do expensive math in the common case). And conversely, not tracking the other fields makes it easier to see that things like the version number are pulled from the guest's copy, not from KVM's reference. Reviewed-by: Paul Durrant Link: https://lore.kernel.org/r/20250201013827.680235-10-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 3 ++- arch/x86/kvm/x86.c | 27 +++++++++++++++------------ arch/x86/kvm/xen.c | 8 ++++---- 3 files changed, 21 insertions(+), 17 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b15cde0a9b5c..14b2d033925e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -908,7 +908,8 @@ struct kvm_vcpu_arch { int (*complete_userspace_io)(struct kvm_vcpu *vcpu); gpa_t time; - struct pvclock_vcpu_time_info hv_clock; + s8 pvclock_tsc_shift; + u32 pvclock_tsc_mul; unsigned int hw_tsc_khz; struct gfn_to_pfn_cache pv_time; /* set guest stopped flag in pvclock flags field */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 597abd5429af..bdc6f449f192 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3170,6 +3170,7 @@ static void kvm_setup_guest_pvclock(struct pvclock_vcpu_time_info *ref_hv_clock, static int kvm_guest_time_update(struct kvm_vcpu *v) { + struct pvclock_vcpu_time_info hv_clock = {}; unsigned long flags, tgt_tsc_khz; unsigned seq; struct kvm_vcpu_arch *vcpu = &v->arch; @@ -3247,20 +3248,22 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) { kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL, - &vcpu->hv_clock.tsc_shift, - &vcpu->hv_clock.tsc_to_system_mul); + &vcpu->pvclock_tsc_shift, + &vcpu->pvclock_tsc_mul); vcpu->hw_tsc_khz = tgt_tsc_khz; kvm_xen_update_tsc_info(v); } - vcpu->hv_clock.tsc_timestamp = tsc_timestamp; - vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; + hv_clock.tsc_shift = vcpu->pvclock_tsc_shift; + hv_clock.tsc_to_system_mul = vcpu->pvclock_tsc_mul; + hv_clock.tsc_timestamp = tsc_timestamp; + hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; vcpu->last_guest_tsc = tsc_timestamp; /* If the host uses TSC clocksource, then it is stable */ - vcpu->hv_clock.flags = 0; + hv_clock.flags = 0; if (use_master_clock) - vcpu->hv_clock.flags |= PVCLOCK_TSC_STABLE_BIT; + hv_clock.flags |= PVCLOCK_TSC_STABLE_BIT; if (vcpu->pv_time.active) { /* @@ -3269,24 +3272,24 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) * is active/enabled. */ if (vcpu->pvclock_set_guest_stopped_request) { - vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED; + hv_clock.flags |= PVCLOCK_GUEST_STOPPED; vcpu->pvclock_set_guest_stopped_request = false; } - kvm_setup_guest_pvclock(&vcpu->hv_clock, v, &vcpu->pv_time, 0, false); + kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->pv_time, 0, false); - vcpu->hv_clock.flags &= ~PVCLOCK_GUEST_STOPPED; + hv_clock.flags &= ~PVCLOCK_GUEST_STOPPED; } #ifdef CONFIG_KVM_XEN if (vcpu->xen.vcpu_info_cache.active) - kvm_setup_guest_pvclock(&vcpu->hv_clock, v, &vcpu->xen.vcpu_info_cache, + kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->xen.vcpu_info_cache, offsetof(struct compat_vcpu_info, time), xen_pvclock_tsc_unstable); if (vcpu->xen.vcpu_time_info_cache.active) - kvm_setup_guest_pvclock(&vcpu->hv_clock, v, &vcpu->xen.vcpu_time_info_cache, 0, + kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->xen.vcpu_time_info_cache, 0, xen_pvclock_tsc_unstable); #endif - kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock); + kvm_hv_setup_tsc_page(v->kvm, &hv_clock); return 0; } diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 300a79f1fae5..2801c7bcc2ef 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -176,8 +176,8 @@ static int xen_get_guest_pvclock(struct kvm_vcpu *vcpu, * Sanity check TSC shift+multiplier to verify the guest's view of time * is more or less consistent. */ - if (hv_clock->tsc_shift != vcpu->arch.hv_clock.tsc_shift || - hv_clock->tsc_to_system_mul != vcpu->arch.hv_clock.tsc_to_system_mul) + if (hv_clock->tsc_shift != vcpu->arch.pvclock_tsc_shift || + hv_clock->tsc_to_system_mul != vcpu->arch.pvclock_tsc_mul) return -EINVAL; return 0; @@ -2316,8 +2316,8 @@ void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu) entry = kvm_find_cpuid_entry_index(vcpu, function, 1); if (entry) { - entry->ecx = vcpu->arch.hv_clock.tsc_to_system_mul; - entry->edx = vcpu->arch.hv_clock.tsc_shift; + entry->ecx = vcpu->arch.pvclock_tsc_mul; + entry->edx = vcpu->arch.pvclock_tsc_shift; } entry = kvm_find_cpuid_entry_index(vcpu, function, 2); From 847d68abf10c7dc930551318ea41ec96144b8fd6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 31 Jan 2025 17:38:26 -0800 Subject: [PATCH 10/11] KVM: x86: Setup Hyper-V TSC page before Xen PV clocks (during clock update) When updating paravirtual clocks, setup the Hyper-V TSC page before Xen PV clocks. This will allow dropping xen_pvclock_tsc_unstable in favor of simply clearing PVCLOCK_TSC_STABLE_BIT in the reference flags. Reviewed-by: Vitaly Kuznetsov Reviewed-by: Paul Durrant Link: https://lore.kernel.org/r/20250201013827.680235-11-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bdc6f449f192..eefd68f8a6f8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3280,6 +3280,8 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) hv_clock.flags &= ~PVCLOCK_GUEST_STOPPED; } + kvm_hv_setup_tsc_page(v->kvm, &hv_clock); + #ifdef CONFIG_KVM_XEN if (vcpu->xen.vcpu_info_cache.active) kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->xen.vcpu_info_cache, @@ -3289,7 +3291,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->xen.vcpu_time_info_cache, 0, xen_pvclock_tsc_unstable); #endif - kvm_hv_setup_tsc_page(v->kvm, &hv_clock); return 0; } From 1b3c38050b5cc07f6873f244f845fb6c8549ce85 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 31 Jan 2025 17:38:27 -0800 Subject: [PATCH 11/11] KVM: x86: Override TSC_STABLE flag for Xen PV clocks in kvm_guest_time_update() When updating PV clocks, handle the Xen-specific UNSTABLE_TSC override in the main kvm_guest_time_update() by simply clearing PVCLOCK_TSC_STABLE_BIT in the flags of the reference pvclock structure. Expand the comment to (hopefully) make it obvious that Xen clocks need to be processed after all clocks that care about the TSC_STABLE flag. No functional change intended. Cc: Paul Durrant Cc: David Woodhouse Reviewed-by: Paul Durrant Link: https://lore.kernel.org/r/20250201013827.680235-12-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index eefd68f8a6f8..3e9bead6a65f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3119,8 +3119,7 @@ u64 get_kvmclock_ns(struct kvm *kvm) static void kvm_setup_guest_pvclock(struct pvclock_vcpu_time_info *ref_hv_clock, struct kvm_vcpu *vcpu, struct gfn_to_pfn_cache *gpc, - unsigned int offset, - bool force_tsc_unstable) + unsigned int offset) { struct pvclock_vcpu_time_info *guest_hv_clock; struct pvclock_vcpu_time_info hv_clock; @@ -3155,9 +3154,6 @@ static void kvm_setup_guest_pvclock(struct pvclock_vcpu_time_info *ref_hv_clock, memcpy(guest_hv_clock, &hv_clock, sizeof(*guest_hv_clock)); - if (force_tsc_unstable) - guest_hv_clock->flags &= ~PVCLOCK_TSC_STABLE_BIT; - smp_wmb(); guest_hv_clock->version = ++hv_clock.version; @@ -3178,16 +3174,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) s64 kernel_ns; u64 tsc_timestamp, host_tsc; bool use_master_clock; -#ifdef CONFIG_KVM_XEN - /* - * For Xen guests we may need to override PVCLOCK_TSC_STABLE_BIT as unless - * explicitly told to use TSC as its clocksource Xen will not set this bit. - * This default behaviour led to bugs in some guest kernels which cause - * problems if they observe PVCLOCK_TSC_STABLE_BIT in the pvclock flags. - */ - bool xen_pvclock_tsc_unstable = - ka->xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE; -#endif kernel_ns = 0; host_tsc = 0; @@ -3275,7 +3261,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) hv_clock.flags |= PVCLOCK_GUEST_STOPPED; vcpu->pvclock_set_guest_stopped_request = false; } - kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->pv_time, 0, false); + kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->pv_time, 0); hv_clock.flags &= ~PVCLOCK_GUEST_STOPPED; } @@ -3283,13 +3269,22 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) kvm_hv_setup_tsc_page(v->kvm, &hv_clock); #ifdef CONFIG_KVM_XEN + /* + * For Xen guests we may need to override PVCLOCK_TSC_STABLE_BIT as unless + * explicitly told to use TSC as its clocksource Xen will not set this bit. + * This default behaviour led to bugs in some guest kernels which cause + * problems if they observe PVCLOCK_TSC_STABLE_BIT in the pvclock flags. + * + * Note! Clear TSC_STABLE only for Xen clocks, i.e. the order matters! + */ + if (ka->xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE) + hv_clock.flags &= ~PVCLOCK_TSC_STABLE_BIT; + if (vcpu->xen.vcpu_info_cache.active) kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->xen.vcpu_info_cache, - offsetof(struct compat_vcpu_info, time), - xen_pvclock_tsc_unstable); + offsetof(struct compat_vcpu_info, time)); if (vcpu->xen.vcpu_time_info_cache.active) - kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->xen.vcpu_time_info_cache, 0, - xen_pvclock_tsc_unstable); + kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->xen.vcpu_time_info_cache, 0); #endif return 0; }