Skip to content

Commit

Permalink
Merge tag 'kvm-x86-pvclock-6.15' of https://github.com/kvm-x86/linux
Browse files Browse the repository at this point in the history
…into HEAD

KVM PV clock changes for 6.15:

 - Don't take kvm->lock when iterating over vCPUs in the suspend notifier to
   fix a largely theoretical deadlock.

 - Use the vCPU's actual Xen PV clock information when starting the Xen timer,
   as the cached state in arch.hv_clock can be stale/bogus.

 - Fix a bug where KVM could bleed PVCLOCK_GUEST_STOPPED across different
   PV clocks.

 - Restrict PVCLOCK_GUEST_STOPPED to kvmclock, as KVM's suspend notifier only
   accounts for kvmclock, and there's no evidence that the flag is actually
   supported by Xen guests.

 - Clean up the per-vCPU "cache" of its reference pvclock, and instead only
   track the vCPU's TSC scaling (multiplier+shift) metadata (which is moderately
   expensive to compute, and rarely changes for modern setups).
  • Loading branch information
Paolo Bonzini committed Mar 19, 2025
2 parents 9b093f5 + 1b3c380 commit fcce7c1
Show file tree
Hide file tree
Showing 3 changed files with 121 additions and 66 deletions.
3 changes: 2 additions & 1 deletion arch/x86/include/asm/kvm_host.h
Original file line number Diff line number Diff line change
Expand Up @@ -911,7 +911,8 @@ struct kvm_vcpu_arch {
int (*complete_userspace_io)(struct kvm_vcpu *vcpu);

gpa_t time;
struct pvclock_vcpu_time_info hv_clock;
s8 pvclock_tsc_shift;
u32 pvclock_tsc_mul;
unsigned int hw_tsc_khz;
struct gfn_to_pfn_cache pv_time;
/* set guest stopped flag in pvclock flags field */
Expand Down
115 changes: 57 additions & 58 deletions arch/x86/kvm/x86.c
Original file line number Diff line number Diff line change
Expand Up @@ -3121,15 +3121,17 @@ u64 get_kvmclock_ns(struct kvm *kvm)
return data.clock;
}

static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
static void kvm_setup_guest_pvclock(struct pvclock_vcpu_time_info *ref_hv_clock,
struct kvm_vcpu *vcpu,
struct gfn_to_pfn_cache *gpc,
unsigned int offset,
bool force_tsc_unstable)
unsigned int offset)
{
struct kvm_vcpu_arch *vcpu = &v->arch;
struct pvclock_vcpu_time_info *guest_hv_clock;
struct pvclock_vcpu_time_info hv_clock;
unsigned long flags;

memcpy(&hv_clock, ref_hv_clock, sizeof(hv_clock));

read_lock_irqsave(&gpc->lock, flags);
while (!kvm_gpc_check(gpc, offset + sizeof(*guest_hv_clock))) {
read_unlock_irqrestore(&gpc->lock, flags);
Expand All @@ -3149,52 +3151,34 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
* it is consistent.
*/

guest_hv_clock->version = vcpu->hv_clock.version = (guest_hv_clock->version + 1) | 1;
guest_hv_clock->version = hv_clock.version = (guest_hv_clock->version + 1) | 1;
smp_wmb();

/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
vcpu->hv_clock.flags |= (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED);

if (vcpu->pvclock_set_guest_stopped_request) {
vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
vcpu->pvclock_set_guest_stopped_request = false;
}

memcpy(guest_hv_clock, &vcpu->hv_clock, sizeof(*guest_hv_clock));
hv_clock.flags |= (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED);

if (force_tsc_unstable)
guest_hv_clock->flags &= ~PVCLOCK_TSC_STABLE_BIT;
memcpy(guest_hv_clock, &hv_clock, sizeof(*guest_hv_clock));

smp_wmb();

guest_hv_clock->version = ++vcpu->hv_clock.version;
guest_hv_clock->version = ++hv_clock.version;

kvm_gpc_mark_dirty_in_slot(gpc);
read_unlock_irqrestore(&gpc->lock, flags);

trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
trace_kvm_pvclock_update(vcpu->vcpu_id, &hv_clock);
}

static int kvm_guest_time_update(struct kvm_vcpu *v)
{
struct pvclock_vcpu_time_info hv_clock = {};
unsigned long flags, tgt_tsc_khz;
unsigned seq;
struct kvm_vcpu_arch *vcpu = &v->arch;
struct kvm_arch *ka = &v->kvm->arch;
s64 kernel_ns;
u64 tsc_timestamp, host_tsc;
u8 pvclock_flags;
bool use_master_clock;
#ifdef CONFIG_KVM_XEN
/*
* For Xen guests we may need to override PVCLOCK_TSC_STABLE_BIT as unless
* explicitly told to use TSC as its clocksource Xen will not set this bit.
* This default behaviour led to bugs in some guest kernels which cause
* problems if they observe PVCLOCK_TSC_STABLE_BIT in the pvclock flags.
*/
bool xen_pvclock_tsc_unstable =
ka->xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
#endif

kernel_ns = 0;
host_tsc = 0;
Expand Down Expand Up @@ -3255,35 +3239,58 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)

if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
&vcpu->hv_clock.tsc_shift,
&vcpu->hv_clock.tsc_to_system_mul);
&vcpu->pvclock_tsc_shift,
&vcpu->pvclock_tsc_mul);
vcpu->hw_tsc_khz = tgt_tsc_khz;
kvm_xen_update_tsc_info(v);
}

vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
hv_clock.tsc_shift = vcpu->pvclock_tsc_shift;
hv_clock.tsc_to_system_mul = vcpu->pvclock_tsc_mul;
hv_clock.tsc_timestamp = tsc_timestamp;
hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
vcpu->last_guest_tsc = tsc_timestamp;

/* If the host uses TSC clocksource, then it is stable */
pvclock_flags = 0;
hv_clock.flags = 0;
if (use_master_clock)
pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
hv_clock.flags |= PVCLOCK_TSC_STABLE_BIT;

if (vcpu->pv_time.active) {
/*
* GUEST_STOPPED is only supported by kvmclock, and KVM's
* historic behavior is to only process the request if kvmclock
* is active/enabled.
*/
if (vcpu->pvclock_set_guest_stopped_request) {
hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
vcpu->pvclock_set_guest_stopped_request = false;
}
kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->pv_time, 0);

vcpu->hv_clock.flags = pvclock_flags;
hv_clock.flags &= ~PVCLOCK_GUEST_STOPPED;
}

kvm_hv_setup_tsc_page(v->kvm, &hv_clock);

if (vcpu->pv_time.active)
kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0, false);
#ifdef CONFIG_KVM_XEN
/*
* For Xen guests we may need to override PVCLOCK_TSC_STABLE_BIT as unless
* explicitly told to use TSC as its clocksource Xen will not set this bit.
* This default behaviour led to bugs in some guest kernels which cause
* problems if they observe PVCLOCK_TSC_STABLE_BIT in the pvclock flags.
*
* Note! Clear TSC_STABLE only for Xen clocks, i.e. the order matters!
*/
if (ka->xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE)
hv_clock.flags &= ~PVCLOCK_TSC_STABLE_BIT;

if (vcpu->xen.vcpu_info_cache.active)
kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache,
offsetof(struct compat_vcpu_info, time),
xen_pvclock_tsc_unstable);
kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->xen.vcpu_info_cache,
offsetof(struct compat_vcpu_info, time));
if (vcpu->xen.vcpu_time_info_cache.active)
kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0,
xen_pvclock_tsc_unstable);
kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->xen.vcpu_time_info_cache, 0);
#endif
kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
return 0;
}

Expand Down Expand Up @@ -6910,23 +6917,15 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm)
{
struct kvm_vcpu *vcpu;
unsigned long i;
int ret = 0;

mutex_lock(&kvm->lock);
kvm_for_each_vcpu(i, vcpu, kvm) {
if (!vcpu->arch.pv_time.active)
continue;

ret = kvm_set_guest_paused(vcpu);
if (ret) {
kvm_err("Failed to pause guest VCPU%d: %d\n",
vcpu->vcpu_id, ret);
break;
}
}
mutex_unlock(&kvm->lock);
/*
* Ignore the return, marking the guest paused only "fails" if the vCPU
* isn't using kvmclock; continuing on is correct and desirable.
*/
kvm_for_each_vcpu(i, vcpu, kvm)
(void)kvm_set_guest_paused(vcpu);

return ret ? NOTIFY_BAD : NOTIFY_DONE;
return NOTIFY_DONE;
}

int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state)
Expand Down
69 changes: 62 additions & 7 deletions arch/x86/kvm/xen.c
Original file line number Diff line number Diff line change
Expand Up @@ -150,11 +150,46 @@ static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer)
return HRTIMER_NORESTART;
}

/*
 * Read the guest's copy of its Xen PV clock from the given pfn cache.
 *
 * Returns 0 on success with *hv_clock filled from guest memory, a negative
 * errno if the cache mapping can't be refreshed, or -EINVAL if the guest's
 * copy was built with different TSC scaling metadata than KVM's current
 * values, i.e. if the guest's copy appears to be stale.
 */
static int xen_get_guest_pvclock(struct kvm_vcpu *vcpu,
struct pvclock_vcpu_time_info *hv_clock,
struct gfn_to_pfn_cache *gpc,
unsigned int offset)
{
unsigned long flags;
int r;

read_lock_irqsave(&gpc->lock, flags);
while (!kvm_gpc_check(gpc, offset + sizeof(*hv_clock))) {
/* Refresh with the lock dropped; bail if the refresh fails. */
read_unlock_irqrestore(&gpc->lock, flags);

r = kvm_gpc_refresh(gpc, offset + sizeof(*hv_clock));
if (r)
return r;

/* Re-acquire and re-check; the cache may have been invalidated again. */
read_lock_irqsave(&gpc->lock, flags);
}

/* Snapshot the guest's pvclock while the mapping is known-valid. */
memcpy(hv_clock, gpc->khva + offset, sizeof(*hv_clock));
read_unlock_irqrestore(&gpc->lock, flags);

/*
 * Sanity check TSC shift+multiplier to verify the guest's view of time
 * is more or less consistent.
 */
if (hv_clock->tsc_shift != vcpu->arch.pvclock_tsc_shift ||
hv_clock->tsc_to_system_mul != vcpu->arch.pvclock_tsc_mul)
return -EINVAL;

return 0;
}

static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs,
bool linux_wa)
{
struct kvm_vcpu_xen *xen = &vcpu->arch.xen;
int64_t kernel_now, delta;
uint64_t guest_now;
int r = -EOPNOTSUPP;

/*
* The guest provides the requested timeout in absolute nanoseconds
Expand All @@ -173,10 +208,29 @@ static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs,
* the absolute CLOCK_MONOTONIC time at which the timer should
* fire.
*/
if (vcpu->arch.hv_clock.version && vcpu->kvm->arch.use_master_clock &&
static_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
do {
struct pvclock_vcpu_time_info hv_clock;
uint64_t host_tsc, guest_tsc;

if (!static_cpu_has(X86_FEATURE_CONSTANT_TSC) ||
!vcpu->kvm->arch.use_master_clock)
break;

/*
* If both Xen PV clocks are active, arbitrarily try to use the
* compat clock first, but also try to use the non-compat clock
* if the compat clock is unusable. The two PV clocks hold the
* same information, but it's possible one (or both) is stale
* and/or currently unreachable.
*/
if (xen->vcpu_info_cache.active)
r = xen_get_guest_pvclock(vcpu, &hv_clock, &xen->vcpu_info_cache,
offsetof(struct compat_vcpu_info, time));
if (r && xen->vcpu_time_info_cache.active)
r = xen_get_guest_pvclock(vcpu, &hv_clock, &xen->vcpu_time_info_cache, 0);
if (r)
break;

if (!IS_ENABLED(CONFIG_64BIT) ||
!kvm_get_monotonic_and_clockread(&kernel_now, &host_tsc)) {
/*
Expand All @@ -197,9 +251,10 @@ static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs,

/* Calculate the guest kvmclock as the guest would do it. */
guest_tsc = kvm_read_l1_tsc(vcpu, host_tsc);
guest_now = __pvclock_read_cycles(&vcpu->arch.hv_clock,
guest_tsc);
} else {
guest_now = __pvclock_read_cycles(&hv_clock, guest_tsc);
} while (0);

if (r) {
/*
* Without CONSTANT_TSC, get_kvmclock_ns() is the only option.
*
Expand Down Expand Up @@ -2261,8 +2316,8 @@ void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu)

entry = kvm_find_cpuid_entry_index(vcpu, function, 1);
if (entry) {
entry->ecx = vcpu->arch.hv_clock.tsc_to_system_mul;
entry->edx = vcpu->arch.hv_clock.tsc_shift;
entry->ecx = vcpu->arch.pvclock_tsc_mul;
entry->edx = vcpu->arch.pvclock_tsc_shift;
}

entry = kvm_find_cpuid_entry_index(vcpu, function, 2);
Expand Down

0 comments on commit fcce7c1

Please sign in to comment.