Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm fixes from Paolo Bonzini:
 "Fixes for kvm on x86:

   - new selftests

   - fixes for migration with HyperV re-enlightenment enabled

   - fix RCU/SRCU usage

   - fixes for local_irq_restore misuse false positive"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  documentation/kvm: additional explanations on KVM_SET_BOOT_CPU_ID
  x86/kvm: Fix broken irq restoration in kvm_wait
  KVM: X86: Fix missing local pCPU when executing wbinvd on all dirty pCPUs
  KVM: x86: Protect userspace MSR filter with SRCU, and set atomically-ish
  selftests: kvm: add set_boot_cpu_id test
  selftests: kvm: add _vm_ioctl
  selftests: kvm: add get_msr_index_features
  selftests: kvm: Add basic Hyper-V clocksources tests
  KVM: x86: hyper-v: Don't touch TSC page values when guest opted for re-enlightenment
  KVM: x86: hyper-v: Track Hyper-V TSC page status
  KVM: x86: hyper-v: Prevent using not-yet-updated TSC page by secondary CPUs
  KVM: x86: hyper-v: Limit guest to writing zero to HV_X64_MSR_TSC_EMULATION_STATUS
  KVM: x86/mmu: Store the address space ID in the TDP iterator
  KVM: x86/mmu: Factor out tdp_iter_return_to_root
  KVM: x86/mmu: Fix RCU usage when atomically zapping SPTEs
  KVM: x86/mmu: Fix RCU usage in handle_removed_tdp_mmu_page
Linus Torvalds committed Mar 19, 2021
2 parents 3149860 + 9ce3746 commit ecd8ee7
Showing 18 changed files with 807 additions and 120 deletions.
9 changes: 6 additions & 3 deletions Documentation/virt/kvm/api.rst
@@ -1495,7 +1495,8 @@ Fails if any VCPU has already been created.
 
 Define which vcpu is the Bootstrap Processor (BSP). Values are the same
 as the vcpu id in KVM_CREATE_VCPU. If this ioctl is not called, the default
-is vcpu 0.
+is vcpu 0. This ioctl has to be called before vcpu creation,
+otherwise it will return EBUSY error.
 
 
 4.42 KVM_GET_XSAVE
@@ -4806,8 +4807,10 @@ If an MSR access is not permitted through the filtering, it generates a
 allows user space to deflect and potentially handle various MSR accesses
 into user space.
 
-If a vCPU is in running state while this ioctl is invoked, the vCPU may
-experience inconsistent filtering behavior on MSR accesses.
+Note, invoking this ioctl while a vCPU is running is inherently racy. However,
+KVM does guarantee that vCPUs will see either the previous filter or the new
+filter, e.g. MSRs with identical settings in both the old and new filter will
+have deterministic behavior.
 
 4.127 KVM_XEN_HVM_SET_ATTR
 --------------------------
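
To make the new ordering rule and the filter-update semantics concrete, here is
a hedged userspace sketch of both ioctls. It follows the UAPI in <linux/kvm.h>
as of this merge; vm_fd, the boot vcpu id, and the filtered MSR index 0x1fc
(MSR_IA32_POWER_CTL) are purely illustrative:

	#include <linux/kvm.h>
	#include <sys/ioctl.h>

	/* Sketch: vm_fd is an open KVM VM fd with no vCPUs created yet. */
	static int configure_vm(int vm_fd)
	{
		/*
		 * KVM_SET_BOOT_CPU_ID must precede KVM_CREATE_VCPU,
		 * otherwise KVM fails the ioctl with EBUSY.
		 */
		if (ioctl(vm_fd, KVM_SET_BOOT_CPU_ID, 1ul) < 0)
			return -1;

		/* Deny-by-default filter; a set bitmap bit allows the access. */
		__u8 allow_bit = 0x01;
		struct kvm_msr_filter filter = {
			.flags = KVM_MSR_FILTER_DEFAULT_DENY,
		};

		filter.ranges[0] = (struct kvm_msr_filter_range) {
			.flags = KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE,
			.base = 0x1fc,
			.nmsrs = 1,
			.bitmap = &allow_bit,
		};

		/*
		 * Per the text above, this may race with running vCPUs, but
		 * each access sees either the old or the new filter whole.
		 */
		return ioctl(vm_fd, KVM_X86_SET_MSR_FILTER, &filter);
	}
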
34 changes: 26 additions & 8 deletions arch/x86/include/asm/kvm_host.h
@@ -884,12 +884,29 @@ struct kvm_hv_syndbg {
 	u64 options;
 };
 
+/* Current state of Hyper-V TSC page clocksource */
+enum hv_tsc_page_status {
+	/* TSC page was not set up or disabled */
+	HV_TSC_PAGE_UNSET = 0,
+	/* TSC page MSR was written by the guest, update pending */
+	HV_TSC_PAGE_GUEST_CHANGED,
+	/* TSC page MSR was written by KVM userspace, update pending */
+	HV_TSC_PAGE_HOST_CHANGED,
+	/* TSC page was properly set up and is currently active */
+	HV_TSC_PAGE_SET,
+	/* TSC page is currently being updated and therefore is inactive */
+	HV_TSC_PAGE_UPDATING,
+	/* TSC page was set up with an inaccessible GPA */
+	HV_TSC_PAGE_BROKEN,
+};
+
 /* Hyper-V emulation context */
 struct kvm_hv {
 	struct mutex hv_lock;
 	u64 hv_guest_os_id;
 	u64 hv_hypercall;
 	u64 hv_tsc_page;
+	enum hv_tsc_page_status hv_tsc_page_status;
 
 	/* Hyper-v based guest crash (NT kernel bugcheck) parameters */
 	u64 hv_crash_param[HV_X64_MSR_CRASH_PARAMS];
@@ -931,6 +948,12 @@ enum kvm_irqchip_mode {
 	KVM_IRQCHIP_SPLIT,        /* created with KVM_CAP_SPLIT_IRQCHIP */
 };
 
+struct kvm_x86_msr_filter {
+	u8 count;
+	bool default_allow:1;
+	struct msr_bitmap_range ranges[16];
+};
+
 #define APICV_INHIBIT_REASON_DISABLE    0
 #define APICV_INHIBIT_REASON_HYPERV     1
 #define APICV_INHIBIT_REASON_NESTED     2
@@ -1025,16 +1048,11 @@ struct kvm_arch {
 	bool guest_can_read_msr_platform_info;
 	bool exception_payload_enabled;
 
-	bool bus_lock_detection_enabled;
-
 	/* Deflect RDMSR and WRMSR to user space when they trigger a #GP */
 	u32 user_space_msr_mask;
 
-	struct {
-		u8 count;
-		bool default_allow:1;
-		struct msr_bitmap_range ranges[16];
-	} msr_filter;
+	bool bus_lock_detection_enabled;
+	struct kvm_x86_msr_filter __rcu *msr_filter;
 
 	struct kvm_pmu_event_filter __rcu *pmu_event_filter;
 	struct task_struct *nx_lpage_recovery_thread;
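
The move from an embedded struct to an __rcu pointer is what makes the
"atomically-ish" filter update possible: writers build a complete
struct kvm_x86_msr_filter off to the side and publish it with a single pointer
swap. A minimal sketch of the publish side, with an assumed helper name (the
real logic lives in kvm_vm_ioctl_set_msr_filter() in x86.c):

	/*
	 * Sketch, not verbatim kernel code: publish a fully built filter,
	 * then wait out all SRCU readers (vCPUs) before freeing the old one.
	 */
	static void publish_msr_filter(struct kvm *kvm,
				       struct kvm_x86_msr_filter *new_filter)
	{
		struct kvm_x86_msr_filter *old_filter;

		mutex_lock(&kvm->lock);
		old_filter = rcu_replace_pointer(kvm->arch.msr_filter,
						 new_filter,
						 mutex_is_locked(&kvm->lock));
		mutex_unlock(&kvm->lock);

		/* Readers use srcu_dereference() under kvm->srcu. */
		synchronize_srcu(&kvm->srcu);
		kfree(old_filter);
	}
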
23 changes: 10 additions & 13 deletions arch/x86/kernel/kvm.c
@@ -836,28 +836,25 @@ static void kvm_kick_cpu(int cpu)
 
 static void kvm_wait(u8 *ptr, u8 val)
 {
-	unsigned long flags;
-
 	if (in_nmi())
 		return;
 
-	local_irq_save(flags);
-
-	if (READ_ONCE(*ptr) != val)
-		goto out;
-
 	/*
 	 * halt until it's our turn and kicked. Note that we do safe halt
 	 * for irq enabled case to avoid hang when lock info is overwritten
 	 * in irq spinlock slowpath and no spurious interrupt occur to save us.
 	 */
-	if (arch_irqs_disabled_flags(flags))
-		halt();
-	else
-		safe_halt();
+	if (irqs_disabled()) {
+		if (READ_ONCE(*ptr) == val)
+			halt();
+	} else {
+		local_irq_disable();
 
-out:
-	local_irq_restore(flags);
+		if (READ_ONCE(*ptr) == val)
+			safe_halt();
+
+		local_irq_enable();
+	}
 }
 
 #ifdef CONFIG_X86_32
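
For context, the breakage this rewrite cures: a recent debug check ("lockdep:
report broken irq restoration") warns whenever raw_local_irq_restore() runs
with interrupts enabled. safe_halt() returns with interrupts on, so the old
save/halt/restore sequence always ended in exactly that state. Schematically
(for illustration, not compilable as-is):

	local_irq_save(flags);		/* typically entered with IRQs on */
	...
	safe_halt();			/* sti; hlt -- returns with IRQs enabled */
	...
	local_irq_restore(flags);	/* restore with IRQs already on -> splat */

The new code never saves flags at all: it branches on irqs_disabled() and pairs
local_irq_disable() with an explicit local_irq_enable(), leaving no restore to
trip the check.
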
91 changes: 81 additions & 10 deletions arch/x86/kvm/hyperv.c
@@ -520,10 +520,10 @@ static u64 get_time_ref_counter(struct kvm *kvm)
 	u64 tsc;
 
 	/*
-	 * The guest has not set up the TSC page or the clock isn't
-	 * stable, fall back to get_kvmclock_ns.
+	 * Fall back to get_kvmclock_ns() when TSC page hasn't been set up,
+	 * is broken, disabled or being updated.
 	 */
-	if (!hv->tsc_ref.tsc_sequence)
+	if (hv->hv_tsc_page_status != HV_TSC_PAGE_SET)
 		return div_u64(get_kvmclock_ns(kvm), 100);
 
 	vcpu = kvm_get_vcpu(kvm, 0);
@@ -1077,6 +1077,21 @@ static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock,
 	return true;
 }
 
+/*
+ * Don't touch TSC page values if the guest has opted for TSC emulation after
+ * migration. KVM doesn't fully support reenlightenment notifications and TSC
+ * access emulation and Hyper-V is known to expect the values in TSC page to
+ * stay constant before TSC access emulation is disabled from guest side
+ * (HV_X64_MSR_TSC_EMULATION_STATUS). KVM userspace is expected to preserve TSC
+ * frequency and guest visible TSC value across migration (and prevent it when
+ * TSC scaling is unsupported).
+ */
+static inline bool tsc_page_update_unsafe(struct kvm_hv *hv)
+{
+	return (hv->hv_tsc_page_status != HV_TSC_PAGE_GUEST_CHANGED) &&
+		hv->hv_tsc_emulation_control;
+}
+
 void kvm_hv_setup_tsc_page(struct kvm *kvm,
 			   struct pvclock_vcpu_time_info *hv_clock)
 {
@@ -1087,7 +1102,8 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm,
 	BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence));
 	BUILD_BUG_ON(offsetof(struct ms_hyperv_tsc_page, tsc_sequence) != 0);
 
-	if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
+	if (hv->hv_tsc_page_status == HV_TSC_PAGE_BROKEN ||
+	    hv->hv_tsc_page_status == HV_TSC_PAGE_UNSET)
 		return;
 
 	mutex_lock(&hv->hv_lock);
@@ -1101,7 +1117,15 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm,
 	 */
 	if (unlikely(kvm_read_guest(kvm, gfn_to_gpa(gfn),
 				    &tsc_seq, sizeof(tsc_seq))))
-		goto out_unlock;
+		goto out_err;
+
+	if (tsc_seq && tsc_page_update_unsafe(hv)) {
+		if (kvm_read_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref)))
+			goto out_err;
+
+		hv->hv_tsc_page_status = HV_TSC_PAGE_SET;
+		goto out_unlock;
+	}
 
 	/*
 	 * While we're computing and writing the parameters, force the
@@ -1110,15 +1134,15 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm,
 	hv->tsc_ref.tsc_sequence = 0;
 	if (kvm_write_guest(kvm, gfn_to_gpa(gfn),
 			    &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)))
-		goto out_unlock;
+		goto out_err;
 
 	if (!compute_tsc_page_parameters(hv_clock, &hv->tsc_ref))
-		goto out_unlock;
+		goto out_err;
 
 	/* Ensure sequence is zero before writing the rest of the struct. */
 	smp_wmb();
 	if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref)))
-		goto out_unlock;
+		goto out_err;
 
 	/*
 	 * Now switch to the TSC page mechanism by writing the sequence.
@@ -1131,8 +1155,45 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm,
 	smp_wmb();
 
 	hv->tsc_ref.tsc_sequence = tsc_seq;
-	kvm_write_guest(kvm, gfn_to_gpa(gfn),
-			&hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence));
+	if (kvm_write_guest(kvm, gfn_to_gpa(gfn),
+			    &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)))
+		goto out_err;
+
+	hv->hv_tsc_page_status = HV_TSC_PAGE_SET;
+	goto out_unlock;
+
+out_err:
+	hv->hv_tsc_page_status = HV_TSC_PAGE_BROKEN;
 out_unlock:
 	mutex_unlock(&hv->hv_lock);
 }
+
+void kvm_hv_invalidate_tsc_page(struct kvm *kvm)
+{
+	struct kvm_hv *hv = to_kvm_hv(kvm);
+	u64 gfn;
+
+	if (hv->hv_tsc_page_status == HV_TSC_PAGE_BROKEN ||
+	    hv->hv_tsc_page_status == HV_TSC_PAGE_UNSET ||
+	    tsc_page_update_unsafe(hv))
+		return;
+
+	mutex_lock(&hv->hv_lock);
+
+	if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
+		goto out_unlock;
+
+	/* Preserve HV_TSC_PAGE_GUEST_CHANGED/HV_TSC_PAGE_HOST_CHANGED states */
+	if (hv->hv_tsc_page_status == HV_TSC_PAGE_SET)
+		hv->hv_tsc_page_status = HV_TSC_PAGE_UPDATING;
+
+	gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
+
+	hv->tsc_ref.tsc_sequence = 0;
+	if (kvm_write_guest(kvm, gfn_to_gpa(gfn),
+			    &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)))
+		hv->hv_tsc_page_status = HV_TSC_PAGE_BROKEN;
+
+out_unlock:
+	mutex_unlock(&hv->hv_lock);
+}
@@ -1193,8 +1254,15 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
 	}
 	case HV_X64_MSR_REFERENCE_TSC:
 		hv->hv_tsc_page = data;
-		if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)
+		if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE) {
+			if (!host)
+				hv->hv_tsc_page_status = HV_TSC_PAGE_GUEST_CHANGED;
+			else
+				hv->hv_tsc_page_status = HV_TSC_PAGE_HOST_CHANGED;
 			kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
+		} else {
+			hv->hv_tsc_page_status = HV_TSC_PAGE_UNSET;
+		}
 		break;
 	case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
 		return kvm_hv_msr_set_crash_data(kvm,
@@ -1229,6 +1297,9 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
 		hv->hv_tsc_emulation_control = data;
 		break;
 	case HV_X64_MSR_TSC_EMULATION_STATUS:
+		if (data && !host)
+			return 1;
+
 		hv->hv_tsc_emulation_status = data;
 		break;
 	case HV_X64_MSR_TIME_REF_COUNT:
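
For context on why kvm_hv_setup_tsc_page() and kvm_hv_invalidate_tsc_page()
zero tsc_sequence around updates: Hyper-V guests read the TSC page with a
seqlock-style loop and treat a sequence of 0 as "page invalid", falling back
to the slower HV_X64_MSR_TIME_REF_COUNT MSR. A sketch of the guest-side
reader, modeled on the Linux Hyper-V clocksource (the function name is
assumed):

	/* Reference time advances in 100ns units. */
	static u64 read_hv_ref_time(const struct ms_hyperv_tsc_page *pg)
	{
		u32 seq;
		u64 scale, offset, tsc;

		do {
			seq = READ_ONCE(pg->tsc_sequence);
			if (!seq)
				return U64_MAX;	/* use the MSR clock instead */
			scale  = READ_ONCE(pg->tsc_scale);
			offset = READ_ONCE(pg->tsc_offset);
			tsc = rdtsc();
		} while (READ_ONCE(pg->tsc_sequence) != seq);

		return mul_u64_u64_shr(tsc, scale, 64) + offset;
	}

This is also why HV_TSC_PAGE_UPDATING counts as inactive in
get_time_ref_counter(): while the sequence is 0, guest and host alike must
take the fallback path.
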
1 change: 1 addition & 0 deletions arch/x86/kvm/hyperv.h
@@ -133,6 +133,7 @@ void kvm_hv_process_stimers(struct kvm_vcpu *vcpu);
 
 void kvm_hv_setup_tsc_page(struct kvm *kvm,
 			   struct pvclock_vcpu_time_info *hv_clock);
+void kvm_hv_invalidate_tsc_page(struct kvm *kvm);
 
 void kvm_hv_init_vm(struct kvm *kvm);
 void kvm_hv_destroy_vm(struct kvm *kvm);
5 changes: 5 additions & 0 deletions arch/x86/kvm/mmu/mmu_internal.h
@@ -78,6 +78,11 @@ static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep)
 	return to_shadow_page(__pa(sptep));
 }
 
+static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
+{
+	return sp->role.smm ? 1 : 0;
+}
+
 static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu)
 {
 	/*
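
For illustration, the helper's role: the TDP iterator caches this ID so MMU
code can pick the correct memslot set without walking back to the root page
table. The intended use looks roughly like this (a sketch, not a line from
this diff):

	/* as_id 1 selects the SMM address space, as_id 0 the regular one. */
	struct kvm_memslots *slots = __kvm_memslots(kvm, kvm_mmu_page_as_id(sp));
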
30 changes: 18 additions & 12 deletions arch/x86/kvm/mmu/tdp_iter.c
@@ -20,6 +20,21 @@ static gfn_t round_gfn_for_level(gfn_t gfn, int level)
 	return gfn & -KVM_PAGES_PER_HPAGE(level);
 }
 
+/*
+ * Return the TDP iterator to the root PT and allow it to continue its
+ * traversal over the paging structure from there.
+ */
+void tdp_iter_restart(struct tdp_iter *iter)
+{
+	iter->yielded_gfn = iter->next_last_level_gfn;
+	iter->level = iter->root_level;
+
+	iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level);
+	tdp_iter_refresh_sptep(iter);
+
+	iter->valid = true;
+}
+
 /*
  * Sets a TDP iterator to walk a pre-order traversal of the paging structure
  * rooted at root_pt, starting with the walk to translate next_last_level_gfn.
@@ -31,16 +46,12 @@ void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
 	WARN_ON(root_level > PT64_ROOT_MAX_LEVEL);
 
 	iter->next_last_level_gfn = next_last_level_gfn;
-	iter->yielded_gfn = iter->next_last_level_gfn;
 	iter->root_level = root_level;
 	iter->min_level = min_level;
-	iter->level = root_level;
-	iter->pt_path[iter->level - 1] = (tdp_ptep_t)root_pt;
-
-	iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level);
-	tdp_iter_refresh_sptep(iter);
+	iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root_pt;
+	iter->as_id = kvm_mmu_page_as_id(sptep_to_sp(root_pt));
 
-	iter->valid = true;
+	tdp_iter_restart(iter);
 }
 
 /*
@@ -159,8 +170,3 @@ void tdp_iter_next(struct tdp_iter *iter)
 	iter->valid = false;
 }
 
-tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter)
-{
-	return iter->pt_path[iter->root_level - 1];
-}
-

4 changes: 3 additions & 1 deletion arch/x86/kvm/mmu/tdp_iter.h
@@ -36,6 +36,8 @@ struct tdp_iter {
 	int min_level;
 	/* The iterator's current level within the paging structure */
 	int level;
+	/* The address space ID, i.e. SMM vs. regular. */
+	int as_id;
 	/* A snapshot of the value at sptep */
 	u64 old_spte;
 	/*
@@ -62,6 +64,6 @@ tdp_ptep_t spte_to_child_pt(u64 pte, int level);
 void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
 		    int min_level, gfn_t next_last_level_gfn);
 void tdp_iter_next(struct tdp_iter *iter);
-tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter);
+void tdp_iter_restart(struct tdp_iter *iter);
 
 #endif /* __KVM_X86_MMU_TDP_ITER_H */