From 7adacf5eb2d2048045d9fd8fdab861fd9e7e2e96 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 4 Dec 2019 15:50:27 +0100 Subject: [PATCH 001/204] KVM: x86: use CPUID to locate host page table reserved bits The comment in kvm_get_shadow_phys_bits refers to MKTME, but the same is actually true of SME and SEV. Just use CPUID[0x8000_0008].EAX[7:0] unconditionally if available; it is simplest and works even if memory is not encrypted. Cc: stable@vger.kernel.org Reported-by: Tom Lendacky Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 6f92b40d798ca..1e4ee4f8de5f0 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -538,16 +538,20 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); static u8 kvm_get_shadow_phys_bits(void) { /* - * boot_cpu_data.x86_phys_bits is reduced when MKTME is detected - * in CPU detection code, but MKTME treats those reduced bits as - * 'keyID' thus they are not reserved bits. Therefore for MKTME - * we should still return physical address bits reported by CPUID. + * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected + * in CPU detection code, but the processor treats those reduced bits as + * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at + * the physical address bits reported by CPUID. */ - if (!boot_cpu_has(X86_FEATURE_TME) || - WARN_ON_ONCE(boot_cpu_data.extended_cpuid_level < 0x80000008)) - return boot_cpu_data.x86_phys_bits; + if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008)) + return cpuid_eax(0x80000008) & 0xff; - return cpuid_eax(0x80000008) & 0xff; + /* + * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with + * custom CPUID. Proceed with whatever the kernel found since these features + * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008). + */ + return boot_cpu_data.x86_phys_bits; } static void kvm_mmu_reset_all_pte_masks(void) From 4fb7b452ce7b1490e789bf6fabd4f397cf57a26f Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 5 Dec 2019 10:24:38 +0800 Subject: [PATCH 002/204] KVM: vmx: remove unreachable statement in vmx_get_msr_feature() There is no way to reach the final statement; remove it. Signed-off-by: Miaohe Lin Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e3394c839dea6..5fb7a1695a24a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1773,8 +1773,6 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr) default: return 1; } - - return 0; } /* From d29c03a58705417561a5d2bacec4194ef6485ece Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 5 Dec 2019 11:05:05 +0800 Subject: [PATCH 003/204] KVM: get rid of var page in kvm_set_pfn_dirty() Get rid of the unnecessary local variable page in kvm_set_pfn_dirty(), making the code style consistent with kvm_set_pfn_accessed().
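For reference, the sibling helper being matched here looks like this in virt/kvm/kvm_main.c at this point in the series (quoted for illustration; the diff below is the authoritative change):

	void kvm_set_pfn_accessed(kvm_pfn_t pfn)
	{
		if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
			mark_page_accessed(pfn_to_page(pfn));
	}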
Signed-off-by: Miaohe Lin Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 00268290dcbd8..3aa21bec028d6 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1931,11 +1931,8 @@ EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); void kvm_set_pfn_dirty(kvm_pfn_t pfn) { - if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) { - struct page *page = pfn_to_page(pfn); - - SetPageDirty(page); - } + if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) + SetPageDirty(pfn_to_page(pfn)); } EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); From fe3c2b4c228443b505f2d8981c4871b96cfec6d6 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 5 Dec 2019 11:40:16 +0800 Subject: [PATCH 004/204] KVM: explicitly set rmap_head->val to 0 in pte_list_desc_remove_entry() When we reach here, we have desc->sptes[j] = NULL with j = 0. So we can replace desc->sptes[0] with 0 to make it more clear. Signed-off-by: Miaohe Lin Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 1e4ee4f8de5f0..35d37099bec66 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1414,7 +1414,7 @@ pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head, if (j != 0) return; if (!prev_desc && !desc->more) - rmap_head->val = (unsigned long)desc->sptes[0]; + rmap_head->val = 0; else if (prev_desc) prev_desc->more = desc->more; From b4b2963616bbd91ebb33148522552e1135de56ae Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 4 Dec 2019 20:07:16 +0100 Subject: [PATCH 005/204] KVM: X86: Fix kvm_bitmap_or_dest_vcpus() to use irq shorthand The 3rd parameter of kvm_apic_match_dest() is the irq shorthand, rather than the irq delivery mode. Fixes: 7ee30bc132c6 ("KVM: x86: deliver KVM IOAPIC scan request to target vCPUs") Reviewed-by: Vitaly Kuznetsov Signed-off-by: Peter Xu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index cf9177b4a07f9..1eabe58bb6d5e 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1151,7 +1151,7 @@ void kvm_bitmap_or_dest_vcpus(struct kvm *kvm, struct kvm_lapic_irq *irq, if (!kvm_apic_present(vcpu)) continue; if (!kvm_apic_match_dest(vcpu, NULL, - irq->delivery_mode, + irq->shorthand, irq->dest_id, irq->dest_mode)) continue; From 59508b303e4e35de9dd708ec87b1e89b1f3c1616 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 4 Dec 2019 20:07:17 +0100 Subject: [PATCH 006/204] KVM: X86: Move irrelevant declarations out of ioapic.h kvm_apic_match_dest() is declared in both ioapic.h and lapic.h. Remove the declaration in ioapic.h. kvm_apic_compare_prio() is declared in ioapic.h but defined in lapic.c. Move the declaration to lapic.h. kvm_irq_delivery_to_apic() is declared in ioapic.h but defined in irq_comm.c. Move the declaration to irq.h. hyperv.c needs to use kvm_irq_delivery_to_apic(). Include irq.h in hyperv.c. 
Reviewed-by: Vitaly Kuznetsov Signed-off-by: Peter Xu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 1 + arch/x86/kvm/ioapic.h | 6 ------ arch/x86/kvm/irq.h | 3 +++ arch/x86/kvm/lapic.h | 2 +- 4 files changed, 5 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 23ff65504d7e5..c7d4640b7b1c1 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -33,6 +33,7 @@ #include #include "trace.h" +#include "irq.h" #define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, 64) diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index ea1a4e0297dae..2fb2e3c807247 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -116,9 +116,6 @@ static inline int ioapic_in_kernel(struct kvm *kvm) } void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu); -bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, - int short_hand, unsigned int dest, int dest_mode); -int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, int trigger_mode); int kvm_ioapic_init(struct kvm *kvm); @@ -126,9 +123,6 @@ void kvm_ioapic_destroy(struct kvm *kvm); int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, int level, bool line_status); void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id); -int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, - struct kvm_lapic_irq *irq, - struct dest_map *dest_map); void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 7c6233d37c644..f173ab6b407e0 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -113,5 +113,8 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu); int kvm_setup_default_irq_routing(struct kvm *kvm); int kvm_setup_empty_irq_routing(struct kvm *kvm); +int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, + struct kvm_lapic_irq *irq, + struct dest_map *dest_map); #endif diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 39925afdfcdc8..0b9bbadd1f3ca 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -83,7 +83,7 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, void *data); bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, unsigned int dest, int dest_mode); - +int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr); bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr); void kvm_apic_update_ppr(struct kvm_vcpu *vcpu); From c96001c5702e66b64e0ffe533aa19d6567ce15bc Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 4 Dec 2019 20:07:18 +0100 Subject: [PATCH 007/204] KVM: X86: Use APIC_DEST_* macros properly in kvm_lapic_irq.dest_mode We were using either APIC_DEST_PHYSICAL|APIC_DEST_LOGICAL or 0|1 to fill in kvm_lapic_irq.dest_mode. It's fine only because in most cases when we check against dest_mode it's against APIC_DEST_PHYSICAL (which equals 0). However, that's not consistent. We'll have a problem when we want to start checking against APIC_DEST_LOGICAL, which does not equal 1. This patch first introduces the kvm_lapic_irq_dest_mode() helper, which takes a boolean destination mode and returns the corresponding APIC_DEST_* macro. 
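For context, the APIC_DEST_* values come from arch/x86/include/asm/apicdef.h and encode ICR bit 11 rather than a plain 0/1 boolean, which is why a raw 1 is not a valid logical-mode value (macros quoted here for illustration):

	#define APIC_DEST_PHYSICAL	0x00000
	#define APIC_DEST_LOGICAL	0x00800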
Then, it replaces the 0|1 settings of irq.dest_mode with the helper. Signed-off-by: Peter Xu Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 5 +++++ arch/x86/kvm/ioapic.c | 9 ++++++--- arch/x86/kvm/irq_comm.c | 7 ++++--- arch/x86/kvm/x86.c | 2 +- 4 files changed, 16 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b79cd6aa40756..2893eae5df9f4 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1022,6 +1022,11 @@ struct kvm_lapic_irq { bool msi_redir_hint; }; +static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical) +{ + return dest_mode_logical ? APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL; +} + struct kvm_x86_ops { int (*cpu_has_kvm_support)(void); /* __init */ int (*disabled_by_bios)(void); /* __init */ diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 9fd2dd89a1c5e..e623a4f8d27e8 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -331,7 +331,8 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) irq.vector = e->fields.vector; irq.delivery_mode = e->fields.delivery_mode << 8; irq.dest_id = e->fields.dest_id; - irq.dest_mode = e->fields.dest_mode; + irq.dest_mode = + kvm_lapic_irq_dest_mode(!!e->fields.dest_mode); bitmap_zero(&vcpu_bitmap, 16); kvm_bitmap_or_dest_vcpus(ioapic->kvm, &irq, &vcpu_bitmap); @@ -343,7 +344,9 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) * keep ioapic_handled_vectors synchronized. */ irq.dest_id = old_dest_id; - irq.dest_mode = old_dest_mode; + irq.dest_mode = + kvm_lapic_irq_dest_mode( + !!e->fields.dest_mode); kvm_bitmap_or_dest_vcpus(ioapic->kvm, &irq, &vcpu_bitmap); } @@ -369,7 +372,7 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status) irqe.dest_id = entry->fields.dest_id; irqe.vector = entry->fields.vector; - irqe.dest_mode = entry->fields.dest_mode; + irqe.dest_mode = kvm_lapic_irq_dest_mode(!!entry->fields.dest_mode); irqe.trig_mode = entry->fields.trig_mode; irqe.delivery_mode = entry->fields.delivery_mode << 8; irqe.level = 1; diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 8ecd48d31800a..22108ed66a765 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -52,8 +52,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, unsigned long dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)]; unsigned int dest_vcpus = 0; - if (irq->dest_mode == 0 && irq->dest_id == 0xff && - kvm_lowest_prio_delivery(irq)) { + if (irq->dest_mode == APIC_DEST_PHYSICAL && + irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) { printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n"); irq->delivery_mode = APIC_DM_FIXED; } @@ -114,7 +114,8 @@ void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, irq->dest_id |= MSI_ADDR_EXT_DEST_ID(e->msi.address_hi); irq->vector = (e->msi.data & MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT; - irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo; + irq->dest_mode = kvm_lapic_irq_dest_mode( + !!((1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo)); irq->trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data; irq->delivery_mode = e->msi.data & 0x700; irq->msi_redir_hint = ((e->msi.address_lo diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cf917139de6ba..f4e477e6c9544 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7358,7 +7358,7 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long 
flags, int apicid) struct kvm_lapic_irq lapic_irq; lapic_irq.shorthand = 0; - lapic_irq.dest_mode = 0; + lapic_irq.dest_mode = APIC_DEST_PHYSICAL; lapic_irq.level = 0; lapic_irq.dest_id = apicid; lapic_irq.msi_redir_hint = false; From ac8ef992cd02cdb8290ca788746d283fe3092500 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 4 Dec 2019 20:07:19 +0100 Subject: [PATCH 008/204] KVM: X86: Drop KVM_APIC_SHORT_MASK and KVM_APIC_DEST_MASK We have both APIC_SHORT_MASK and KVM_APIC_SHORT_MASK defined for the shorthand mask. Similarly, we have both APIC_DEST_MASK and KVM_APIC_DEST_MASK defined for the destination mode mask. Drop the KVM_APIC_* macros and switch their only user to the APIC_DEST_* macros instead. At the same time, move APIC_SHORT_MASK and APIC_DEST_MASK from lapic.c to lapic.h. Reviewed-by: Vitaly Kuznetsov Signed-off-by: Peter Xu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 3 --- arch/x86/kvm/lapic.h | 5 +++-- arch/x86/kvm/svm.c | 4 ++-- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 1eabe58bb6d5e..805c18178bbf9 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -56,9 +56,6 @@ #define APIC_VERSION (0x14UL | ((KVM_APIC_LVT_NUM - 1) << 16)) #define LAPIC_MMIO_LENGTH (1 << 12) /* followed define is not in apicdef.h */ -#define APIC_SHORT_MASK 0xc0000 -#define APIC_DEST_NOSHORT 0x0 -#define APIC_DEST_MASK 0x800 #define MAX_APIC_VECTOR 256 #define APIC_VECTORS_PER_REG 32 diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 0b9bbadd1f3ca..5a9f29ed9a4ba 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -10,8 +10,9 @@ #define KVM_APIC_SIPI 1 #define KVM_APIC_LVT_NUM 6 -#define KVM_APIC_SHORT_MASK 0xc0000 -#define KVM_APIC_DEST_MASK 0x800 +#define APIC_SHORT_MASK 0xc0000 +#define APIC_DEST_NOSHORT 0x0 +#define APIC_DEST_MASK 0x800 #define APIC_BUS_CYCLE_NS 1 #define APIC_BUS_FREQUENCY (1000000000ULL / APIC_BUS_CYCLE_NS) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 122d4ce3b1ab0..8f1b715dfde83 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4519,9 +4519,9 @@ static int avic_incomplete_ipi_interception(struct vcpu_svm *svm) */ kvm_for_each_vcpu(i, vcpu, kvm) { bool m = kvm_apic_match_dest(vcpu, apic, - icrl & KVM_APIC_SHORT_MASK, + icrl & APIC_SHORT_MASK, GET_APIC_DEST_FIELD(icrh), - icrl & KVM_APIC_DEST_MASK); + icrl & APIC_DEST_MASK); if (m && !avic_vcpu_is_running(vcpu)) kvm_vcpu_wake_up(vcpu); From 5c69d5c113f15a7a9956185b815d14d50f3efad4 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 4 Dec 2019 20:07:20 +0100 Subject: [PATCH 009/204] KVM: X86: Fix callers of kvm_apic_match_dest() to use correct macros Callers of kvm_apic_match_dest() should always pass in APIC_DEST_* macros for both the dest_mode and short_hand parameters. Fix up all the callers of kvm_apic_match_dest() that are not following the rule. While at it, rename the parameter from short_hand to shorthand in kvm_apic_match_dest(), as suggested by Vitaly. 
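As an example of the intended calling convention, the __rtc_irq_eoi_tracking_restore_one() fixup in the diff below ends up reading:

	if (!kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT,
				 e->fields.dest_id,
				 kvm_lapic_irq_dest_mode(!!e->fields.dest_mode)))
		return;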
Reported-by: Sean Christopherson Reported-by: Vitaly Kuznetsov Reviewed-by: Vitaly Kuznetsov Signed-off-by: Peter Xu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/ioapic.c | 11 +++++++---- arch/x86/kvm/irq_comm.c | 3 ++- arch/x86/kvm/lapic.c | 4 ++-- arch/x86/kvm/lapic.h | 2 +- 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index e623a4f8d27e8..f53daeaaeb379 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -108,8 +108,9 @@ static void __rtc_irq_eoi_tracking_restore_one(struct kvm_vcpu *vcpu) union kvm_ioapic_redirect_entry *e; e = &ioapic->redirtbl[RTC_GSI]; - if (!kvm_apic_match_dest(vcpu, NULL, 0, e->fields.dest_id, - e->fields.dest_mode)) + if (!kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, + e->fields.dest_id, + kvm_lapic_irq_dest_mode(!!e->fields.dest_mode))) return; new_val = kvm_apic_pending_eoi(vcpu, e->fields.vector); @@ -250,8 +251,10 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors) if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG || kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index) || index == RTC_GSI) { - if (kvm_apic_match_dest(vcpu, NULL, 0, - e->fields.dest_id, e->fields.dest_mode) || + u16 dm = kvm_lapic_irq_dest_mode(!!e->fields.dest_mode); + + if (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, + e->fields.dest_id, dm) || kvm_apic_pending_eoi(vcpu, e->fields.vector)) __set_bit(e->fields.vector, ioapic_handled_vectors); diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 22108ed66a765..7d083f71fc8ea 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -417,7 +417,8 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, kvm_set_msi_irq(vcpu->kvm, entry, &irq); - if (irq.level && kvm_apic_match_dest(vcpu, NULL, 0, + if (irq.level && + kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, irq.dest_id, irq.dest_mode)) __set_bit(irq.vector, ioapic_handled_vectors); } diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 805c18178bbf9..679692b55f6d8 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -789,13 +789,13 @@ static u32 kvm_apic_mda(struct kvm_vcpu *vcpu, unsigned int dest_id, } bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, - int short_hand, unsigned int dest, int dest_mode) + int shorthand, unsigned int dest, int dest_mode) { struct kvm_lapic *target = vcpu->arch.apic; u32 mda = kvm_apic_mda(vcpu, dest, source, target); ASSERT(target); - switch (short_hand) { + switch (shorthand) { case APIC_DEST_NOSHORT: if (dest_mode == APIC_DEST_PHYSICAL) return kvm_apic_match_physical_addr(target, mda); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 5a9f29ed9a4ba..ec730ce7a3444 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -83,7 +83,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val); int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, void *data); bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, - int short_hand, unsigned int dest, int dest_mode); + int shorthand, unsigned int dest, int dest_mode); int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr); bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr); From 150a84fee84fbaf2a2a6c76c44ae027b5c7d151a Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 4 Dec 2019 20:07:21 +0100 Subject: [PATCH 010/204] KVM: X86: Convert the last users 
of "shorthand = 0" to use macros Change the last users of "shorthand = 0" to use APIC_DEST_NOSHORT. Reviewed-by: Vitaly Kuznetsov Signed-off-by: Peter Xu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/ioapic.c | 4 ++-- arch/x86/kvm/irq_comm.c | 2 +- arch/x86/kvm/x86.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index f53daeaaeb379..77538fd77dc25 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -330,7 +330,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) if (e->fields.delivery_mode == APIC_DM_FIXED) { struct kvm_lapic_irq irq; - irq.shorthand = 0; + irq.shorthand = APIC_DEST_NOSHORT; irq.vector = e->fields.vector; irq.delivery_mode = e->fields.delivery_mode << 8; irq.dest_id = e->fields.dest_id; @@ -379,7 +379,7 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status) irqe.trig_mode = entry->fields.trig_mode; irqe.delivery_mode = entry->fields.delivery_mode << 8; irqe.level = 1; - irqe.shorthand = 0; + irqe.shorthand = APIC_DEST_NOSHORT; irqe.msi_redir_hint = false; if (irqe.trig_mode == IOAPIC_EDGE_TRIG) diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 7d083f71fc8ea..9d711c2451c79 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -121,7 +121,7 @@ void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, irq->msi_redir_hint = ((e->msi.address_lo & MSI_ADDR_REDIRECTION_LOWPRI) > 0); irq->level = 1; - irq->shorthand = 0; + irq->shorthand = APIC_DEST_NOSHORT; } EXPORT_SYMBOL_GPL(kvm_set_msi_irq); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f4e477e6c9544..16902d0aad3ab 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7357,7 +7357,7 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid) { struct kvm_lapic_irq lapic_irq; - lapic_irq.shorthand = 0; + lapic_irq.shorthand = APIC_DEST_NOSHORT; lapic_irq.dest_mode = APIC_DEST_PHYSICAL; lapic_irq.level = 0; lapic_irq.dest_id = apicid; From 0a03cbdac115fdcc06fd9d05ce3c389d0ead9a71 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 6 Dec 2019 16:20:18 +0800 Subject: [PATCH 011/204] KVM: x86: Fix some comment typos Fix some typos in comment. Signed-off-by: Miaohe Lin Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/x86.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 35d37099bec66..c19f3ccaace31 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1529,7 +1529,7 @@ struct rmap_iterator { /* * Iteration must be started by this function. This should also be used after * removing/dropping sptes from the rmap link because in such cases the - * information in the itererator may not be valid. + * information in the iterator may not be valid. * * Returns sptep if found, NULL otherwise. */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 16902d0aad3ab..3051324f72d33 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9792,7 +9792,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, * * The reason is, in case of PML, we need to set D-bit for any slots * with dirty logging disabled in order to eliminate unnecessary GPA - * logging in PML buffer (and potential PML buffer full VMEXT). This + * logging in PML buffer (and potential PML buffer full VMEXIT). 
This * guarantees leaving PML enabled during guest's lifetime won't have * any additional overhead from PML when guest is running with dirty * logging disabled for memory slots. From 9dadc2f918df26e64aa04794cdb4d8667c934f47 Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Fri, 6 Dec 2019 16:45:24 +0800 Subject: [PATCH 012/204] KVM: VMX: Rename INTERRUPT_PENDING to INTERRUPT_WINDOW Rename the interrupt-window exiting related definitions to match the latest Intel SDM. No functional changes. Signed-off-by: Xiaoyao Li Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/vmx.h | 2 +- arch/x86/include/uapi/asm/vmx.h | 4 ++-- arch/x86/kvm/vmx/nested.c | 12 ++++++------ arch/x86/kvm/vmx/vmx.c | 10 +++++----- tools/arch/x86/include/uapi/asm/vmx.h | 4 ++-- tools/testing/selftests/kvm/include/x86_64/vmx.h | 4 ++-- 6 files changed, 18 insertions(+), 18 deletions(-) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 1835767aa3356..5acda8d9b9a7c 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -19,7 +19,7 @@ /* * Definitions of Primary Processor-Based VM-Execution Controls. */ -#define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004 +#define CPU_BASED_INTR_WINDOW_EXITING 0x00000004 #define CPU_BASED_USE_TSC_OFFSETING 0x00000008 #define CPU_BASED_HLT_EXITING 0x00000080 #define CPU_BASED_INVLPG_EXITING 0x00000200 diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index 3eb8411ab60ef..e95b72ec19bc0 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -33,7 +33,7 @@ #define EXIT_REASON_TRIPLE_FAULT 2 #define EXIT_REASON_INIT_SIGNAL 3 -#define EXIT_REASON_PENDING_INTERRUPT 7 +#define EXIT_REASON_INTERRUPT_WINDOW 7 #define EXIT_REASON_NMI_WINDOW 8 #define EXIT_REASON_TASK_SWITCH 9 #define EXIT_REASON_CPUID 10 @@ -94,7 +94,7 @@ { EXIT_REASON_EXTERNAL_INTERRUPT, "EXTERNAL_INTERRUPT" }, \ { EXIT_REASON_TRIPLE_FAULT, "TRIPLE_FAULT" }, \ { EXIT_REASON_INIT_SIGNAL, "INIT_SIGNAL" }, \ - { EXIT_REASON_PENDING_INTERRUPT, "PENDING_INTERRUPT" }, \ + { EXIT_REASON_INTERRUPT_WINDOW, "INTERRUPT_WINDOW" }, \ { EXIT_REASON_NMI_WINDOW, "NMI_WINDOW" }, \ { EXIT_REASON_TASK_SWITCH, "TASK_SWITCH" }, \ { EXIT_REASON_CPUID, "CPUID" }, \ diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 4aea7d304bebd..a77e92bd3f725 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2172,7 +2172,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) * EXEC CONTROLS */ exec_control = vmx_exec_control(vmx); /* L0's desires */ - exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; + exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING; exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; exec_control &= ~CPU_BASED_TPR_SHADOW; exec_control |= vmcs12->cpu_based_vm_exec_control; @@ -3183,7 +3183,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, u32 exit_qual; evaluate_pending_interrupts = exec_controls_get(vmx) & (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING); if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); @@ -3408,7 +3408,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) && !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) && !(vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_NMI_PENDING) && - 
!((vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_INTR_PENDING) && + !((vmcs12->cpu_based_vm_exec_control & CPU_BASED_INTR_WINDOW_EXITING) && (vmcs12->guest_rflags & X86_EFLAGS_IF))) { vmx->nested.nested_run_pending = 0; return kvm_vcpu_halt(vcpu); @@ -5524,8 +5524,8 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) return false; case EXIT_REASON_TRIPLE_FAULT: return true; - case EXIT_REASON_PENDING_INTERRUPT: - return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING); + case EXIT_REASON_INTERRUPT_WINDOW: + return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING); case EXIT_REASON_NMI_WINDOW: return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING); case EXIT_REASON_TASK_SWITCH: @@ -6015,7 +6015,7 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps, msrs->procbased_ctls_low = CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; msrs->procbased_ctls_high &= - CPU_BASED_VIRTUAL_INTR_PENDING | + CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING | CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 5fb7a1695a24a..0693dd0b5dbcd 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4350,7 +4350,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) static void enable_irq_window(struct kvm_vcpu *vcpu) { - exec_controls_setbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_INTR_PENDING); + exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING); } static void enable_nmi_window(struct kvm_vcpu *vcpu) @@ -4969,7 +4969,7 @@ static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) static int handle_interrupt_window(struct kvm_vcpu *vcpu) { - exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_INTR_PENDING); + exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING); kvm_make_request(KVM_REQ_EVENT, vcpu); @@ -5203,7 +5203,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) WARN_ON_ONCE(vmx->emulation_required && vmx->nested.nested_run_pending); intr_window_requested = exec_controls_get(vmx) & - CPU_BASED_VIRTUAL_INTR_PENDING; + CPU_BASED_INTR_WINDOW_EXITING; while (vmx->emulation_required && count-- != 0) { if (intr_window_requested && vmx_interrupt_allowed(vcpu)) @@ -5527,7 +5527,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_CPUID] = kvm_emulate_cpuid, [EXIT_REASON_MSR_READ] = kvm_emulate_rdmsr, [EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr, - [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, + [EXIT_REASON_INTERRUPT_WINDOW] = handle_interrupt_window, [EXIT_REASON_HLT] = kvm_emulate_halt, [EXIT_REASON_INVD] = handle_invd, [EXIT_REASON_INVLPG] = handle_invlpg, @@ -5907,7 +5907,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) return kvm_emulate_wrmsr(vcpu); else if (exit_reason == EXIT_REASON_PREEMPTION_TIMER) return handle_preemption_timer(vcpu); - else if (exit_reason == EXIT_REASON_PENDING_INTERRUPT) + else if (exit_reason == EXIT_REASON_INTERRUPT_WINDOW) return handle_interrupt_window(vcpu); else if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) return handle_external_interrupt(vcpu); diff --git a/tools/arch/x86/include/uapi/asm/vmx.h b/tools/arch/x86/include/uapi/asm/vmx.h index 3eb8411ab60ef..e95b72ec19bc0 100644 --- a/tools/arch/x86/include/uapi/asm/vmx.h +++ b/tools/arch/x86/include/uapi/asm/vmx.h @@ -33,7 +33,7 @@ #define EXIT_REASON_TRIPLE_FAULT 2 #define 
EXIT_REASON_INIT_SIGNAL 3 -#define EXIT_REASON_PENDING_INTERRUPT 7 +#define EXIT_REASON_INTERRUPT_WINDOW 7 #define EXIT_REASON_NMI_WINDOW 8 #define EXIT_REASON_TASK_SWITCH 9 #define EXIT_REASON_CPUID 10 @@ -94,7 +94,7 @@ { EXIT_REASON_EXTERNAL_INTERRUPT, "EXTERNAL_INTERRUPT" }, \ { EXIT_REASON_TRIPLE_FAULT, "TRIPLE_FAULT" }, \ { EXIT_REASON_INIT_SIGNAL, "INIT_SIGNAL" }, \ - { EXIT_REASON_PENDING_INTERRUPT, "PENDING_INTERRUPT" }, \ + { EXIT_REASON_INTERRUPT_WINDOW, "INTERRUPT_WINDOW" }, \ { EXIT_REASON_NMI_WINDOW, "NMI_WINDOW" }, \ { EXIT_REASON_TASK_SWITCH, "TASK_SWITCH" }, \ { EXIT_REASON_CPUID, "CPUID" }, \ diff --git a/tools/testing/selftests/kvm/include/x86_64/vmx.h b/tools/testing/selftests/kvm/include/x86_64/vmx.h index f52e0ba84fedb..c6e442d7a2413 100644 --- a/tools/testing/selftests/kvm/include/x86_64/vmx.h +++ b/tools/testing/selftests/kvm/include/x86_64/vmx.h @@ -18,7 +18,7 @@ /* * Definitions of Primary Processor-Based VM-Execution Controls. */ -#define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004 +#define CPU_BASED_INTR_WINDOW_EXITING 0x00000004 #define CPU_BASED_USE_TSC_OFFSETING 0x00000008 #define CPU_BASED_HLT_EXITING 0x00000080 #define CPU_BASED_INVLPG_EXITING 0x00000200 @@ -103,7 +103,7 @@ #define EXIT_REASON_EXCEPTION_NMI 0 #define EXIT_REASON_EXTERNAL_INTERRUPT 1 #define EXIT_REASON_TRIPLE_FAULT 2 -#define EXIT_REASON_PENDING_INTERRUPT 7 +#define EXIT_REASON_INTERRUPT_WINDOW 7 #define EXIT_REASON_NMI_WINDOW 8 #define EXIT_REASON_TASK_SWITCH 9 #define EXIT_REASON_CPUID 10 From 4e2a0bc56ad197e5ccfab8395649b681067fe8cb Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Fri, 6 Dec 2019 16:45:25 +0800 Subject: [PATCH 013/204] KVM: VMX: Rename NMI_PENDING to NMI_WINDOW Rename the NMI-window exiting related definitions to match the latest Intel SDM. No functional changes. 
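For context, only the name changes; the control remains bit 22 of the primary processor-based VM-execution controls, and the usual open/close pattern in vmx.c is untouched (condensed from the hunks below):

	/* Request an NMI-window VM exit... */
	exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
	/* ...and clear the control once the exit has been handled. */
	exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);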
Signed-off-by: Xiaoyao Li Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/vmx.h | 2 +- arch/x86/kvm/vmx/nested.c | 12 ++++++------ arch/x86/kvm/vmx/vmx.c | 4 ++-- tools/testing/selftests/kvm/include/x86_64/vmx.h | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 5acda8d9b9a7c..06d4420508c59 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -31,7 +31,7 @@ #define CPU_BASED_CR8_LOAD_EXITING 0x00080000 #define CPU_BASED_CR8_STORE_EXITING 0x00100000 #define CPU_BASED_TPR_SHADOW 0x00200000 -#define CPU_BASED_VIRTUAL_NMI_PENDING 0x00400000 +#define CPU_BASED_NMI_WINDOW_EXITING 0x00400000 #define CPU_BASED_MOV_DR_EXITING 0x00800000 #define CPU_BASED_UNCOND_IO_EXITING 0x01000000 #define CPU_BASED_USE_IO_BITMAPS 0x02000000 diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index a77e92bd3f725..f8b9da53191e7 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2173,7 +2173,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) */ exec_control = vmx_exec_control(vmx); /* L0's desires */ exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING; - exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; + exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING; exec_control &= ~CPU_BASED_TPR_SHADOW; exec_control |= vmcs12->cpu_based_vm_exec_control; @@ -2566,7 +2566,7 @@ static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) return -EINVAL; if (CC(!nested_cpu_has_virtual_nmis(vmcs12) && - nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING))) + nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING))) return -EINVAL; return 0; @@ -3183,7 +3183,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, u32 exit_qual; evaluate_pending_interrupts = exec_controls_get(vmx) & - (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_VIRTUAL_NMI_PENDING); + (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING); if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); @@ -3407,7 +3407,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) */ if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) && !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) && - !(vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_NMI_PENDING) && + !(vmcs12->cpu_based_vm_exec_control & CPU_BASED_NMI_WINDOW_EXITING) && !((vmcs12->cpu_based_vm_exec_control & CPU_BASED_INTR_WINDOW_EXITING) && (vmcs12->guest_rflags & X86_EFLAGS_IF))) { vmx->nested.nested_run_pending = 0; @@ -5527,7 +5527,7 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) case EXIT_REASON_INTERRUPT_WINDOW: return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING); case EXIT_REASON_NMI_WINDOW: - return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING); + return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING); case EXIT_REASON_TASK_SWITCH: return true; case EXIT_REASON_CPUID: @@ -6016,7 +6016,7 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps, CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; msrs->procbased_ctls_high &= CPU_BASED_INTR_WINDOW_EXITING | - CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING | + CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETING | CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING | diff --git a/arch/x86/kvm/vmx/vmx.c 
b/arch/x86/kvm/vmx/vmx.c index 0693dd0b5dbcd..51d8b2043dd0f 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4361,7 +4361,7 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) return; } - exec_controls_setbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_NMI_PENDING); + exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); } static void vmx_inject_irq(struct kvm_vcpu *vcpu) @@ -5182,7 +5182,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) static int handle_nmi_window(struct kvm_vcpu *vcpu) { WARN_ON_ONCE(!enable_vnmi); - exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_NMI_PENDING); + exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); ++vcpu->stat.nmi_window_exits; kvm_make_request(KVM_REQ_EVENT, vcpu); diff --git a/tools/testing/selftests/kvm/include/x86_64/vmx.h b/tools/testing/selftests/kvm/include/x86_64/vmx.h index c6e442d7a2413..7eb38451c3596 100644 --- a/tools/testing/selftests/kvm/include/x86_64/vmx.h +++ b/tools/testing/selftests/kvm/include/x86_64/vmx.h @@ -30,7 +30,7 @@ #define CPU_BASED_CR8_LOAD_EXITING 0x00080000 #define CPU_BASED_CR8_STORE_EXITING 0x00100000 #define CPU_BASED_TPR_SHADOW 0x00200000 -#define CPU_BASED_VIRTUAL_NMI_PENDING 0x00400000 +#define CPU_BASED_NMI_WINDOW_EXITING 0x00400000 #define CPU_BASED_MOV_DR_EXITING 0x00800000 #define CPU_BASED_UNCOND_IO_EXITING 0x01000000 #define CPU_BASED_USE_IO_BITMAPS 0x02000000 From 5e3d394fdd9e6b49cd8b28d85adff100a5bddc66 Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Fri, 6 Dec 2019 16:45:26 +0800 Subject: [PATCH 014/204] KVM: VMX: Fix the spelling of CPU_BASED_USE_TSC_OFFSETTING The misspellings were found by checkpatch.pl, so fix them. Signed-off-by: Xiaoyao Li Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/vmx.h | 2 +- arch/x86/kvm/vmx/nested.c | 8 ++++---- arch/x86/kvm/vmx/vmx.c | 6 +++--- tools/testing/selftests/kvm/include/x86_64/vmx.h | 2 +- tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 06d4420508c59..d716fe938fc03 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -20,7 +20,7 @@ * Definitions of Primary Processor-Based VM-Execution Controls. */ #define CPU_BASED_INTR_WINDOW_EXITING 0x00000004 -#define CPU_BASED_USE_TSC_OFFSETING 0x00000008 +#define CPU_BASED_USE_TSC_OFFSETTING 0x00000008 #define CPU_BASED_HLT_EXITING 0x00000080 #define CPU_BASED_INVLPG_EXITING 0x00000200 #define CPU_BASED_MWAIT_EXITING 0x00000400 diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index f8b9da53191e7..8c215da368b79 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3230,7 +3230,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, } enter_guest_mode(vcpu); - if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) + if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) vcpu->arch.tsc_offset += vmcs12->tsc_offset; if (prepare_vmcs02(vcpu, vmcs12, &exit_qual)) @@ -3294,7 +3294,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, * 26.7 "VM-entry failures during or after loading guest state". 
*/ vmentry_fail_vmexit_guest_mode: - if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) + if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) vcpu->arch.tsc_offset -= vmcs12->tsc_offset; leave_guest_mode(vcpu); @@ -4209,7 +4209,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, if (nested_cpu_has_preemption_timer(vmcs12)) hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); - if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) + if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) vcpu->arch.tsc_offset -= vmcs12->tsc_offset; if (likely(!vmx->fail)) { @@ -6016,7 +6016,7 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps, CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; msrs->procbased_ctls_high &= CPU_BASED_INTR_WINDOW_EXITING | - CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETING | + CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING | CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING | diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 51d8b2043dd0f..b5a0c2e05825e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1716,7 +1716,7 @@ static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu) struct vmcs12 *vmcs12 = get_vmcs12(vcpu); if (is_guest_mode(vcpu) && - (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)) + (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)) return vcpu->arch.tsc_offset - vmcs12->tsc_offset; return vcpu->arch.tsc_offset; @@ -1734,7 +1734,7 @@ static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) * to the newly set TSC to get L2's TSC. */ if (is_guest_mode(vcpu) && - (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)) + (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)) g_tsc_offset = vmcs12->tsc_offset; trace_kvm_write_tsc_offset(vcpu->vcpu_id, @@ -2353,7 +2353,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, CPU_BASED_CR3_STORE_EXITING | CPU_BASED_UNCOND_IO_EXITING | CPU_BASED_MOV_DR_EXITING | - CPU_BASED_USE_TSC_OFFSETING | + CPU_BASED_USE_TSC_OFFSETTING | CPU_BASED_MWAIT_EXITING | CPU_BASED_MONITOR_EXITING | CPU_BASED_INVLPG_EXITING | diff --git a/tools/testing/selftests/kvm/include/x86_64/vmx.h b/tools/testing/selftests/kvm/include/x86_64/vmx.h index 7eb38451c3596..3d27069b9ed9c 100644 --- a/tools/testing/selftests/kvm/include/x86_64/vmx.h +++ b/tools/testing/selftests/kvm/include/x86_64/vmx.h @@ -19,7 +19,7 @@ * Definitions of Primary Processor-Based VM-Execution Controls. 
*/ #define CPU_BASED_INTR_WINDOW_EXITING 0x00000004 -#define CPU_BASED_USE_TSC_OFFSETING 0x00000008 +#define CPU_BASED_USE_TSC_OFFSETTING 0x00000008 #define CPU_BASED_HLT_EXITING 0x00000080 #define CPU_BASED_INVLPG_EXITING 0x00000200 #define CPU_BASED_MWAIT_EXITING 0x00000400 diff --git a/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c b/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c index 5590fd2bcf87d..69e482a95c47f 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c @@ -98,7 +98,7 @@ static void l1_guest_code(struct vmx_pages *vmx_pages) prepare_vmcs(vmx_pages, l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]); control = vmreadz(CPU_BASED_VM_EXEC_CONTROL); - control |= CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_USE_TSC_OFFSETING; + control |= CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_USE_TSC_OFFSETTING; vmwrite(CPU_BASED_VM_EXEC_CONTROL, control); vmwrite(TSC_OFFSET, TSC_OFFSET_VALUE); From dd2d6042b7f4a5440705b4ffc6c4c2dba81a43b7 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 6 Dec 2019 15:46:35 -0800 Subject: [PATCH 015/204] kvm: nVMX: VMWRITE checks VMCS-link pointer before VMCS field According to the SDM, a VMWRITE in VMX non-root operation with an invalid VMCS-link pointer results in VMfailInvalid before the validity of the VMCS field in the secondary source operand is checked. For consistency, modify both handle_vmwrite and handle_vmread, even though there was no problem with the latter. Fixes: 6d894f498f5d1 ("KVM: nVMX: vmread/vmwrite: Use shadow vmcs12 if running L2") Signed-off-by: Jim Mattson Cc: Liran Alon Cc: Paolo Bonzini Cc: Vitaly Kuznetsov Reviewed-by: Peter Shier Reviewed-by: Oliver Upton Reviewed-by: Jon Cargille Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 59 +++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 34 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 8c215da368b79..a3ec92c316f6a 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4753,32 +4753,28 @@ static int handle_vmread(struct kvm_vcpu *vcpu) { unsigned long field; u64 field_value; + struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); int len; gva_t gva = 0; - struct vmcs12 *vmcs12; + struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) + : get_vmcs12(vcpu); struct x86_exception e; short offset; if (!nested_vmx_check_permission(vcpu)) return 1; - if (to_vmx(vcpu)->nested.current_vmptr == -1ull) + /* + * In VMX non-root operation, when the VMCS-link pointer is -1ull, + * any VMREAD sets the ALU flags for VMfailInvalid. + */ + if (vmx->nested.current_vmptr == -1ull || + (is_guest_mode(vcpu) && + get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)) return nested_vmx_failInvalid(vcpu); - if (!is_guest_mode(vcpu)) - vmcs12 = get_vmcs12(vcpu); - else { - /* - * When vmcs->vmcs_link_pointer is -1ull, any VMREAD - * to shadowed-field sets the ALU flags for VMfailInvalid. 
- */ - if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) - return nested_vmx_failInvalid(vcpu); - vmcs12 = get_shadow_vmcs12(vcpu); - } - /* Decode instruction info and find the field to read */ field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); @@ -4855,13 +4851,20 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) */ u64 field_value = 0; struct x86_exception e; - struct vmcs12 *vmcs12; + struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) + : get_vmcs12(vcpu); short offset; if (!nested_vmx_check_permission(vcpu)) return 1; - if (vmx->nested.current_vmptr == -1ull) + /* + * In VMX non-root operation, when the VMCS-link pointer is -1ull, + * any VMWRITE sets the ALU flags for VMfailInvalid. + */ + if (vmx->nested.current_vmptr == -1ull || + (is_guest_mode(vcpu) && + get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)) return nested_vmx_failInvalid(vcpu); if (vmx_instruction_info & (1u << 10)) @@ -4889,24 +4892,12 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) return nested_vmx_failValid(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); - if (!is_guest_mode(vcpu)) { - vmcs12 = get_vmcs12(vcpu); - - /* - * Ensure vmcs12 is up-to-date before any VMWRITE that dirties - * vmcs12, else we may crush a field or consume a stale value. - */ - if (!is_shadow_field_rw(field)) - copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); - } else { - /* - * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE - * to shadowed-field sets the ALU flags for VMfailInvalid. - */ - if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) - return nested_vmx_failInvalid(vcpu); - vmcs12 = get_shadow_vmcs12(vcpu); - } + /* + * Ensure vmcs12 is up-to-date before any VMWRITE that dirties + * vmcs12, else we may crush a field or consume a stale value. + */ + if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) + copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); offset = vmcs_field_to_offset(field); if (offset < 0) From 693e02cc24090c379217138719d9d84e50036b24 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 6 Dec 2019 15:46:36 -0800 Subject: [PATCH 016/204] kvm: nVMX: VMWRITE checks unsupported field before read-only field According to the SDM, VMWRITE checks to see if the secondary source operand corresponds to an unsupported VMCS field before it checks to see if the secondary source operand corresponds to a VM-exit information field and the processor does not support writing to VM-exit information fields. Fixes: 49f705c5324aa ("KVM: nVMX: Implement VMREAD and VMWRITE") Signed-off-by: Jim Mattson Cc: Paolo Bonzini Reviewed-by: Peter Shier Reviewed-by: Oliver Upton Reviewed-by: Jon Cargille Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index a3ec92c316f6a..e2fa5aefed29b 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4883,6 +4883,12 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); + + offset = vmcs_field_to_offset(field); + if (offset < 0) + return nested_vmx_failValid(vcpu, + VMXERR_UNSUPPORTED_VMCS_COMPONENT); + /* * If the vCPU supports "VMWRITE to any supported field in the * VMCS," then the "read-only" fields are actually read/write. 
@@ -4899,11 +4905,6 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); - offset = vmcs_field_to_offset(field); - if (offset < 0) - return nested_vmx_failValid(vcpu, - VMXERR_UNSUPPORTED_VMCS_COMPONENT); - /* * Some Intel CPUs intentionally drop the reserved bits of the AR byte * fields on VMWRITE. Emulate this behavior to ensure consistent KVM From c90f4d03cce1814b4e08372359116710bbaccce3 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 6 Dec 2019 15:46:37 -0800 Subject: [PATCH 017/204] kvm: nVMX: Aesthetic cleanup of handle_vmread and handle_vmwrite Apply reverse fir tree declaration order, shorten some variable names to avoid line wrap, reformat a block comment, delete an extra blank line, and use BIT(10) instead of (1u << 10). Signed-off-by: Jim Mattson Cc: Paolo Bonzini Cc: Sean Christopherson Reviewed-by: Peter Shier Reviewed-by: Oliver Upton Reviewed-by: Jon Cargille Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 70 +++++++++++++++++++-------------------- 1 file changed, 34 insertions(+), 36 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index e2fa5aefed29b..7b01ef1d87e66 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4751,17 +4751,17 @@ static int handle_vmresume(struct kvm_vcpu *vcpu) static int handle_vmread(struct kvm_vcpu *vcpu) { - unsigned long field; - u64 field_value; - struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - int len; - gva_t gva = 0; struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) : get_vmcs12(vcpu); + unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); + struct vcpu_vmx *vmx = to_vmx(vcpu); struct x86_exception e; + unsigned long field; + u64 value; + gva_t gva = 0; short offset; + int len; if (!nested_vmx_check_permission(vcpu)) return 1; @@ -4776,7 +4776,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) return nested_vmx_failInvalid(vcpu); /* Decode instruction info and find the field to read */ - field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); + field = kvm_register_readl(vcpu, (((instr_info) >> 28) & 0xf)); offset = vmcs_field_to_offset(field); if (offset < 0) @@ -4786,24 +4786,23 @@ static int handle_vmread(struct kvm_vcpu *vcpu) if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); - /* Read the field, zero-extended to a u64 field_value */ - field_value = vmcs12_read_any(vmcs12, field, offset); + /* Read the field, zero-extended to a u64 value */ + value = vmcs12_read_any(vmcs12, field, offset); /* * Now copy part of this value to register or memory, as requested. * Note that the number of bits actually copied is 32 or 64 depending * on the guest's mode (32 or 64 bit), not on the given field's length. */ - if (vmx_instruction_info & (1u << 10)) { - kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf), - field_value); + if (instr_info & BIT(10)) { + kvm_register_writel(vcpu, (((instr_info) >> 3) & 0xf), value); } else { len = is_64_bit_mode(vcpu) ? 
8 : 4; if (get_vmx_mem_address(vcpu, exit_qualification, - vmx_instruction_info, true, len, &gva)) + instr_info, true, len, &gva)) return 1; /* _system ok, nested_vmx_check_permission has verified cpl=0 */ - if (kvm_write_guest_virt_system(vcpu, gva, &field_value, len, &e)) + if (kvm_write_guest_virt_system(vcpu, gva, &value, len, &e)) kvm_inject_page_fault(vcpu, &e); } @@ -4836,24 +4835,25 @@ static bool is_shadow_field_ro(unsigned long field) static int handle_vmwrite(struct kvm_vcpu *vcpu) { + struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) + : get_vmcs12(vcpu); + unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct x86_exception e; unsigned long field; - int len; + short offset; gva_t gva; - struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + int len; - /* The value to write might be 32 or 64 bits, depending on L1's long + /* + * The value to write might be 32 or 64 bits, depending on L1's long * mode, and eventually we need to write that into a field of several * possible lengths. The code below first zero-extends the value to 64 - * bit (field_value), and then copies only the appropriate number of + * bit (value), and then copies only the appropriate number of * bits into the vmcs12 field. */ - u64 field_value = 0; - struct x86_exception e; - struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) - : get_vmcs12(vcpu); - short offset; + u64 value = 0; if (!nested_vmx_check_permission(vcpu)) return 1; @@ -4867,22 +4867,20 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)) return nested_vmx_failInvalid(vcpu); - if (vmx_instruction_info & (1u << 10)) - field_value = kvm_register_readl(vcpu, - (((vmx_instruction_info) >> 3) & 0xf)); + if (instr_info & BIT(10)) + value = kvm_register_readl(vcpu, (((instr_info) >> 3) & 0xf)); else { len = is_64_bit_mode(vcpu) ? 8 : 4; if (get_vmx_mem_address(vcpu, exit_qualification, - vmx_instruction_info, false, len, &gva)) + instr_info, false, len, &gva)) return 1; - if (kvm_read_guest_virt(vcpu, gva, &field_value, len, &e)) { + if (kvm_read_guest_virt(vcpu, gva, &value, len, &e)) { kvm_inject_page_fault(vcpu, &e); return 1; } } - - field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); + field = kvm_register_readl(vcpu, (((instr_info) >> 28) & 0xf)); offset = vmcs_field_to_offset(field); if (offset < 0) @@ -4914,9 +4912,9 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) * the stripped down value, L2 sees the full value as stored by KVM). 
*/ if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES) - field_value &= 0x1f0ff; + value &= 0x1f0ff; - vmcs12_write_any(vmcs12, field, offset, field_value); + vmcs12_write_any(vmcs12, field, offset, value); /* * Do not track vmcs12 dirty-state if in guest-mode as we actually * dirty shadow vmcs12 instead of vmcs12. */ if (!is_guest_mode(vcpu)) { preempt_disable(); vmcs_load(vmx->vmcs01.shadow_vmcs); - __vmcs_writel(field, field_value); + __vmcs_writel(field, value); vmcs_clear(vmx->vmcs01.shadow_vmcs); vmcs_load(vmx->loaded_vmcs->vmcs); From 885f7d6cb87eb15d62613c05d8012e9370fb5e27 Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Fri, 6 Dec 2019 18:45:52 +0800 Subject: [PATCH 018/204] KVM: Remove duplicated declaration of kvm_vcpu_kick There are two declarations of kvm_vcpu_kick() in kvm_host.h, one of which is redundant. Remove it to keep git grep output a bit cleaner. Reviewed-by: Cornelia Huck Signed-off-by: Zenghui Yu Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 538c25e778c07..0d632a75fce95 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -982,7 +982,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm); void kvm_arch_sync_events(struct kvm *kvm); int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); -void kvm_vcpu_kick(struct kvm_vcpu *vcpu); bool kvm_is_reserved_pfn(kvm_pfn_t pfn); bool kvm_is_zone_device_pfn(kvm_pfn_t pfn); From 8262fe85b4edc5fb3dd7b9520bf5c6b4f027fa55 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 6 Dec 2019 10:53:52 +0800 Subject: [PATCH 019/204] KVM: lib: use jump label to handle resource release in irq_bypass_register_consumer() Use an out_err jump label to handle resource release. It is good practice to release resources in one place, and doing so eliminates some duplicated code.
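The result follows the common kernel unwind idiom, roughly (a minimal sketch of the shape only; the condition name is made up, see the diff for the real code):

	mutex_lock(&lock);
	...
	if (conflict) {		/* hypothetical condition */
		ret = -EBUSY;
		goto out_err;
	}
	...
	mutex_unlock(&lock);
	return 0;

out_err:
	mutex_unlock(&lock);
	module_put(THIS_MODULE);
	return ret;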
Signed-off-by: Miaohe Lin Signed-off-by: Paolo Bonzini --- virt/lib/irqbypass.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/virt/lib/irqbypass.c b/virt/lib/irqbypass.c index 43de8ae78fa17..cb0c801b3bc23 100644 --- a/virt/lib/irqbypass.c +++ b/virt/lib/irqbypass.c @@ -179,6 +179,7 @@ int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer) { struct irq_bypass_consumer *tmp; struct irq_bypass_producer *producer; + int ret; if (!consumer->token || !consumer->add_producer || !consumer->del_producer) @@ -193,20 +194,16 @@ int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer) list_for_each_entry(tmp, &consumers, node) { if (tmp->token == consumer->token || tmp == consumer) { - mutex_unlock(&lock); - module_put(THIS_MODULE); - return -EBUSY; + ret = -EBUSY; + goto out_err; } } list_for_each_entry(producer, &producers, node) { if (producer->token == consumer->token) { - int ret = __connect(producer, consumer); - if (ret) { - mutex_unlock(&lock); - module_put(THIS_MODULE); - return ret; - } + ret = __connect(producer, consumer); + if (ret) + goto out_err; break; } } @@ -216,6 +213,10 @@ int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer) mutex_unlock(&lock); return 0; +out_err: + mutex_unlock(&lock); + module_put(THIS_MODULE); + return ret; } EXPORT_SYMBOL_GPL(irq_bypass_register_consumer); From bbfdafa860bb41344fad8fea06b78c74ec79e181 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 6 Dec 2019 10:53:53 +0800 Subject: [PATCH 020/204] KVM: lib: use jump label to handle resource release in irq_bypass_register_producer() Use an out_err jump label to handle resource release. It is good practice to release resources in one place, and doing so eliminates some duplicated code. Signed-off-by: Miaohe Lin Signed-off-by: Paolo Bonzini --- virt/lib/irqbypass.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/virt/lib/irqbypass.c b/virt/lib/irqbypass.c index cb0c801b3bc23..28fda42e471bb 100644 --- a/virt/lib/irqbypass.c +++ b/virt/lib/irqbypass.c @@ -85,6 +85,7 @@ int irq_bypass_register_producer(struct irq_bypass_producer *producer) { struct irq_bypass_producer *tmp; struct irq_bypass_consumer *consumer; + int ret; if (!producer->token) return -EINVAL; @@ -98,20 +99,16 @@ int irq_bypass_register_producer(struct irq_bypass_producer *producer) list_for_each_entry(tmp, &producers, node) { if (tmp->token == producer->token) { - mutex_unlock(&lock); - module_put(THIS_MODULE); - return -EBUSY; + ret = -EBUSY; + goto out_err; } } list_for_each_entry(consumer, &consumers, node) { if (consumer->token == producer->token) { - int ret = __connect(producer, consumer); - if (ret) { - mutex_unlock(&lock); - module_put(THIS_MODULE); - return ret; - } + ret = __connect(producer, consumer); + if (ret) + goto out_err; break; } } @@ -121,6 +118,10 @@ int irq_bypass_register_producer(struct irq_bypass_producer *producer) mutex_unlock(&lock); return 0; +out_err: + mutex_unlock(&lock); + module_put(THIS_MODULE); + return ret; } EXPORT_SYMBOL_GPL(irq_bypass_register_producer); From f958bd2314d117f8c29f4821401bc1925bc2e5ef Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 9 Dec 2019 12:19:31 -0800 Subject: [PATCH 021/204] KVM: x86: Fix potential put_fpu() w/o load_fpu() on MPX platform Unlike most state managed by XSAVE, MPX is initialized to zero on INIT. 
Because INITs are usually recognized in the context of a VCPU_RUN call, kvm_vcpu_reset() puts the guest's FPU so that the FPU state is resident in memory, zeros the MPX state, and reloads FPU state to hardware. But, in the unlikely event that an INIT is recognized during kvm_arch_vcpu_ioctl_get_mpstate() via kvm_apic_accept_events(), kvm_vcpu_reset() will call kvm_put_guest_fpu() without a preceding kvm_load_guest_fpu() and corrupt the guest's FPU state (and possibly userspace's FPU state as well). Given that MPX is being removed from the kernel[*], fix the bug with the simple-but-ugly approach of loading the guest's FPU during KVM_GET_MP_STATE. [*] See commit f240652b6032b ("x86/mpx: Remove MPX APIs"). Fixes: f775b13eedee2 ("x86,kvm: move qemu/guest FPU switching out to vcpu_run") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3051324f72d33..0af5cb637beaa 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8714,6 +8714,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { vcpu_load(vcpu); + if (kvm_mpx_supported()) + kvm_load_guest_fpu(vcpu); kvm_apic_accept_events(vcpu); if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED && @@ -8722,6 +8724,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, else mp_state->mp_state = vcpu->arch.mp_state; + if (kvm_mpx_supported()) + kvm_put_guest_fpu(vcpu); vcpu_put(vcpu); return 0; } From 95145c25a78cc0a9d3cbc75708abde432310c5a1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 9 Dec 2019 12:05:17 -0800 Subject: [PATCH 022/204] KVM: x86: Add a WARN on TIF_NEED_FPU_LOAD in kvm_load_guest_fpu() WARN once in kvm_load_guest_fpu() if TIF_NEED_FPU_LOAD is observed, as that would mean that KVM is corrupting userspace's FPU by saving unknown register state into arch.user_fpu. Add a comment to explain why KVM WARNs on TIF_NEED_FPU_LOAD instead of implementing logic similar to fpu__copy(). Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0af5cb637beaa..25aac4c81b124 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8476,6 +8476,13 @@ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { fpregs_lock(); + /* + * Reloading userspace's FPU is handled by kvm_arch_vcpu_load(), both + * for direct calls from userspace (via vcpu_load()) and if this task + * is preempted (via kvm_sched_in()) between vcpu_load() and now. + */ + WARN_ON_ONCE(test_thread_flag(TIF_NEED_FPU_LOAD)); + copy_fpregs_to_fpstate(vcpu->arch.user_fpu); /* PKRU is separately restored in kvm_x86_ops->run. */ __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state, From 736c291c9f36b07f8889c61764c28edce20e715d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Dec 2019 15:57:14 -0800 Subject: [PATCH 023/204] KVM: x86: Use gpa_t for cr2/gpa to fix TDP support on 32-bit KVM Convert a plethora of parameters and variables in the MMU and page fault flows from type gva_t to gpa_t to properly handle TDP on 32-bit KVM. Thanks to PSE and PAE paging, 32-bit kernels can access 64-bit physical addresses. When TDP is enabled, the fault address is a guest physical address and thus can be a 64-bit value, even when both KVM and its guest are using 32-bit virtual addressing, e.g. 
VMX's VMCS.GUEST_PHYSICAL is a 64-bit field, not a natural width field. Using a gva_t for the fault address means KVM will incorrectly drop the upper 32-bits of the GPA. Ditto for gva_to_gpa() when it is used to translate L2 GPAs to L1 GPAs. Opportunistically rename variables and parameters to better reflect the dual address modes, e.g. use "cr2_or_gpa" for fault addresses and plain "addr" instead of "vaddr" when the address may be either a GVA or an L2 GPA. Similarly, use "gpa" in the nonpaging_page_fault() flows to avoid a confusing "gpa_t gva" declaration; this also sets the stage for a future patch to combine nonpaging_page_fault() and tdp_page_fault() with minimal churn. Sprinkle in a few comments to document flows where an address is known to be a GVA and thus can be safely truncated to a 32-bit value. Add WARNs in kvm_handle_page_fault() and FNAME(gva_to_gpa_nested)() to help document such cases and detect bugs. Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 8 ++-- arch/x86/kvm/mmu/mmu.c | 69 +++++++++++++++++++-------------- arch/x86/kvm/mmu/paging_tmpl.h | 25 +++++++----- arch/x86/kvm/mmutrace.h | 12 +++--- arch/x86/kvm/x86.c | 40 +++++++++---------- arch/x86/kvm/x86.h | 2 +- include/linux/kvm_host.h | 6 +-- virt/kvm/async_pf.c | 10 ++--- 8 files changed, 94 insertions(+), 78 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 2893eae5df9f4..159a28512e4c3 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -378,12 +378,12 @@ struct kvm_mmu { void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root); unsigned long (*get_cr3)(struct kvm_vcpu *vcpu); u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index); - int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err, + int (*page_fault)(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 err, bool prefault); void (*inject_page_fault)(struct kvm_vcpu *vcpu, struct x86_exception *fault); - gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, - struct x86_exception *exception); + gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gpa_t gva_or_gpa, + u32 access, struct x86_exception *exception); gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, struct x86_exception *exception); int (*sync_page)(struct kvm_vcpu *vcpu, @@ -1473,7 +1473,7 @@ void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu); int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); -int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u64 error_code, +int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, void *insn, int insn_len); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index c19f3ccaace31..2cb199817837b 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3532,7 +3532,7 @@ static bool is_access_allowed(u32 fault_err_code, u64 spte) * - true: let the vcpu to access on the same address again. * - false: let the real page fault path to fix it.
*/ -static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, +static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int level, u32 error_code) { struct kvm_shadow_walk_iterator iterator; @@ -3552,7 +3552,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, do { u64 new_spte; - for_each_shadow_entry_lockless(vcpu, gva, iterator, spte) + for_each_shadow_entry_lockless(vcpu, cr2_or_gpa, iterator, spte) if (!is_shadow_present_pte(spte) || iterator.level < level) break; @@ -3630,7 +3630,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, } while (true); - trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep, + trace_fast_page_fault(vcpu, cr2_or_gpa, error_code, iterator.sptep, spte, fault_handled); walk_shadow_page_lockless_end(vcpu); @@ -3638,10 +3638,11 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, } static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, - gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable); + gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write, + bool *writable); static int make_mmu_pages_available(struct kvm_vcpu *vcpu); -static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, +static int nonpaging_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, gfn_t gfn, bool prefault) { int r; @@ -3667,16 +3668,16 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); } - if (fast_page_fault(vcpu, v, level, error_code)) + if (fast_page_fault(vcpu, gpa, level, error_code)) return RET_PF_RETRY; mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable)) + if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) return RET_PF_RETRY; - if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r)) + if (handle_abnormal_pfn(vcpu, gpa, gfn, pfn, ACC_ALL, &r)) return r; r = RET_PF_RETRY; @@ -3687,7 +3688,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, goto out_unlock; if (likely(!force_pt_level)) transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); - r = __direct_map(vcpu, v, write, map_writable, level, pfn, + r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault, false); out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); @@ -3985,7 +3986,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots); -static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, +static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gpa_t vaddr, u32 access, struct x86_exception *exception) { if (exception) @@ -3993,7 +3994,7 @@ static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, return vaddr; } -static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr, +static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gpa_t vaddr, u32 access, struct x86_exception *exception) { @@ -4153,13 +4154,14 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr) walk_shadow_page_lockless_end(vcpu); } -static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, +static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, bool prefault) { - gfn_t gfn = gva >> PAGE_SHIFT; + gfn_t gfn = gpa >> PAGE_SHIFT; int r; - pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); + /* Note, paging is disabled, ergo gva == gpa. 
*/ + pgprintk("%s: gva %lx error %x\n", __func__, gpa, error_code); if (page_fault_handle_page_track(vcpu, error_code, gfn)) return RET_PF_EMULATE; @@ -4171,11 +4173,12 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)); - return nonpaging_map(vcpu, gva & PAGE_MASK, + return nonpaging_map(vcpu, gpa & PAGE_MASK, error_code, gfn, prefault); } -static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) +static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + gfn_t gfn) { struct kvm_arch_async_pf arch; @@ -4184,11 +4187,13 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) arch.direct_map = vcpu->arch.mmu->direct_map; arch.cr3 = vcpu->arch.mmu->get_cr3(vcpu); - return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); + return kvm_setup_async_pf(vcpu, cr2_or_gpa, + kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); } static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, - gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable) + gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write, + bool *writable) { struct kvm_memory_slot *slot; bool async; @@ -4208,12 +4213,12 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, return false; /* *pfn has correct page already */ if (!prefault && kvm_can_do_async_pf(vcpu)) { - trace_kvm_try_async_get_page(gva, gfn); + trace_kvm_try_async_get_page(cr2_or_gpa, gfn); if (kvm_find_async_pf_gfn(vcpu, gfn)) { - trace_kvm_async_pf_doublefault(gva, gfn); + trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn); kvm_make_request(KVM_REQ_APF_HALT, vcpu); return true; - } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn)) + } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn)) return true; } @@ -4226,6 +4231,12 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, { int r = 1; +#ifndef CONFIG_X86_64 + /* A 64-bit CR2 should be impossible on 32-bit KVM. */ + if (WARN_ON_ONCE(fault_address >> 32)) + return -EFAULT; +#endif + vcpu->arch.l1tf_flush_l1d = true; switch (vcpu->arch.apf.host_apf_reason) { default: @@ -4263,7 +4274,7 @@ check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level) return kvm_mtrr_check_gfn_range_consistency(vcpu, gfn, page_num); } -static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, +static int tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, bool prefault) { kvm_pfn_t pfn; @@ -5520,7 +5531,7 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu) return 0; } -int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, +int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, void *insn, int insn_len) { int r, emulation_type = 0; @@ -5529,18 +5540,18 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, /* With shadow page tables, fault_address contains a GVA or nGPA. 
*/ if (vcpu->arch.mmu->direct_map) { vcpu->arch.gpa_available = true; - vcpu->arch.gpa_val = cr2; + vcpu->arch.gpa_val = cr2_or_gpa; } r = RET_PF_INVALID; if (unlikely(error_code & PFERR_RSVD_MASK)) { - r = handle_mmio_page_fault(vcpu, cr2, direct); + r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct); if (r == RET_PF_EMULATE) goto emulate; } if (r == RET_PF_INVALID) { - r = vcpu->arch.mmu->page_fault(vcpu, cr2, + r = vcpu->arch.mmu->page_fault(vcpu, cr2_or_gpa, lower_32_bits(error_code), false); WARN_ON(r == RET_PF_INVALID); @@ -5560,7 +5571,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, */ if (vcpu->arch.mmu->direct_map && (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) { - kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2)); + kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa)); return 1; } @@ -5575,7 +5586,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, * explicitly shadowing L1's page tables, i.e. unprotecting something * for L1 isn't going to magically fix whatever issue cause L2 to fail. */ - if (!mmio_info_in_cache(vcpu, cr2, direct) && !is_guest_mode(vcpu)) + if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu)) emulation_type = EMULTYPE_ALLOW_RETRY; emulate: /* @@ -5590,7 +5601,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, return 1; } - return x86_emulate_instruction(vcpu, cr2, emulation_type, insn, + return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn, insn_len); } EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 97b21e7fd013d..c1d7b866a03fa 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -291,11 +291,11 @@ static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte) } /* - * Fetch a guest pte for a guest virtual address + * Fetch a guest pte for a guest virtual address, or for an L2's GPA. */ static int FNAME(walk_addr_generic)(struct guest_walker *walker, struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - gva_t addr, u32 access) + gpa_t addr, u32 access) { int ret; pt_element_t pte; @@ -496,7 +496,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, } static int FNAME(walk_addr)(struct guest_walker *walker, - struct kvm_vcpu *vcpu, gva_t addr, u32 access) + struct kvm_vcpu *vcpu, gpa_t addr, u32 access) { return FNAME(walk_addr_generic)(walker, vcpu, vcpu->arch.mmu, addr, access); @@ -611,7 +611,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, * If the guest tries to write a write-protected page, we need to * emulate this operation, return 1 to indicate this case. */ -static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, +static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, struct guest_walker *gw, int write_fault, int hlevel, kvm_pfn_t pfn, bool map_writable, bool prefault, @@ -765,7 +765,7 @@ FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu, * Returns: 1 if we need to emulate the instruction, 0 otherwise, or * a negative value on error. 
*/ -static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, +static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, bool prefault) { int write_fault = error_code & PFERR_WRITE_MASK; @@ -945,18 +945,19 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) spin_unlock(&vcpu->kvm->mmu_lock); } -static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, +/* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */ +static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gpa_t addr, u32 access, struct x86_exception *exception) { struct guest_walker walker; gpa_t gpa = UNMAPPED_GVA; int r; - r = FNAME(walk_addr)(&walker, vcpu, vaddr, access); + r = FNAME(walk_addr)(&walker, vcpu, addr, access); if (r) { gpa = gfn_to_gpa(walker.gfn); - gpa |= vaddr & ~PAGE_MASK; + gpa |= addr & ~PAGE_MASK; } else if (exception) *exception = walker.fault; @@ -964,7 +965,8 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, } #if PTTYPE != PTTYPE_EPT -static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, +/* Note, gva_to_gpa_nested() is only used to translate L2 GVAs. */ +static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gpa_t vaddr, u32 access, struct x86_exception *exception) { @@ -972,6 +974,11 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, gpa_t gpa = UNMAPPED_GVA; int r; +#ifndef CONFIG_X86_64 + /* A 64-bit GVA should be impossible on 32-bit KVM. */ + WARN_ON_ONCE(vaddr >> 32); +#endif + r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access); if (r) { diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index 7ca8831c7d1a2..3c6522b84ff11 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -249,13 +249,13 @@ TRACE_EVENT( TRACE_EVENT( fast_page_fault, - TP_PROTO(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, + TP_PROTO(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 error_code, u64 *sptep, u64 old_spte, bool retry), - TP_ARGS(vcpu, gva, error_code, sptep, old_spte, retry), + TP_ARGS(vcpu, cr2_or_gpa, error_code, sptep, old_spte, retry), TP_STRUCT__entry( __field(int, vcpu_id) - __field(gva_t, gva) + __field(gpa_t, cr2_or_gpa) __field(u32, error_code) __field(u64 *, sptep) __field(u64, old_spte) @@ -265,7 +265,7 @@ TRACE_EVENT( TP_fast_assign( __entry->vcpu_id = vcpu->vcpu_id; - __entry->gva = gva; + __entry->cr2_or_gpa = cr2_or_gpa; __entry->error_code = error_code; __entry->sptep = sptep; __entry->old_spte = old_spte; @@ -273,9 +273,9 @@ TRACE_EVENT( __entry->retry = retry; ), - TP_printk("vcpu %d gva %lx error_code %s sptep %p old %#llx" + TP_printk("vcpu %d gva %llx error_code %s sptep %p old %#llx" " new %llx spurious %d fixed %d", __entry->vcpu_id, - __entry->gva, __print_flags(__entry->error_code, "|", + __entry->cr2_or_gpa, __print_flags(__entry->error_code, "|", kvm_mmu_trace_pferr_flags), __entry->sptep, __entry->old_spte, __entry->new_spte, __spte_satisfied(old_spte), __spte_satisfied(new_spte) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 25aac4c81b124..93bbbce67a03e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6379,11 +6379,11 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) return 1; } -static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2, +static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, bool write_fault_to_shadow_pgtable, int emulation_type) { - gpa_t gpa = cr2; + 
gpa_t gpa = cr2_or_gpa; kvm_pfn_t pfn; if (!(emulation_type & EMULTYPE_ALLOW_RETRY)) @@ -6397,7 +6397,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2, * Write permission should be allowed since only * write access need to be emulated. */ - gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL); + gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); /* * If the mapping is invalid in guest, let cpu retry @@ -6454,10 +6454,10 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2, } static bool retry_instruction(struct x86_emulate_ctxt *ctxt, - unsigned long cr2, int emulation_type) + gpa_t cr2_or_gpa, int emulation_type) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - unsigned long last_retry_eip, last_retry_addr, gpa = cr2; + unsigned long last_retry_eip, last_retry_addr, gpa = cr2_or_gpa; last_retry_eip = vcpu->arch.last_retry_eip; last_retry_addr = vcpu->arch.last_retry_addr; @@ -6486,14 +6486,14 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, if (x86_page_table_writing_insn(ctxt)) return false; - if (ctxt->eip == last_retry_eip && last_retry_addr == cr2) + if (ctxt->eip == last_retry_eip && last_retry_addr == cr2_or_gpa) return false; vcpu->arch.last_retry_eip = ctxt->eip; - vcpu->arch.last_retry_addr = cr2; + vcpu->arch.last_retry_addr = cr2_or_gpa; if (!vcpu->arch.mmu->direct_map) - gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL); + gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); @@ -6639,11 +6639,8 @@ static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt) return false; } -int x86_emulate_instruction(struct kvm_vcpu *vcpu, - unsigned long cr2, - int emulation_type, - void *insn, - int insn_len) +int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + int emulation_type, void *insn, int insn_len) { int r; struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; @@ -6689,8 +6686,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, kvm_queue_exception(vcpu, UD_VECTOR); return 1; } - if (reexecute_instruction(vcpu, cr2, write_fault_to_spt, - emulation_type)) + if (reexecute_instruction(vcpu, cr2_or_gpa, + write_fault_to_spt, + emulation_type)) return 1; if (ctxt->have_exception) { /* @@ -6724,7 +6722,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, return 1; } - if (retry_instruction(ctxt, cr2, emulation_type)) + if (retry_instruction(ctxt, cr2_or_gpa, emulation_type)) return 1; /* this is needed for vmware backdoor interface to work since it @@ -6736,7 +6734,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, restart: /* Save the faulting GPA (cr2) in the address field */ - ctxt->exception.address = cr2; + ctxt->exception.address = cr2_or_gpa; r = x86_emulate_insn(ctxt); @@ -6744,7 +6742,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, return 1; if (r == EMULATION_FAILED) { - if (reexecute_instruction(vcpu, cr2, write_fault_to_spt, + if (reexecute_instruction(vcpu, cr2_or_gpa, write_fault_to_spt, emulation_type)) return 1; @@ -10025,7 +10023,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) work->arch.cr3 != vcpu->arch.mmu->get_cr3(vcpu)) return; - vcpu->arch.mmu->page_fault(vcpu, work->gva, 0, true); + vcpu->arch.mmu->page_fault(vcpu, work->cr2_or_gpa, 0, true); } static inline u32 kvm_async_pf_hash_fn(gfn_t gfn) @@ -10138,7 +10136,7 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, { struct x86_exception fault; - trace_kvm_async_pf_not_present(work->arch.token, work->gva); + 
trace_kvm_async_pf_not_present(work->arch.token, work->cr2_or_gpa); kvm_add_async_pf_gfn(vcpu, work->arch.gfn); if (kvm_can_deliver_async_pf(vcpu) && @@ -10173,7 +10171,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, work->arch.token = ~0; /* broadcast wakeup */ else kvm_del_async_pf_gfn(vcpu, work->arch.gfn); - trace_kvm_async_pf_ready(work->arch.token, work->gva); + trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa); if (vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED && !apf_get_user(vcpu, &val)) { diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 29391af8871d1..cab5e71f0f0fd 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -289,7 +289,7 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int page_num); bool kvm_vector_hashing_enabled(void); -int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, +int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int emulation_type, void *insn, int insn_len); #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 0d632a75fce95..528ab7a814ab4 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -204,7 +204,7 @@ struct kvm_async_pf { struct list_head queue; struct kvm_vcpu *vcpu; struct mm_struct *mm; - gva_t gva; + gpa_t cr2_or_gpa; unsigned long addr; struct kvm_arch_async_pf arch; bool wakeup_all; @@ -212,8 +212,8 @@ struct kvm_async_pf { void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu); void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu); -int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva, - struct kvm_arch_async_pf *arch); +int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + unsigned long hva, struct kvm_arch_async_pf *arch); int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); #endif diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 35305d6e68cc6..d8ef708a2ef67 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -64,7 +64,7 @@ static void async_pf_execute(struct work_struct *work) struct mm_struct *mm = apf->mm; struct kvm_vcpu *vcpu = apf->vcpu; unsigned long addr = apf->addr; - gva_t gva = apf->gva; + gpa_t cr2_or_gpa = apf->cr2_or_gpa; int locked = 1; might_sleep(); @@ -92,7 +92,7 @@ static void async_pf_execute(struct work_struct *work) * this point */ - trace_kvm_async_pf_completed(addr, gva); + trace_kvm_async_pf_completed(addr, cr2_or_gpa); if (swq_has_sleeper(&vcpu->wq)) swake_up_one(&vcpu->wq); @@ -165,8 +165,8 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu) } } -int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva, - struct kvm_arch_async_pf *arch) +int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + unsigned long hva, struct kvm_arch_async_pf *arch) { struct kvm_async_pf *work; @@ -185,7 +185,7 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva, work->wakeup_all = false; work->vcpu = vcpu; - work->gva = gva; + work->cr2_or_gpa = cr2_or_gpa; work->addr = hva; work->arch = *arch; work->mm = current->mm; From ba7888dde6afc32885a0960f11b898ff97d4a060 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Dec 2019 15:57:15 -0800 Subject: [PATCH 024/204] KVM: x86/mmu: Move definition of make_mmu_pages_available() up Move make_mmu_pages_available() above its first user to put it closer to related code and eliminate a 
forward declaration. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 2cb199817837b..638618c384d0b 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2903,6 +2903,26 @@ static bool prepare_zap_oldest_mmu_page(struct kvm *kvm, return kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); } +static int make_mmu_pages_available(struct kvm_vcpu *vcpu) +{ + LIST_HEAD(invalid_list); + + if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES)) + return 0; + + while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) { + if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list)) + break; + + ++vcpu->kvm->stat.mmu_recycled; + } + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); + + if (!kvm_mmu_available_pages(vcpu->kvm)) + return -ENOSPC; + return 0; +} + /* * Changing the number of mmu pages allocated to the vm * Note: if goal_nr_mmu_pages is too small, you will get dead lock @@ -3640,7 +3660,6 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int level, static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write, bool *writable); -static int make_mmu_pages_available(struct kvm_vcpu *vcpu); static int nonpaging_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, gfn_t gfn, bool prefault) @@ -5511,26 +5530,6 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) } EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); -static int make_mmu_pages_available(struct kvm_vcpu *vcpu) -{ - LIST_HEAD(invalid_list); - - if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES)) - return 0; - - while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) { - if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list)) - break; - - ++vcpu->kvm->stat.mmu_recycled; - } - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); - - if (!kvm_mmu_available_pages(vcpu->kvm)) - return -ENOSPC; - return 0; -} - int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, void *insn, int insn_len) { From 367fd790b17dbe2847c29099cf9902ded207901c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Dec 2019 15:57:16 -0800 Subject: [PATCH 025/204] KVM: x86/mmu: Fold nonpaging_map() into nonpaging_page_fault() Fold nonpaging_map() into its sole caller, nonpaging_page_fault(), in preparation for combining the bulk of nonpaging_page_fault() and tdp_page_fault() into a common helper. No functional change intended. 
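In the abstract, the fold is just inlining a single-use helper at its one call site; a toy before/after sketch (names are made up, this is not the MMU code):

/* Before: a helper with exactly one caller. */
static int double_it(int x)
{
        return x * 2;
}

static int compute(int x)
{
        return double_it(x) + 1;
}

/* After: the helper's body is folded into its sole caller. */
static int compute(int x)
{
        return (x * 2) + 1;
}

Folding first keeps the later consolidation diff small, since the shared logic then lives directly in the two handlers being merged.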
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 106 +++++++++++++++++++---------------------- 1 file changed, 49 insertions(+), 57 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 638618c384d0b..a1d1fc21aa8d3 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3661,60 +3661,6 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write, bool *writable); -static int nonpaging_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, - gfn_t gfn, bool prefault) -{ - int r; - int level; - bool force_pt_level; - kvm_pfn_t pfn; - unsigned long mmu_seq; - bool map_writable, write = error_code & PFERR_WRITE_MASK; - bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && - is_nx_huge_page_enabled(); - - force_pt_level = lpage_disallowed; - level = mapping_level(vcpu, gfn, &force_pt_level); - if (likely(!force_pt_level)) { - /* - * This path builds a PAE pagetable - so we can map - * 2mb pages at maximum. Therefore check if the level - * is larger than that. - */ - if (level > PT_DIRECTORY_LEVEL) - level = PT_DIRECTORY_LEVEL; - - gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); - } - - if (fast_page_fault(vcpu, gpa, level, error_code)) - return RET_PF_RETRY; - - mmu_seq = vcpu->kvm->mmu_notifier_seq; - smp_rmb(); - - if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) - return RET_PF_RETRY; - - if (handle_abnormal_pfn(vcpu, gpa, gfn, pfn, ACC_ALL, &r)) - return r; - - r = RET_PF_RETRY; - spin_lock(&vcpu->kvm->mmu_lock); - if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) - goto out_unlock; - if (make_mmu_pages_available(vcpu) < 0) - goto out_unlock; - if (likely(!force_pt_level)) - transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); - r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, - prefault, false); -out_unlock: - spin_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(pfn); - return r; -} - static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, struct list_head *invalid_list) { @@ -4176,12 +4122,21 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr) static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, bool prefault) { - gfn_t gfn = gpa >> PAGE_SHIFT; int r; + int level; + kvm_pfn_t pfn; + unsigned long mmu_seq; + gfn_t gfn = gpa >> PAGE_SHIFT; + bool write = error_code & PFERR_WRITE_MASK; + bool force_pt_level, map_writable; + bool exec = error_code & PFERR_FETCH_MASK; + bool lpage_disallowed = exec && is_nx_huge_page_enabled(); /* Note, paging is disabled, ergo gva == gpa. */ pgprintk("%s: gva %lx error %x\n", __func__, gpa, error_code); + gpa &= PAGE_MASK; + if (page_fault_handle_page_track(vcpu, error_code, gfn)) return RET_PF_EMULATE; @@ -4191,9 +4146,46 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)); + force_pt_level = lpage_disallowed; + level = mapping_level(vcpu, gfn, &force_pt_level); + if (likely(!force_pt_level)) { + /* + * This path builds a PAE pagetable - so we can map + * 2mb pages at maximum. Therefore check if the level + * is larger than that. 
+ */ + if (level > PT_DIRECTORY_LEVEL) + level = PT_DIRECTORY_LEVEL; + + gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); + } + + if (fast_page_fault(vcpu, gpa, level, error_code)) + return RET_PF_RETRY; + + mmu_seq = vcpu->kvm->mmu_notifier_seq; + smp_rmb(); + + if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) + return RET_PF_RETRY; + + if (handle_abnormal_pfn(vcpu, gpa, gfn, pfn, ACC_ALL, &r)) + return r; - return nonpaging_map(vcpu, gpa & PAGE_MASK, - error_code, gfn, prefault); + r = RET_PF_RETRY; + spin_lock(&vcpu->kvm->mmu_lock); + if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) + goto out_unlock; + if (make_mmu_pages_available(vcpu) < 0) + goto out_unlock; + if (likely(!force_pt_level)) + transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); + r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, + prefault, false); +out_unlock: + spin_unlock(&vcpu->kvm->mmu_lock); + kvm_release_pfn_clean(pfn); + return r; } static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, From 9f1a8526fbe3e82afae565fd008f2ca2035662e8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Dec 2019 15:57:17 -0800 Subject: [PATCH 026/204] KVM: x86/mmu: Move nonpaging_page_fault() below try_async_pf() Move nonpaging_page_fault() below try_async_pf() to eliminate the forward declaration of try_async_pf() and to prepare for combining the bulk of nonpaging_page_fault() and tdp_page_fault() into a common helper. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 102 ++++++++++++++++++++--------------------- 1 file changed, 49 insertions(+), 53 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index a1d1fc21aa8d3..b3633067143e0 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3657,10 +3657,6 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int level, return fault_handled; } -static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, - gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write, - bool *writable); - static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, struct list_head *invalid_list) { @@ -4119,6 +4115,55 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr) walk_shadow_page_lockless_end(vcpu); } +static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + gfn_t gfn) +{ + struct kvm_arch_async_pf arch; + + arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id; + arch.gfn = gfn; + arch.direct_map = vcpu->arch.mmu->direct_map; + arch.cr3 = vcpu->arch.mmu->get_cr3(vcpu); + + return kvm_setup_async_pf(vcpu, cr2_or_gpa, + kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); +} + +static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, + gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write, + bool *writable) +{ + struct kvm_memory_slot *slot; + bool async; + + /* + * Don't expose private memslots to L2. 
+ */ + if (is_guest_mode(vcpu) && !kvm_is_visible_gfn(vcpu->kvm, gfn)) { + *pfn = KVM_PFN_NOSLOT; + return false; + } + + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + async = false; + *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable); + if (!async) + return false; /* *pfn has correct page already */ + + if (!prefault && kvm_can_do_async_pf(vcpu)) { + trace_kvm_try_async_get_page(cr2_or_gpa, gfn); + if (kvm_find_async_pf_gfn(vcpu, gfn)) { + trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn); + kvm_make_request(KVM_REQ_APF_HALT, vcpu); + return true; + } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn)) + return true; + } + + *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable); + return false; +} + static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, bool prefault) { @@ -4188,55 +4233,6 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, return r; } -static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - gfn_t gfn) -{ - struct kvm_arch_async_pf arch; - - arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id; - arch.gfn = gfn; - arch.direct_map = vcpu->arch.mmu->direct_map; - arch.cr3 = vcpu->arch.mmu->get_cr3(vcpu); - - return kvm_setup_async_pf(vcpu, cr2_or_gpa, - kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); -} - -static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, - gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write, - bool *writable) -{ - struct kvm_memory_slot *slot; - bool async; - - /* - * Don't expose private memslots to L2. - */ - if (is_guest_mode(vcpu) && !kvm_is_visible_gfn(vcpu->kvm, gfn)) { - *pfn = KVM_PFN_NOSLOT; - return false; - } - - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - async = false; - *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable); - if (!async) - return false; /* *pfn has correct page already */ - - if (!prefault && kvm_can_do_async_pf(vcpu)) { - trace_kvm_try_async_get_page(cr2_or_gpa, gfn); - if (kvm_find_async_pf_gfn(vcpu, gfn)) { - trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn); - kvm_make_request(KVM_REQ_APF_HALT, vcpu); - return true; - } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn)) - return true; - } - - *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable); - return false; -} - int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, u64 fault_address, char *insn, int insn_len) { From cb9b88c669396e3e5f957fe909ff901b51321013 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Dec 2019 15:57:18 -0800 Subject: [PATCH 027/204] KVM: x86/mmu: Refactor handling of cache consistency with TDP Pre-calculate the max level for a TDP page with respect to MTRR cache consistency in preparation of replacing force_pt_level with max_level, and eventually combining the bulk of nonpaging_page_fault() and tdp_page_fault() into a common helper. No functional change intended. 
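The pre-calculation is a largest-to-smallest walk that stops at the first level whose entire range is MTRR-consistent; a rough sketch, where range_is_consistent() is a hypothetical stand-in for kvm_mtrr_check_gfn_range_consistency():

static int max_mtrr_level(struct kvm_vcpu *vcpu, gfn_t gfn)
{
        int max_level;

        for (max_level = PT_MAX_HUGEPAGE_LEVEL;
             max_level > PT_PAGE_TABLE_LEVEL;
             max_level--) {
                int page_num = KVM_PAGES_PER_HPAGE(max_level);
                gfn_t base = gfn & ~(page_num - 1);

                /* Take the largest level whose range has one memtype. */
                if (range_is_consistent(vcpu, base, page_num))
                        break;
        }
        return max_level;
}

If no huge level is consistent, the loop falls through to PT_PAGE_TABLE_LEVEL, i.e. the fault is forced to a 4k mapping.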
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index b3633067143e0..defe94ecd0a45 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4271,16 +4271,6 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, } EXPORT_SYMBOL_GPL(kvm_handle_page_fault); -static bool -check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level) -{ - int page_num = KVM_PAGES_PER_HPAGE(level); - - gfn &= ~(page_num - 1); - - return kvm_mtrr_check_gfn_range_consistency(vcpu, gfn, page_num); -} - static int tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, bool prefault) { @@ -4294,6 +4284,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, bool map_writable; bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && is_nx_huge_page_enabled(); + int max_level; MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)); @@ -4304,14 +4295,21 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (r) return r; - force_pt_level = - lpage_disallowed || - !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL); + for (max_level = PT_MAX_HUGEPAGE_LEVEL; + max_level > PT_PAGE_TABLE_LEVEL; + max_level--) { + int page_num = KVM_PAGES_PER_HPAGE(max_level); + gfn_t base = gfn & ~(page_num - 1); + + if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num)) + break; + } + + force_pt_level = lpage_disallowed || max_level == PT_PAGE_TABLE_LEVEL; level = mapping_level(vcpu, gfn, &force_pt_level); if (likely(!force_pt_level)) { - if (level > PT_DIRECTORY_LEVEL && - !check_hugepage_cache_consistency(vcpu, gfn, level)) - level = PT_DIRECTORY_LEVEL; + if (level > max_level) + level = max_level; gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); } From f0f37e229c0517fa0d8bda73a2aeee28260370a2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Dec 2019 15:57:19 -0800 Subject: [PATCH 028/204] KVM: x86/mmu: Refactor the per-slot level calculation in mapping_level() Invert the loop which adjusts the allowed page level based on what's compatible with the associated memslot to use a largest-to-smallest page size walk. This paves the way for passing around a "max level" variable instead of having redundant checks and/or multiple booleans. No functional change intended. 
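Schematically, the inversion looks like this, with level_is_disallowed() standing in for __mmu_gfn_lpage_is_disallowed() and min_level for PT_PAGE_TABLE_LEVEL:

/* Before: ascend until a level is disallowed, use the previous one. */
static int pick_level_ascending(int min_level, int max_level)
{
        int level;

        for (level = min_level + 1; level <= max_level; ++level)
                if (level_is_disallowed(level))
                        break;
        return level - 1;
}

/* After: descend and take the first level that is not disallowed. */
static int pick_level_descending(int min_level, int max_level)
{
        for ( ; max_level > min_level; max_level--)
                if (!level_is_disallowed(max_level))
                        break;
        return max_level;
}

Both return min_level when every huge level is disallowed, but the descending form makes it natural to pass a single max_level around instead of a force-4k boolean.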
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index defe94ecd0a45..8db2bb0508098 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1330,7 +1330,7 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, bool *force_pt_level) { - int host_level, level, max_level; + int host_level, max_level; struct kvm_memory_slot *slot; if (unlikely(*force_pt_level)) @@ -1347,12 +1347,12 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, return host_level; max_level = min(kvm_x86_ops->get_lpage_level(), host_level); - - for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level) - if (__mmu_gfn_lpage_is_disallowed(large_gfn, level, slot)) + for ( ; max_level > PT_PAGE_TABLE_LEVEL; max_level--) { + if (!__mmu_gfn_lpage_is_disallowed(large_gfn, max_level, slot)) break; + } - return level - 1; + return max_level; } /* From 39ca1ecb784b29965fd780bed1e8a3792a086a29 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Dec 2019 15:57:20 -0800 Subject: [PATCH 029/204] KVM: x86/mmu: Refactor handling of forced 4k pages in page faults Refactor the page fault handlers and mapping_level() to track the max allowed page level instead of only tracking if a 4k page is mandatory due to one restriction or another. This paves the way for cleanly consolidating tdp_page_fault() and nonpaging_page_fault(), and for eliminating a redundant check on mmu_gfn_lpage_is_disallowed(). No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 45 ++++++++++++++-------------------- arch/x86/kvm/mmu/paging_tmpl.h | 16 +++++++----- 2 files changed, 29 insertions(+), 32 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 8db2bb0508098..daf41806243f4 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1328,18 +1328,19 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, } static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, - bool *force_pt_level) + int *max_levelp) { - int host_level, max_level; + int host_level, max_level = *max_levelp; struct kvm_memory_slot *slot; - if (unlikely(*force_pt_level)) + if (unlikely(max_level == PT_PAGE_TABLE_LEVEL)) return PT_PAGE_TABLE_LEVEL; slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn); - *force_pt_level = !memslot_valid_for_gpte(slot, true); - if (unlikely(*force_pt_level)) + if (!memslot_valid_for_gpte(slot, true)) { + *max_levelp = PT_PAGE_TABLE_LEVEL; return PT_PAGE_TABLE_LEVEL; + } host_level = host_mapping_level(vcpu->kvm, large_gfn); @@ -4173,9 +4174,10 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long mmu_seq; gfn_t gfn = gpa >> PAGE_SHIFT; bool write = error_code & PFERR_WRITE_MASK; - bool force_pt_level, map_writable; + bool map_writable; bool exec = error_code & PFERR_FETCH_MASK; bool lpage_disallowed = exec && is_nx_huge_page_enabled(); + int max_level; /* Note, paging is disabled, ergo gva == gpa. 
*/ pgprintk("%s: gva %lx error %x\n", __func__, gpa, error_code); @@ -4191,19 +4193,12 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)); - force_pt_level = lpage_disallowed; - level = mapping_level(vcpu, gfn, &force_pt_level); - if (likely(!force_pt_level)) { - /* - * This path builds a PAE pagetable - so we can map - * 2mb pages at maximum. Therefore check if the level - * is larger than that. - */ - if (level > PT_DIRECTORY_LEVEL) - level = PT_DIRECTORY_LEVEL; + /* This path builds a PAE pagetable, we can map 2mb pages at maximum. */ + max_level = lpage_disallowed ? PT_PAGE_TABLE_LEVEL : PT_DIRECTORY_LEVEL; + level = mapping_level(vcpu, gfn, &max_level); + if (level > PT_PAGE_TABLE_LEVEL) gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); - } if (fast_page_fault(vcpu, gpa, level, error_code)) return RET_PF_RETRY; @@ -4223,7 +4218,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, goto out_unlock; if (make_mmu_pages_available(vcpu) < 0) goto out_unlock; - if (likely(!force_pt_level)) + if (likely(max_level > PT_PAGE_TABLE_LEVEL)) transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault, false); @@ -4277,7 +4272,6 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, kvm_pfn_t pfn; int r; int level; - bool force_pt_level; gfn_t gfn = gpa >> PAGE_SHIFT; unsigned long mmu_seq; int write = error_code & PFERR_WRITE_MASK; @@ -4305,13 +4299,12 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, break; } - force_pt_level = lpage_disallowed || max_level == PT_PAGE_TABLE_LEVEL; - level = mapping_level(vcpu, gfn, &force_pt_level); - if (likely(!force_pt_level)) { - if (level > max_level) - level = max_level; + if (lpage_disallowed) + max_level = PT_PAGE_TABLE_LEVEL; + + level = mapping_level(vcpu, gfn, &max_level); + if (level > PT_PAGE_TABLE_LEVEL) gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); - } if (fast_page_fault(vcpu, gpa, level, error_code)) return RET_PF_RETRY; @@ -4331,7 +4324,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, goto out_unlock; if (make_mmu_pages_available(vcpu) < 0) goto out_unlock; - if (likely(!force_pt_level)) + if (likely(max_level > PT_PAGE_TABLE_LEVEL)) transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault, lpage_disallowed); diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index c1d7b866a03fa..1938a6e4e6310 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -778,7 +778,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, bool map_writable, is_self_change_mapping; bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && is_nx_huge_page_enabled(); - bool force_pt_level = lpage_disallowed; + int max_level; pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); @@ -818,14 +818,18 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable); + max_level = lpage_disallowed ? 
PT_PAGE_TABLE_LEVEL : + PT_MAX_HUGEPAGE_LEVEL; + if (walker.level >= PT_DIRECTORY_LEVEL && !is_self_change_mapping) { - level = mapping_level(vcpu, walker.gfn, &force_pt_level); - if (likely(!force_pt_level)) { + level = mapping_level(vcpu, walker.gfn, &max_level); + if (likely(max_level > PT_DIRECTORY_LEVEL)) { level = min(walker.level, level); walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); } - } else - force_pt_level = true; + } else { + max_level = PT_PAGE_TABLE_LEVEL; + } mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); @@ -865,7 +869,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); if (make_mmu_pages_available(vcpu) < 0) goto out_unlock; - if (!force_pt_level) + if (max_level > PT_PAGE_TABLE_LEVEL) transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level); r = FNAME(fetch)(vcpu, addr, &walker, write_fault, level, pfn, map_writable, prefault, lpage_disallowed); From cbe1e6f035523b5fd29e44e18b82081b33d1f3f3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Dec 2019 15:57:21 -0800 Subject: [PATCH 030/204] KVM: x86/mmu: Incorporate guest's page level into max level for shadow MMU Restrict the max level for a shadow page based on the guest's level instead of capping the level after the fact for host-mapped huge pages, e.g. hugetlbfs pages. Explicitly capping the max level using the guest mapping level also eliminates FNAME(page_fault)'s subtle dependency on THP only supporting 2mb pages. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/paging_tmpl.h | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 1938a6e4e6310..7d57ec576df0c 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -773,7 +773,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, struct guest_walker walker; int r; kvm_pfn_t pfn; - int level = PT_PAGE_TABLE_LEVEL; + int level; unsigned long mmu_seq; bool map_writable, is_self_change_mapping; bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && @@ -818,18 +818,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable); - max_level = lpage_disallowed ? 
PT_PAGE_TABLE_LEVEL : PT_MAX_HUGEPAGE_LEVEL; - - if (walker.level >= PT_DIRECTORY_LEVEL && !is_self_change_mapping) { - level = mapping_level(vcpu, walker.gfn, &max_level); - if (likely(max_level > PT_DIRECTORY_LEVEL)) { - level = min(walker.level, level); - walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); - } - } else { + if (lpage_disallowed || is_self_change_mapping) max_level = PT_PAGE_TABLE_LEVEL; - } + else + max_level = walker.level; + + level = mapping_level(vcpu, walker.gfn, &max_level); + if (level > PT_PAGE_TABLE_LEVEL) + walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); From 2f57b7051fe8fa680b7c38c7e98094fa3ba3ba8b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Dec 2019 15:57:22 -0800 Subject: [PATCH 031/204] KVM: x86/mmu: Persist gfn_lpage_is_disallowed() to max_level Persist the max page level calculated via gfn_lpage_is_disallowed() to the max level "returned" by mapping_level() so that it's naturally taken into account by the max level check that conditions calling transparent_hugepage_adjust(). Drop the gfn_lpage_is_disallowed() check in thp_adjust() as it's now handled by mapping_level() and its callers. Add a comment to document the behavior of host_mapping_level() and its interaction with max level and transparent huge pages. Note, transferring the gfn_lpage_is_disallowed() check from thp_adjust() to mapping_level() superficially affects how changes to a memslot's disallow_lpage count will be handled due to thp_adjust() being run while holding mmu_lock. In the more common case where a different vCPU increments the count via account_shadowed(), gfn_lpage_is_disallowed() is rechecked by set_spte() to ensure a writable large page isn't created. In the less common case where the count is decremented to zero due to all shadow pages in the memslot being zapped, THP behavior now matches hugetlbfs behavior in the sense that a small page will be created when a large page could be used if the count reaches zero in the minuscule window between mapping_level() and acquiring mmu_lock. Lastly, the new THP behavior also follows hugetlbfs behavior in the absurdly unlikely scenario of a memslot being moved such that the memslot's compatibility with respect to large pages changes, but without changing the validity of the gfn->pfn walk. I.e. if a memslot is moved between mapping_level() and snapshotting mmu_seq, it's theoretically possible to consume a stale disallow_lpage count. But, since KVM zaps all shadow pages when moving a memslot and forces all vCPUs to reload a new MMU, the inserted spte will always be thrown away prior to completing the memslot move, i.e. whether or not the spte accurately reflects disallow_lpage is irrelevant.
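The plumbing is an in/out parameter: the helper clamps the caller's max level as a side effect, so the caller's later check gates the THP adjustment on the persisted value. A rough sketch, with lpage_is_disallowed() and host_level() as hypothetical stand-ins for __mmu_gfn_lpage_is_disallowed() and host_mapping_level():

static int pick_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn, int *max_levelp)
{
        int max_level = *max_levelp;

        for ( ; max_level > PT_PAGE_TABLE_LEVEL; max_level--) {
                if (!lpage_is_disallowed(vcpu, gfn, max_level))
                        break;
        }

        /* Persist the clamped level for the caller's thp_adjust() check. */
        *max_levelp = max_level;

        if (max_level == PT_PAGE_TABLE_LEVEL)
                return PT_PAGE_TABLE_LEVEL;

        /* host_level() reflects the vma's page size, never THP promotion. */
        return min(max_level, host_level(vcpu, gfn));
}

A caller that receives max_level back can then safely do:

        if (max_level > PT_PAGE_TABLE_LEVEL)
                transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);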
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index daf41806243f4..7cfacfe8548e9 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1330,7 +1330,7 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, int *max_levelp) { - int host_level, max_level = *max_levelp; + int max_level = *max_levelp; struct kvm_memory_slot *slot; if (unlikely(max_level == PT_PAGE_TABLE_LEVEL)) @@ -1342,18 +1342,27 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, return PT_PAGE_TABLE_LEVEL; } - host_level = host_mapping_level(vcpu->kvm, large_gfn); - - if (host_level == PT_PAGE_TABLE_LEVEL) - return host_level; - - max_level = min(kvm_x86_ops->get_lpage_level(), host_level); + max_level = min(max_level, kvm_x86_ops->get_lpage_level()); for ( ; max_level > PT_PAGE_TABLE_LEVEL; max_level--) { if (!__mmu_gfn_lpage_is_disallowed(large_gfn, max_level, slot)) break; } - return max_level; + *max_levelp = max_level; + + if (max_level == PT_PAGE_TABLE_LEVEL) + return PT_PAGE_TABLE_LEVEL; + + /* + * Note, host_mapping_level() does *not* handle transparent huge pages. + * As suggested by "mapping", it reflects the page size established by + * the associated vma, if there is one, i.e. host_mapping_level() will + * return a huge page level if and only if a vma exists and the backing + * implementation for the vma uses huge pages, e.g. hugetlbfs and dax. + * So, do not propagate host_mapping_level() to max_level as KVM can + * still promote the guest mapping to a huge page in the THP case. + */ + return host_mapping_level(vcpu->kvm, large_gfn); } /* @@ -3424,8 +3433,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, */ if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL && - PageTransCompoundMap(pfn_to_page(pfn)) && - !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) { + PageTransCompoundMap(pfn_to_page(pfn))) { unsigned long mask; /* * mmu_notifier_retry was successful and we hold the From 2cb70fd441b6b40ffd9ee1782a972f149ba72158 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Dec 2019 15:57:23 -0800 Subject: [PATCH 032/204] KVM: x86/mmu: Rename lpage_disallowed to account_disallowed_nx_lpage Rename __direct_map()'s param that controls whether or not a disallowed NX large page should be accounted to match what it actually does. The nonpaging_page_fault() case unconditionally passes %false for the param even though it locally sets lpage_disallowed. No functional change intended. 
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 7cfacfe8548e9..535598578647c 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3353,7 +3353,7 @@ static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it, static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, int map_writable, int level, kvm_pfn_t pfn, - bool prefault, bool lpage_disallowed) + bool prefault, bool account_disallowed_nx_lpage) { struct kvm_shadow_walk_iterator it; struct kvm_mmu_page *sp; @@ -3382,7 +3382,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, it.level - 1, true, ACC_ALL); link_shadow_page(vcpu, it.sptep, sp); - if (lpage_disallowed) + if (account_disallowed_nx_lpage) account_huge_nx_page(vcpu->kvm, sp); } } From 0f90e1c10dca5149529b4af6d94d7804ac2ef37b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Dec 2019 15:57:24 -0800 Subject: [PATCH 033/204] KVM: x86/mmu: Consolidate tdp_page_fault() and nonpaging_page_fault() Consolidate the direct MMU page fault handlers into a common helper, direct_page_fault(). Except for unique max level conditions, the tdp and nonpaging fault handlers are functionally identical. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 98 ++++++++++++------------------------------ 1 file changed, 27 insertions(+), 71 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 535598578647c..313f77a342624 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4173,24 +4173,20 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, return false; } -static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, - u32 error_code, bool prefault) +static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, + bool prefault, int max_level, bool is_tdp) { - int r; - int level; - kvm_pfn_t pfn; - unsigned long mmu_seq; - gfn_t gfn = gpa >> PAGE_SHIFT; bool write = error_code & PFERR_WRITE_MASK; - bool map_writable; bool exec = error_code & PFERR_FETCH_MASK; bool lpage_disallowed = exec && is_nx_huge_page_enabled(); - int max_level; + bool map_writable; - /* Note, paging is disabled, ergo gva == gpa. */ - pgprintk("%s: gva %lx error %x\n", __func__, gpa, error_code); + gfn_t gfn = gpa >> PAGE_SHIFT; + unsigned long mmu_seq; + kvm_pfn_t pfn; + int level, r; - gpa &= PAGE_MASK; + MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)); if (page_fault_handle_page_track(vcpu, error_code, gfn)) return RET_PF_EMULATE; @@ -4199,10 +4195,8 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, if (r) return r; - MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)); - - /* This path builds a PAE pagetable, we can map 2mb pages at maximum. */ - max_level = lpage_disallowed ? PT_PAGE_TABLE_LEVEL : PT_DIRECTORY_LEVEL; + if (lpage_disallowed) + max_level = PT_PAGE_TABLE_LEVEL; level = mapping_level(vcpu, gfn, &max_level); if (level > PT_PAGE_TABLE_LEVEL) @@ -4217,7 +4211,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) return RET_PF_RETRY; - if (handle_abnormal_pfn(vcpu, gpa, gfn, pfn, ACC_ALL, &r)) + if (handle_abnormal_pfn(vcpu, is_tdp ? 
0 : gpa, gfn, pfn, ACC_ALL, &r)) return r; r = RET_PF_RETRY; @@ -4228,14 +4222,25 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, goto out_unlock; if (likely(max_level > PT_PAGE_TABLE_LEVEL)) transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); - r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, - prefault, false); + r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault, + is_tdp && lpage_disallowed); + out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); return r; } +static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, + u32 error_code, bool prefault) +{ + pgprintk("%s: gva %lx error %x\n", __func__, gpa, error_code); + + /* This path builds a PAE pagetable, we can map 2mb pages at maximum. */ + return direct_page_fault(vcpu, gpa & PAGE_MASK, error_code, prefault, + PT_DIRECTORY_LEVEL, false); +} + int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, u64 fault_address, char *insn, int insn_len) { @@ -4277,69 +4282,20 @@ EXPORT_SYMBOL_GPL(kvm_handle_page_fault); static int tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, bool prefault) { - kvm_pfn_t pfn; - int r; - int level; - gfn_t gfn = gpa >> PAGE_SHIFT; - unsigned long mmu_seq; - int write = error_code & PFERR_WRITE_MASK; - bool map_writable; - bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && - is_nx_huge_page_enabled(); int max_level; - MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)); - - if (page_fault_handle_page_track(vcpu, error_code, gfn)) - return RET_PF_EMULATE; - - r = mmu_topup_memory_caches(vcpu); - if (r) - return r; - for (max_level = PT_MAX_HUGEPAGE_LEVEL; max_level > PT_PAGE_TABLE_LEVEL; max_level--) { int page_num = KVM_PAGES_PER_HPAGE(max_level); - gfn_t base = gfn & ~(page_num - 1); + gfn_t base = (gpa >> PAGE_SHIFT) & ~(page_num - 1); if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num)) break; } - if (lpage_disallowed) - max_level = PT_PAGE_TABLE_LEVEL; - - level = mapping_level(vcpu, gfn, &max_level); - if (level > PT_PAGE_TABLE_LEVEL) - gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); - - if (fast_page_fault(vcpu, gpa, level, error_code)) - return RET_PF_RETRY; - - mmu_seq = vcpu->kvm->mmu_notifier_seq; - smp_rmb(); - - if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) - return RET_PF_RETRY; - - if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r)) - return r; - - r = RET_PF_RETRY; - spin_lock(&vcpu->kvm->mmu_lock); - if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) - goto out_unlock; - if (make_mmu_pages_available(vcpu) < 0) - goto out_unlock; - if (likely(max_level > PT_PAGE_TABLE_LEVEL)) - transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); - r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, - prefault, lpage_disallowed); -out_unlock: - spin_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(pfn); - return r; + return direct_page_fault(vcpu, gpa, error_code, prefault, + max_level, true); } static void nonpaging_init_context(struct kvm_vcpu *vcpu, From 0885904d4ff7e2d926caf743537ddd411ff22bfa Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Dec 2019 15:57:25 -0800 Subject: [PATCH 034/204] KVM: x86/mmu: Move transparent_hugepage_adjust() above __direct_map() Move thp_adjust() above __direct_map() in preparation of calling thp_adjust() from __direct_map() and FNAME(fetch). No functional change intended. 
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 76 +++++++++++++++++++++--------------------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 313f77a342624..f8da3965c3e98 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3328,6 +3328,44 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) __direct_pte_prefetch(vcpu, sp, sptep); } +static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, + gfn_t gfn, kvm_pfn_t *pfnp, + int *levelp) +{ + kvm_pfn_t pfn = *pfnp; + int level = *levelp; + + /* + * Check if it's a transparent hugepage. If this would be an + * hugetlbfs page, level wouldn't be set to + * PT_PAGE_TABLE_LEVEL and there would be no adjustment done + * here. + */ + if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) && + !kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL && + PageTransCompoundMap(pfn_to_page(pfn))) { + unsigned long mask; + /* + * mmu_notifier_retry was successful and we hold the + * mmu_lock here, so the pmd can't become splitting + * from under us, and in turn + * __split_huge_page_refcount() can't run from under + * us and we can safely transfer the refcount from + * PG_tail to PG_head as we switch the pfn to tail to + * head. + */ + *levelp = level = PT_DIRECTORY_LEVEL; + mask = KVM_PAGES_PER_HPAGE(level) - 1; + VM_BUG_ON((gfn & mask) != (pfn & mask)); + if (pfn & mask) { + kvm_release_pfn_clean(pfn); + pfn &= ~mask; + kvm_get_pfn(pfn); + *pfnp = pfn; + } + } +} + static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it, gfn_t gfn, kvm_pfn_t *pfnp, int *levelp) { @@ -3418,44 +3456,6 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) return -EFAULT; } -static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, - gfn_t gfn, kvm_pfn_t *pfnp, - int *levelp) -{ - kvm_pfn_t pfn = *pfnp; - int level = *levelp; - - /* - * Check if it's a transparent hugepage. If this would be an - * hugetlbfs page, level wouldn't be set to - * PT_PAGE_TABLE_LEVEL and there would be no adjustment done - * here. - */ - if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) && - !kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL && - PageTransCompoundMap(pfn_to_page(pfn))) { - unsigned long mask; - /* - * mmu_notifier_retry was successful and we hold the - * mmu_lock here, so the pmd can't become splitting - * from under us, and in turn - * __split_huge_page_refcount() can't run from under - * us and we can safely transfer the refcount from - * PG_tail to PG_head as we switch the pfn to tail to - * head. - */ - *levelp = level = PT_DIRECTORY_LEVEL; - mask = KVM_PAGES_PER_HPAGE(level) - 1; - VM_BUG_ON((gfn & mask) != (pfn & mask)); - if (pfn & mask) { - kvm_release_pfn_clean(pfn); - pfn &= ~mask; - kvm_get_pfn(pfn); - *pfnp = pfn; - } - } -} - static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, kvm_pfn_t pfn, unsigned access, int *ret_val) { From 4cd071d13c5cc671826571d9876f76d001937a8b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Dec 2019 15:57:26 -0800 Subject: [PATCH 035/204] KVM: x86/mmu: Move calls to thp_adjust() down a level Move the calls to thp_adjust() down a level from the page fault handlers to the map/fetch helpers and remove the page count shuffling done in thp_adjust(). 
Despite holding a reference to the underlying page while processing a page fault, the page fault flows don't actually rely on holding a reference to the page when thp_adjust() is called. At that point, the fault handlers hold mmu_lock, which prevents mmu_notifier from completing any invalidations, and have verified no invalidations from mmu_notifier have occurred since the page reference was acquired (which is done prior to taking mmu_lock). The kvm_release_pfn_clean()/kvm_get_pfn() dance in thp_adjust() is a quirk that is necessitated because thp_adjust() modifies the pfn that is consumed by its caller. Because the page fault handlers call kvm_release_pfn_clean() on said pfn, thp_adjust() needs to transfer the reference to the correct pfn purely for correctness when the pfn is released. Calling thp_adjust() from __direct_map() and FNAME(fetch) means the pfn adjustment doesn't change the pfn as seen by the page fault handlers, i.e. the pfn released by the page fault handlers is the same pfn that was returned by gfn_to_pfn(). Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 31 ++++++++++++------------------- arch/x86/kvm/mmu/paging_tmpl.h | 11 ++++++----- 2 files changed, 18 insertions(+), 24 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index f8da3965c3e98..c4ed746416bab 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3345,24 +3345,15 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, !kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL && PageTransCompoundMap(pfn_to_page(pfn))) { unsigned long mask; + /* - * mmu_notifier_retry was successful and we hold the - * mmu_lock here, so the pmd can't become splitting - * from under us, and in turn - * __split_huge_page_refcount() can't run from under - * us and we can safely transfer the refcount from - * PG_tail to PG_head as we switch the pfn to tail to - * head. + * mmu_notifier_retry() was successful and mmu_lock is held, so + * the pmd can't be split from under us. 
*/ *levelp = level = PT_DIRECTORY_LEVEL; mask = KVM_PAGES_PER_HPAGE(level) - 1; VM_BUG_ON((gfn & mask) != (pfn & mask)); - if (pfn & mask) { - kvm_release_pfn_clean(pfn); - pfn &= ~mask; - kvm_get_pfn(pfn); - *pfnp = pfn; - } + *pfnp = pfn & ~mask; } } @@ -3390,8 +3381,9 @@ static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it, } static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, - int map_writable, int level, kvm_pfn_t pfn, - bool prefault, bool account_disallowed_nx_lpage) + int map_writable, int level, int max_level, + kvm_pfn_t pfn, bool prefault, + bool account_disallowed_nx_lpage) { struct kvm_shadow_walk_iterator it; struct kvm_mmu_page *sp; @@ -3402,6 +3394,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) return RET_PF_RETRY; + if (likely(max_level > PT_PAGE_TABLE_LEVEL)) + transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); + trace_kvm_mmu_spte_requested(gpa, level, pfn); for_each_shadow_entry(vcpu, gpa, it) { /* @@ -4220,10 +4215,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, goto out_unlock; if (make_mmu_pages_available(vcpu) < 0) goto out_unlock; - if (likely(max_level > PT_PAGE_TABLE_LEVEL)) - transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); - r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault, - is_tdp && lpage_disallowed); + r = __direct_map(vcpu, gpa, write, map_writable, level, max_level, pfn, + prefault, is_tdp && lpage_disallowed); out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 7d57ec576df0c..3b0ba2a77e28d 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -613,7 +613,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, */ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, struct guest_walker *gw, - int write_fault, int hlevel, + int write_fault, int hlevel, int max_level, kvm_pfn_t pfn, bool map_writable, bool prefault, bool lpage_disallowed) { @@ -673,6 +673,9 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, gfn = gw->gfn | ((addr & PT_LVL_OFFSET_MASK(gw->level)) >> PAGE_SHIFT); base_gfn = gfn; + if (max_level > PT_PAGE_TABLE_LEVEL) + transparent_hugepage_adjust(vcpu, gw->gfn, &pfn, &hlevel); + trace_kvm_mmu_spte_requested(addr, gw->level, pfn); for (; shadow_walk_okay(&it); shadow_walk_next(&it)) { @@ -865,10 +868,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); if (make_mmu_pages_available(vcpu) < 0) goto out_unlock; - if (max_level > PT_PAGE_TABLE_LEVEL) - transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level); - r = FNAME(fetch)(vcpu, addr, &walker, write_fault, - level, pfn, map_writable, prefault, lpage_disallowed); + r = FNAME(fetch)(vcpu, addr, &walker, write_fault, level, max_level, + pfn, map_writable, prefault, lpage_disallowed); kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); out_unlock: From ddce6208217c1aac22eec74461afb73e2af1fb06 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Dec 2019 15:57:27 -0800 Subject: [PATCH 036/204] KVM: x86/mmu: Move root_hpa validity checks to top of page fault handler Add a check on root_hpa at the beginning of the page fault handler to consolidate several checks on root_hpa that are scattered throughout the page fault code. 
This is a preparatory step towards eventually removing such checks altogether, or at the very least WARNing if an invalid root is encountered. Remove only the checks that can be easily audited to confirm that root_hpa cannot be invalidated between their current location and the new check in kvm_mmu_page_fault(), and aren't currently protected by mmu_lock, i.e. keep the checks in __direct_map() and FNAME(fetch) for the time being. The root_hpa checks that are consolidated were all added by commit 37f6a4e237303 ("KVM: x86: handle invalid root_hpa everywhere") which was a follow-up to a bug fix for __direct_map(), commit 989c6b34f6a94 ("KVM: MMU: handle invalid root_hpa at __direct_map"). At the time, nested VMX had, in hindsight, crazy handling of nested interrupts and would trigger a nested VM-Exit in ->interrupt_allowed(), and thus unexpectedly reset the MMU in flows such as can_do_async_pf(). Now that the wonky nested VM-Exit behavior is gone, the root_hpa checks are bogus and confusing, e.g. it's not at all obvious what they actually protect against, and at first glance they appear to be broken since many of them run without holding mmu_lock. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index c4ed746416bab..bdba15ef88aad 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3565,9 +3565,6 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int level, u64 spte = 0ull; uint retry_count = 0; - if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) - return false; - if (!page_fault_can_be_fast(error_code)) return false; @@ -4011,9 +4008,6 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) int root, leaf; bool reserved = false; - if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) - goto exit; - walk_shadow_page_lockless_begin(vcpu); for (shadow_walk_init(&iterator, vcpu, addr), @@ -4043,7 +4037,7 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) root--; } } -exit: + *sptep = spte; return reserved; } @@ -4107,9 +4101,6 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr) struct kvm_shadow_walk_iterator iterator; u64 spte; - if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) - return; - walk_shadow_page_lockless_begin(vcpu); for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) { clear_sp_write_flooding_count(iterator.sptep); @@ -5472,6 +5463,9 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, int r, emulation_type = 0; bool direct = vcpu->arch.mmu->direct_map; + if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) + return RET_PF_RETRY; + /* With shadow page tables, fault_address contains a GVA or nGPA. */ if (vcpu->arch.mmu->direct_map) { vcpu->arch.gpa_available = true; From 0c7a98e34ddae6e45938f02d4ce7f04114ef0bdc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Dec 2019 15:57:28 -0800 Subject: [PATCH 037/204] KVM: x86/mmu: WARN on an invalid root_hpa WARN on the existing invalid root_hpa checks in __direct_map() and FNAME(fetch). The "legitimate" path that invalidated root_hpa in the middle of a page fault is long since gone, i.e. it should no longer be possible to invalidate root_hpa in the middle of a page fault[*]. 
The root_hpa checks were added by two related commits 989c6b34f6a94 ("KVM: MMU: handle invalid root_hpa at __direct_map") 37f6a4e237303 ("KVM: x86: handle invalid root_hpa everywhere") to fix a bug where nested_vmx_vmexit() could be called *in the middle* of a page fault. At the time, vmx_interrupt_allowed(), which was and still is used by kvm_can_do_async_pf() via ->interrupt_allowed(), directly invoked nested_vmx_vmexit() to switch from L2 to L1 to emulate a VM-Exit on a pending interrupt. Emulating the nested VM-Exit resulted in root_hpa being invalidated by kvm_mmu_reset_context() without explicitly terminating the page fault. Now that root_hpa is checked for validity by kvm_mmu_page_fault(), WARN on an invalid root_hpa to detect any flows that reset the MMU while handling a page fault. The broken vmx_interrupt_allowed() behavior has long since been fixed and resetting the MMU during a page fault should not be considered legal behavior. [*] It's actually technically possible in FNAME(page_fault)() because it calls inject_page_fault() when the guest translation is invalid, but in that case the page fault handling is immediately terminated. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/mmu/paging_tmpl.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index bdba15ef88aad..51f81f10f9f7a 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3391,7 +3391,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, gfn_t gfn = gpa >> PAGE_SHIFT; gfn_t base_gfn = gfn; - if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) + if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) return RET_PF_RETRY; if (likely(max_level > PT_PAGE_TABLE_LEVEL)) diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 3b0ba2a77e28d..b53bed3c901cb 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -637,7 +637,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, if (FNAME(gpte_changed)(vcpu, gw, top_level)) goto out_gpte_changed; - if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) + if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) goto out_gpte_changed; for (shadow_walk_init(&it, vcpu, addr); From 6948199a9af969342a6faf257678616aea491fcf Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Dec 2019 15:57:29 -0800 Subject: [PATCH 038/204] KVM: x86/mmu: WARN if root_hpa is invalid when handling a page fault WARN if root_hpa is invalid when handling a page fault. The check on root_hpa exists for historical reasons that no longer apply to the current KVM code base. Remove an equivalent debug-only warning in direct_page_fault(), whose existence more or less confirms that root_hpa should always be valid when handling a page fault. 
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 51f81f10f9f7a..7269130ea5e28 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4172,8 +4172,6 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, kvm_pfn_t pfn; int level, r; - MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)); - if (page_fault_handle_page_track(vcpu, error_code, gfn)) return RET_PF_EMULATE; @@ -5463,7 +5461,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, int r, emulation_type = 0; bool direct = vcpu->arch.mmu->direct_map; - if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) + if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) return RET_PF_RETRY; /* With shadow page tables, fault_address contains a GVA or nGPA. */ From 4de0a8355463e068e443b48eb5ae32370155368b Mon Sep 17 00:00:00 2001 From: zhengbin Date: Tue, 19 Nov 2019 14:27:40 +0800 Subject: [PATCH 039/204] KVM: PPC: Remove set but not used variable 'ra', 'rs', 'rt' Fixes gcc '-Wunused-but-set-variable' warning: arch/powerpc/kvm/emulate_loadstore.c: In function kvmppc_emulate_loadstore: arch/powerpc/kvm/emulate_loadstore.c:87:6: warning: variable ra set but not used [-Wunused-but-set-variable] arch/powerpc/kvm/emulate_loadstore.c: In function kvmppc_emulate_loadstore: arch/powerpc/kvm/emulate_loadstore.c:87:10: warning: variable rs set but not used [-Wunused-but-set-variable] arch/powerpc/kvm/emulate_loadstore.c: In function kvmppc_emulate_loadstore: arch/powerpc/kvm/emulate_loadstore.c:87:14: warning: variable rt set but not used [-Wunused-but-set-variable] They are not used since commit 2b33cb585f94 ("KVM: PPC: Reimplement LOAD_FP/STORE_FP instruction mmio emulation with analyse_instr() input") Reported-by: Hulk Robot Signed-off-by: zhengbin Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/emulate_loadstore.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/powerpc/kvm/emulate_loadstore.c b/arch/powerpc/kvm/emulate_loadstore.c index 2e496eb86e94a..1139bc56e0045 100644 --- a/arch/powerpc/kvm/emulate_loadstore.c +++ b/arch/powerpc/kvm/emulate_loadstore.c @@ -73,7 +73,6 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; u32 inst; - int ra, rs, rt; enum emulation_result emulated = EMULATE_FAIL; int advance = 1; struct instruction_op op; @@ -85,10 +84,6 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) if (emulated != EMULATE_DONE) return emulated; - ra = get_ra(inst); - rs = get_rs(inst); - rt = get_rt(inst); - vcpu->arch.mmio_vsx_copy_nums = 0; vcpu->arch.mmio_vsx_offset = 0; vcpu->arch.mmio_copy_type = KVMPPC_VSX_COPY_NONE; From 8a9c8925149f195d0bbd6b42aa3130ced0a075fb Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Tue, 26 Nov 2019 19:36:30 -0300 Subject: [PATCH 040/204] KVM: PPC: Book3S: Replace current->mm by kvm->mm Given that in kvm_create_vm() there is: kvm->mm = current->mm; And that on every kvm_*_ioctl we have: if (kvm->mm != current->mm) return -EIO; I see no reason to keep using current->mm instead of kvm->mm. By doing so, we would reduce the use of 'global' variables in the code, relying more on the contents of the kvm struct. 
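To illustrate the invariant this change relies on, here is a self-contained user-space sketch (kvm_ioctl_check_mm() and the struct definitions are illustrative stand-ins, not kernel code): every kvm_*_ioctl rejects callers whose mm differs from the VM's, so past that check current->mm and kvm->mm are interchangeable.

	#include <errno.h>

	struct mm_struct;			/* opaque, as in the kernel */
	struct kvm { struct mm_struct *mm; };

	/* Any ioctl issued by a task whose mm differs from the VM's is rejected. */
	static int kvm_ioctl_check_mm(struct kvm *kvm, struct mm_struct *current_mm)
	{
		if (kvm->mm != current_mm)
			return -EIO;
		/* From here on, current->mm and kvm->mm denote the same mm. */
		return 0;
	}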
Signed-off-by: Leonardo Bras Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 4 ++-- arch/powerpc/kvm/book3s_64_vio.c | 10 ++++++---- arch/powerpc/kvm/book3s_hv.c | 10 +++++----- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index d381526c5c9be..6c372f5c61b64 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -284,7 +284,7 @@ static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags, /* Protect linux PTE lookup from page table destruction */ rcu_read_lock_sched(); /* this disables preemption too */ ret = kvmppc_do_h_enter(kvm, flags, pte_index, pteh, ptel, - current->mm->pgd, false, pte_idx_ret); + kvm->mm->pgd, false, pte_idx_ret); rcu_read_unlock_sched(); if (ret == H_TOO_HARD) { /* this can't happen */ @@ -573,7 +573,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, is_ci = false; pfn = 0; page = NULL; - mm = current->mm; + mm = kvm->mm; pte_size = PAGE_SIZE; writing = (dsisr & DSISR_ISSTORE) != 0; /* If writing != 0, then the HPTE must allow writing, if we get here */ diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 883a66e76638a..ee6c103bb7d50 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -253,10 +253,11 @@ static int kvm_spapr_tce_release(struct inode *inode, struct file *filp) } } + account_locked_vm(kvm->mm, + kvmppc_stt_pages(kvmppc_tce_pages(stt->size)), false); + kvm_put_kvm(stt->kvm); - account_locked_vm(current->mm, - kvmppc_stt_pages(kvmppc_tce_pages(stt->size)), false); call_rcu(&stt->rcu, release_spapr_tce_table); return 0; @@ -272,6 +273,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, { struct kvmppc_spapr_tce_table *stt = NULL; struct kvmppc_spapr_tce_table *siter; + struct mm_struct *mm = kvm->mm; unsigned long npages, size = args->size; int ret = -ENOMEM; @@ -280,7 +282,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, return -EINVAL; npages = kvmppc_tce_pages(size); - ret = account_locked_vm(current->mm, kvmppc_stt_pages(npages), true); + ret = account_locked_vm(mm, kvmppc_stt_pages(npages), true); if (ret) return ret; @@ -326,7 +328,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, kfree(stt); fail_acct: - account_locked_vm(current->mm, kvmppc_stt_pages(npages), false); + account_locked_vm(mm, kvmppc_stt_pages(npages), false); return ret; } diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 6ff3f896d9081..2cf3dd8b79d25 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -4285,7 +4285,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) user_vrsave = mfspr(SPRN_VRSAVE); vcpu->arch.wqp = &vcpu->arch.vcore->wq; - vcpu->arch.pgdir = current->mm->pgd; + vcpu->arch.pgdir = kvm->mm->pgd; vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; do { @@ -4640,14 +4640,14 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) /* Look up the VMA for the start of this memory slot */ hva = memslot->userspace_addr; - down_read(¤t->mm->mmap_sem); - vma = find_vma(current->mm, hva); + down_read(&kvm->mm->mmap_sem); + vma = find_vma(kvm->mm, hva); if (!vma || vma->vm_start > hva || (vma->vm_flags & VM_IO)) goto up_out; psize = vma_kernel_pagesize(vma); - up_read(¤t->mm->mmap_sem); + up_read(&kvm->mm->mmap_sem); /* We can handle 4k, 64k or 16M pages in the VRMA */ if (psize >= 0x1000000) @@ -4680,7 +4680,7 
@@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) return err; up_out: - up_read(&current->mm->mmap_sem); + up_read(&kvm->mm->mmap_sem); goto out_srcu; } From e1bd0a7e248c3ce59b0509e47f035c0759fc68a3 Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Tue, 26 Nov 2019 19:36:31 -0300 Subject: [PATCH 041/204] KVM: PPC: Book3E: Replace current->mm by kvm->mm Given that in kvm_create_vm() there is: kvm->mm = current->mm; And that on every kvm_*_ioctl we have: if (kvm->mm != current->mm) return -EIO; I see no reason to keep using current->mm instead of kvm->mm. By doing so, we would reduce the use of 'global' variables in the code, relying more on the contents of the kvm struct. Signed-off-by: Leonardo Bras Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/booke.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index be9a45874194f..fd7bdb4f8f877 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -775,7 +775,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) debug = current->thread.debug; current->thread.debug = vcpu->arch.dbg_reg; - vcpu->arch.pgdir = current->mm->pgd; + vcpu->arch.pgdir = vcpu->kvm->mm->pgd; kvmppc_fix_ee_before_entry(); ret = __kvmppc_vcpu_run(kvm_run, vcpu); From ce477a7a1cdfc9aaafcfd03b45bde131a88d51de Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Thu, 19 Dec 2019 13:51:45 -0800 Subject: [PATCH 042/204] KVM: PPC: Add skip_page_out parameter to uvmem functions Add a 'skip_page_out' parameter to kvmppc_uvmem_drop_pages() so the callers can specify whether or not to skip paging out pages. This will be needed in a follow-on patch that implements the H_SVM_INIT_ABORT hcall. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_book3s_uvmem.h | 4 ++-- arch/powerpc/kvm/book3s_64_mmu_radix.c | 2 +- arch/powerpc/kvm/book3s_hv.c | 2 +- arch/powerpc/kvm/book3s_hv_uvmem.c | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s_uvmem.h b/arch/powerpc/include/asm/kvm_book3s_uvmem.h index 50204e228f167..3cf8425b9838c 100644 --- a/arch/powerpc/include/asm/kvm_book3s_uvmem.h +++ b/arch/powerpc/include/asm/kvm_book3s_uvmem.h @@ -20,7 +20,7 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm); unsigned long kvmppc_h_svm_init_done(struct kvm *kvm); int kvmppc_send_page_to_uv(struct kvm *kvm, unsigned long gfn); void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free, - struct kvm *kvm); + struct kvm *kvm, bool skip_page_out); #else static inline int kvmppc_uvmem_init(void) { @@ -69,6 +69,6 @@ static inline int kvmppc_send_page_to_uv(struct kvm *kvm, unsigned long gfn) static inline void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free, - struct kvm *kvm) { } + struct kvm *kvm, bool skip_page_out) { } #endif /* CONFIG_PPC_UV */ #endif /* __ASM_KVM_BOOK3S_UVMEM_H__ */ diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index da857c8ba6e40..744dba98e5d11 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -1102,7 +1102,7 @@ void kvmppc_radix_flush_memslot(struct kvm *kvm, unsigned int shift; if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START) - kvmppc_uvmem_drop_pages(memslot, kvm); + kvmppc_uvmem_drop_pages(memslot, kvm, true); if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) return; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 
2cf3dd8b79d25..47ffc7f1b1042 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -5477,7 +5477,7 @@ static int kvmhv_svm_off(struct kvm *kvm) continue; kvm_for_each_memslot(memslot, slots) { - kvmppc_uvmem_drop_pages(memslot, kvm); + kvmppc_uvmem_drop_pages(memslot, kvm, true); uv_unregister_mem_slot(kvm->arch.lpid, memslot->id); } } diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 2de264fc31563..ffa602a97dba6 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -258,7 +258,7 @@ unsigned long kvmppc_h_svm_init_done(struct kvm *kvm) * QEMU page table with normal PTEs from newly allocated pages. */ void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free, - struct kvm *kvm) + struct kvm *kvm, bool skip_page_out) { int i; struct kvmppc_uvmem_page_pvt *pvt; @@ -276,7 +276,7 @@ void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free, uvmem_page = pfn_to_page(uvmem_pfn); pvt = uvmem_page->zone_device_data; - pvt->skip_page_out = true; + pvt->skip_page_out = skip_page_out; mutex_unlock(&kvm->arch.uvmem_lock); pfn = gfn_to_pfn(kvm, gfn); From 3a43970d55e9fd5475d3c4e5fe398ab831ec6c3a Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Mon, 6 Jan 2020 18:02:37 -0800 Subject: [PATCH 043/204] KVM: PPC: Book3S HV: Implement H_SVM_INIT_ABORT hcall Implement the H_SVM_INIT_ABORT hcall which the Ultravisor can use to abort an SVM after it has issued the H_SVM_INIT_START and before the H_SVM_INIT_DONE hcalls. This hcall could be used when the Ultravisor encounters security violations or other errors when starting an SVM. Note that this hcall is different from the UV_SVM_TERMINATE ucall, which is used by the HV to terminate/clean up a VM that has become secure. The H_SVM_INIT_ABORT basically undoes operations that were done since the H_SVM_INIT_START hcall - i.e. page out all the VM pages back to normal memory, and terminate the SVM. (If we do not bring the pages back to normal memory, the text/data of the VM would be stuck in secure memory and since the SVM did not go secure, its MSR_S bit will be clear and the VM won't be able to access its pages even to do a clean exit). Based on patches and discussion with Paul Mackerras, Ram Pai and Bharata Rao. Signed-off-by: Ram Pai Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Bharata B Rao Signed-off-by: Paul Mackerras --- Documentation/powerpc/ultravisor.rst | 60 +++++++++++++++++++++ arch/powerpc/include/asm/hvcall.h | 1 + arch/powerpc/include/asm/kvm_book3s_uvmem.h | 6 +++ arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/kvm/book3s_hv.c | 3 ++ arch/powerpc/kvm/book3s_hv_uvmem.c | 28 ++++++++++ 6 files changed, 99 insertions(+) diff --git a/Documentation/powerpc/ultravisor.rst b/Documentation/powerpc/ultravisor.rst index 730854f73830a..363736d7fd36f 100644 --- a/Documentation/powerpc/ultravisor.rst +++ b/Documentation/powerpc/ultravisor.rst @@ -948,6 +948,66 @@ Use cases up its internal state for this virtual machine. +H_SVM_INIT_ABORT +---------------- + + Abort the process of securing an SVM. + +Syntax +~~~~~~ + +.. code-block:: c + + uint64_t hypercall(const uint64_t H_SVM_INIT_ABORT) + +Return values +~~~~~~~~~~~~~ + + One of the following values: + + * H_PARAMETER on successfully cleaning up the state, + Hypervisor will return this value to the + **guest**, to indicate that the underlying + UV_ESM ultracall failed. + + * H_STATE if called after a VM has gone secure (i.e + H_SVM_INIT_DONE hypercall was successful). 
+ + * H_UNSUPPORTED if called from a wrong context (e.g. from a + normal VM). + +Description +~~~~~~~~~~~ + + Abort the process of securing a virtual machine. This call must + be made after a prior call to ``H_SVM_INIT_START`` hypercall and + before a call to ``H_SVM_INIT_DONE``. + + On entry into this hypercall the non-volatile GPRs and FPRs are + expected to contain the values they had at the time the VM issued + the UV_ESM ultracall. Further ``SRR0`` is expected to contain the + address of the instruction after the ``UV_ESM`` ultracall and ``SRR1`` + the MSR value with which to return to the VM. + + This hypercall will cleanup any partial state that was established for + the VM since the prior ``H_SVM_INIT_START`` hypercall, including paging + out pages that were paged-into secure memory, and issue the + ``UV_SVM_TERMINATE`` ultracall to terminate the VM. + + After the partial state is cleaned up, control returns to the VM + (**not Ultravisor**), at the address specified in ``SRR0`` with the + MSR values set to the value in ``SRR1``. + +Use cases +~~~~~~~~~ + + If after a successful call to ``H_SVM_INIT_START``, the Ultravisor + encounters an error while securing a virtual machine, either due + to lack of resources or because the VM's security information could + not be validated, Ultravisor informs the Hypervisor about it. + Hypervisor should use this call to clean up any internal state for + this virtual machine and return to the VM. + H_SVM_PAGE_IN ------------- diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index 13bd870609c33..e90c073e437ea 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -350,6 +350,7 @@ #define H_SVM_PAGE_OUT 0xEF04 #define H_SVM_INIT_START 0xEF08 #define H_SVM_INIT_DONE 0xEF0C +#define H_SVM_INIT_ABORT 0xEF14 /* Values for 2nd argument to H_SET_MODE */ #define H_SET_MODE_RESOURCE_SET_CIABR 1 diff --git a/arch/powerpc/include/asm/kvm_book3s_uvmem.h b/arch/powerpc/include/asm/kvm_book3s_uvmem.h index 3cf8425b9838c..5a9834e0e2d1d 100644 --- a/arch/powerpc/include/asm/kvm_book3s_uvmem.h +++ b/arch/powerpc/include/asm/kvm_book3s_uvmem.h @@ -19,6 +19,7 @@ unsigned long kvmppc_h_svm_page_out(struct kvm *kvm, unsigned long kvmppc_h_svm_init_start(struct kvm *kvm); unsigned long kvmppc_h_svm_init_done(struct kvm *kvm); int kvmppc_send_page_to_uv(struct kvm *kvm, unsigned long gfn); +unsigned long kvmppc_h_svm_init_abort(struct kvm *kvm); void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free, struct kvm *kvm, bool skip_page_out); #else @@ -62,6 +63,11 @@ static inline unsigned long kvmppc_h_svm_init_done(struct kvm *kvm) return H_UNSUPPORTED; } +static inline unsigned long kvmppc_h_svm_init_abort(struct kvm *kvm) +{ + return H_UNSUPPORTED; +} + static inline int kvmppc_send_page_to_uv(struct kvm *kvm, unsigned long gfn) { return -EFAULT; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 0a398f2321c2b..6e8b8ffd06adc 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -278,6 +278,7 @@ struct kvm_resize_hpt; /* Flag values for kvm_arch.secure_guest */ #define KVMPPC_SECURE_INIT_START 0x1 /* H_SVM_INIT_START has been called */ #define KVMPPC_SECURE_INIT_DONE 0x2 /* H_SVM_INIT_DONE completed */ +#define KVMPPC_SECURE_INIT_ABORT 0x4 /* H_SVM_INIT_ABORT issued */ struct kvm_arch { unsigned int lpid; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 47ffc7f1b1042..1118cff7f7ef2 
100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1091,6 +1091,9 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) case H_SVM_INIT_DONE: ret = kvmppc_h_svm_init_done(vcpu->kvm); break; + case H_SVM_INIT_ABORT: + ret = kvmppc_h_svm_init_abort(vcpu->kvm); + break; default: return RESUME_HOST; diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index ffa602a97dba6..4d1f25a3959a5 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -286,6 +286,34 @@ void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free, } } +unsigned long kvmppc_h_svm_init_abort(struct kvm *kvm) +{ + int srcu_idx; + struct kvm_memory_slot *memslot; + + /* + * Expect to be called only after INIT_START and before INIT_DONE. + * If INIT_DONE was completed, use normal VM termination sequence. + */ + if (!(kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START)) + return H_UNSUPPORTED; + + if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) + return H_STATE; + + srcu_idx = srcu_read_lock(&kvm->srcu); + + kvm_for_each_memslot(memslot, kvm_memslots(kvm)) + kvmppc_uvmem_drop_pages(memslot, kvm, false); + + srcu_read_unlock(&kvm->srcu, srcu_idx); + + kvm->arch.secure_guest = 0; + uv_svm_terminate(kvm->arch.lpid); + + return H_PARAMETER; +} + /* * Get a free device PFN from the pool * From b6ae256afd32f96bec0117175b329d0dd617655e Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Thu, 12 Dec 2019 20:50:55 +0100 Subject: [PATCH 044/204] KVM: arm64: Only sign-extend MMIO up to register width On AArch64 you can do a sign-extended load to either a 32-bit or 64-bit register, and we should only sign extend the register up to the width of the register as specified in the operation (by using the 32-bit Wn or 64-bit Xn register specifier). As it turns out, the architecture provides this decoding information in the SF ("Sixty-Four" -- how cute...) bit. Let's take advantage of this with the usual 32-bit/64-bit header file dance and do the right thing on AArch64 hosts. 
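The effect of the fix can be modelled in plain C. Below is a minimal user-space sketch (extend_mmio_data() is a hypothetical name; the masking mirrors the mmio.c hunk in the diff that follows): after sign-extending from the access size, a load whose destination is a Wn register (SF clear) must have bits [63:32] zeroed.

	#include <stdint.h>
	#include <stdio.h>

	static uint64_t extend_mmio_data(uint64_t data, unsigned int len,
					 int sign_extend, int sixty_four)
	{
		if (sign_extend && len < 8) {
			uint64_t mask = 1ULL << (len * 8 - 1);
			data = (data ^ mask) - mask;	/* sign extend from len bytes */
		}
		if (!sixty_four)
			data &= 0xffffffff;		/* Wn destination: upper half reads as zero */
		return data;
	}

	int main(void)
	{
		/* LDRSB Wt of 0x80 yields 0xffffff80, not 0xffffffffffffff80 */
		printf("%#llx\n", (unsigned long long)extend_mmio_data(0x80, 1, 1, 0));
		return 0;
	}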
Signed-off-by: Christoffer Dall Signed-off-by: Marc Zyngier Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20191212195055.5541-1-christoffer.dall@arm.com --- arch/arm/include/asm/kvm_emulate.h | 5 +++++ arch/arm/include/asm/kvm_mmio.h | 2 ++ arch/arm64/include/asm/kvm_emulate.h | 5 +++++ arch/arm64/include/asm/kvm_mmio.h | 6 ++---- virt/kvm/arm/mmio.c | 6 ++++++ 5 files changed, 20 insertions(+), 4 deletions(-) diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h index 9b118516d2db4..fe55d8737a117 100644 --- a/arch/arm/include/asm/kvm_emulate.h +++ b/arch/arm/include/asm/kvm_emulate.h @@ -182,6 +182,11 @@ static inline bool kvm_vcpu_dabt_issext(struct kvm_vcpu *vcpu) return kvm_vcpu_get_hsr(vcpu) & HSR_SSE; } +static inline bool kvm_vcpu_dabt_issf(const struct kvm_vcpu *vcpu) +{ + return false; +} + static inline int kvm_vcpu_dabt_get_rd(struct kvm_vcpu *vcpu) { return (kvm_vcpu_get_hsr(vcpu) & HSR_SRT_MASK) >> HSR_SRT_SHIFT; diff --git a/arch/arm/include/asm/kvm_mmio.h b/arch/arm/include/asm/kvm_mmio.h index 7c0eddb0adb2a..32fbf82e3ebcd 100644 --- a/arch/arm/include/asm/kvm_mmio.h +++ b/arch/arm/include/asm/kvm_mmio.h @@ -14,6 +14,8 @@ struct kvm_decode { unsigned long rt; bool sign_extend; + /* Not used on 32-bit arm */ + bool sixty_four; }; void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data); diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 5efe5ca8fecf3..f407b6bdad2eb 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -283,6 +283,11 @@ static inline bool kvm_vcpu_dabt_issext(const struct kvm_vcpu *vcpu) return !!(kvm_vcpu_get_hsr(vcpu) & ESR_ELx_SSE); } +static inline bool kvm_vcpu_dabt_issf(const struct kvm_vcpu *vcpu) +{ + return !!(kvm_vcpu_get_hsr(vcpu) & ESR_ELx_SF); +} + static inline int kvm_vcpu_dabt_get_rd(const struct kvm_vcpu *vcpu) { return (kvm_vcpu_get_hsr(vcpu) & ESR_ELx_SRT_MASK) >> ESR_ELx_SRT_SHIFT; diff --git a/arch/arm64/include/asm/kvm_mmio.h b/arch/arm64/include/asm/kvm_mmio.h index 02b5c48fd4678..b204501a0c391 100644 --- a/arch/arm64/include/asm/kvm_mmio.h +++ b/arch/arm64/include/asm/kvm_mmio.h @@ -10,13 +10,11 @@ #include #include -/* - * This is annoying. The mmio code requires this, even if we don't - * need any decoding. To be fixed. 
- */ struct kvm_decode { unsigned long rt; bool sign_extend; + /* Width of the register accessed by the faulting instruction is 64 bits */ + bool sixty_four; }; void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data); diff --git a/virt/kvm/arm/mmio.c b/virt/kvm/arm/mmio.c index 70d3b449692cc..1bb71acd53f22 100644 --- a/virt/kvm/arm/mmio.c +++ b/virt/kvm/arm/mmio.c @@ -105,6 +105,9 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run) data = (data ^ mask) - mask; } + if (!vcpu->arch.mmio_decode.sixty_four) + data = data & 0xffffffff; + trace_kvm_mmio(KVM_TRACE_MMIO_READ, len, run->mmio.phys_addr, &data); data = vcpu_data_host_to_guest(vcpu, data, len); @@ -125,6 +128,7 @@ static int decode_hsr(struct kvm_vcpu *vcpu, bool *is_write, int *len) unsigned long rt; int access_size; bool sign_extend; + bool sixty_four; if (kvm_vcpu_dabt_iss1tw(vcpu)) { /* page table accesses IO mem: tell guest to fix its TTBR */ @@ -138,11 +142,13 @@ static int decode_hsr(struct kvm_vcpu *vcpu, bool *is_write, int *len) *is_write = kvm_vcpu_dabt_iswrite(vcpu); sign_extend = kvm_vcpu_dabt_issext(vcpu); + sixty_four = kvm_vcpu_dabt_issf(vcpu); rt = kvm_vcpu_dabt_get_rd(vcpu); *len = access_size; vcpu->arch.mmio_decode.sign_extend = sign_extend; vcpu->arch.mmio_decode.rt = rt; + vcpu->arch.mmio_decode.sixty_four = sixty_four; return 0; } From 8c58be34494b7f1b2adb446e2d8beeb90e5de65b Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Fri, 13 Dec 2019 10:42:37 +0100 Subject: [PATCH 045/204] KVM: arm/arm64: vgic-its: Fix restoration of unmapped collections Saving/restoring an unmapped collection is a valid scenario. For example this happens if a MAPTI command was sent, featuring an unmapped collection. At the moment the CTE fails to be restored. Only compare against the number of online vcpus if the rdist base is set. Fixes: ea1ad53e1e31a ("KVM: arm64: vgic-its: Collection table save/restore") Signed-off-by: Eric Auger Signed-off-by: Marc Zyngier Reviewed-by: Zenghui Yu Link: https://lore.kernel.org/r/20191213094237.19627-1-eric.auger@redhat.com --- virt/kvm/arm/vgic/vgic-its.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c index 98c7360d9fb70..17920d1b350a2 100644 --- a/virt/kvm/arm/vgic/vgic-its.c +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -2475,7 +2475,8 @@ static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa, int esz) target_addr = (u32)(val >> KVM_ITS_CTE_RDBASE_SHIFT); coll_id = val & KVM_ITS_CTE_ICID_MASK; - if (target_addr >= atomic_read(&kvm->online_vcpus)) + if (target_addr != COLLECTION_NOT_MAPPED && + target_addr >= atomic_read(&kvm->online_vcpus)) return -EINVAL; collection = find_collection(its, coll_id); From 5f675c56ed262103b825cbab0e96c34fe681318d Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Fri, 20 Dec 2019 19:18:33 +0800 Subject: [PATCH 046/204] KVM: arm/arm64: vgic: Handle GICR_PENDBASER.PTZ field as RAZ Although the guest will hardly read and use the PTZ (Pending Table Zero) bit in GICR_PENDBASER, let us emulate the architecture strictly. As per IHI 0069E 9.11.30, the PTZ field is WO, and reads as 0. 
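A minimal sketch of the read side, assuming PTZ is bit 62 of GICR_PENDBASER (per IHI 0069) and using a hypothetical read_pendbaser() in place of KVM's MMIO plumbing; it mirrors what the diff below does with extract_bytes():

	#include <stdint.h>
	#include <stdio.h>

	#define GICR_PENDBASER_PTZ	(1ULL << 62)	/* assumption: PTZ lives in bit 62 */

	/* Return the requested byte range with the write-only PTZ bit masked out. */
	static uint64_t read_pendbaser(uint64_t pendbaser, unsigned int addr,
				       unsigned int len)
	{
		uint64_t value = pendbaser & ~GICR_PENDBASER_PTZ;
		uint64_t shifted = value >> ((addr & 7) * 8);

		return len < 8 ? shifted & ((1ULL << (len * 8)) - 1) : shifted;
	}

	int main(void)
	{
		/* A stored PENDBASER with PTZ set reads back with PTZ clear. */
		printf("%#llx\n", (unsigned long long)
		       read_pendbaser(GICR_PENDBASER_PTZ | 0x10000ULL, 0, 8));
		return 0;
	}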
Signed-off-by: Zenghui Yu Signed-off-by: Marc Zyngier Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20191220111833.1422-1-yuzenghui@huawei.com --- virt/kvm/arm/vgic/vgic-mmio-v3.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c index 7dfd15dbb308e..ebc218840fc22 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v3.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c @@ -414,8 +414,11 @@ static unsigned long vgic_mmio_read_pendbase(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + u64 value = vgic_cpu->pendbaser; - return extract_bytes(vgic_cpu->pendbaser, addr & 7, len); + value &= ~GICR_PENDBASER_PTZ; + + return extract_bytes(value, addr & 7, len); } static void vgic_mmio_write_pendbase(struct kvm_vcpu *vcpu, From f5523423defb0d929e23813c8dd16c0131043a8c Mon Sep 17 00:00:00 2001 From: Russell King Date: Sat, 28 Dec 2019 11:57:14 +0000 Subject: [PATCH 047/204] arm64: kvm: Fix IDMAP overlap with HYP VA Booting 5.4 on LX2160A reveals that KVM is non-functional: kvm: Limiting the IPA size due to kernel Virtual Address limit kvm [1]: IPA Size Limit: 43bits kvm [1]: IDMAP intersecting with HYP VA, unable to continue kvm [1]: error initializing Hyp mode: -22 Debugging shows: kvm [1]: IDMAP page: 81a26000 kvm [1]: HYP VA range: 0:22ffffffff as RAM is located at: 80000000-fbdfffff : System RAM 2080000000-237fffffff : System RAM Comparing this with the same kernel on Armada 8040 shows: kvm: Limiting the IPA size due to kernel Virtual Address limit kvm [1]: IPA Size Limit: 43bits kvm [1]: IDMAP page: 2a26000 kvm [1]: HYP VA range: 4800000000:493fffffff ... kvm [1]: Hyp mode initialized successfully which indicates that hyp_va_msb is set, and is always set to the opposite value of the idmap page to avoid the overlap. This does not happen with the LX2160A. Further debugging shows vabits_actual = 39, kva_msb = 38 on LX2160A and kva_msb = 33 on Armada 8040. Looking at the bit layout of the HYP VA, there is still one bit available for hyp_va_msb. Set this bit appropriately. This allows KVM to be functional on the LX2160A, but without any HYP VA randomisation: kvm: Limiting the IPA size due to kernel Virtual Address limit kvm [1]: IPA Size Limit: 43bits kvm [1]: IDMAP page: 81a24000 kvm [1]: HYP VA range: 4000000000:62ffffffff ... kvm [1]: Hyp mode initialized successfully Fixes: ed57cac83e05 ("arm64: KVM: Introduce EL2 VA randomisation") Signed-off-by: Russell King [maz: small additional cleanups, preserved case where the tag is legitimately 0 and we can just use the mask, Fixes tag] Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/E1ilAiY-0000MA-RG@rmk-PC.armlinux.org.uk --- arch/arm64/kvm/va_layout.c | 56 +++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 31 deletions(-) diff --git a/arch/arm64/kvm/va_layout.c b/arch/arm64/kvm/va_layout.c index dab1fea4752aa..a4f48c1ac28c0 100644 --- a/arch/arm64/kvm/va_layout.c +++ b/arch/arm64/kvm/va_layout.c @@ -13,52 +13,46 @@ #include /* - * The LSB of the random hyp VA tag or 0 if no randomization is used. + * The LSB of the HYP VA tag */ static u8 tag_lsb; /* - * The random hyp VA tag value with the region bit if hyp randomization is used + * The HYP VA tag value with the region bit */ static u64 tag_val; static u64 va_mask; +/* + * We want to generate a hyp VA with the following format (with V == + * vabits_actual): + * + * 63 ... V | V-1 | V-2 .. tag_lsb | tag_lsb - 1 .. 
0 + * --------------------------------------------------------- + * | 0000000 | hyp_va_msb | random tag | kern linear VA | + * |--------- tag_val -----------|----- va_mask ---| + * + * which does not conflict with the idmap regions. + */ __init void kvm_compute_layout(void) { phys_addr_t idmap_addr = __pa_symbol(__hyp_idmap_text_start); u64 hyp_va_msb; - int kva_msb; /* Where is my RAM region? */ hyp_va_msb = idmap_addr & BIT(vabits_actual - 1); hyp_va_msb ^= BIT(vabits_actual - 1); - kva_msb = fls64((u64)phys_to_virt(memblock_start_of_DRAM()) ^ + tag_lsb = fls64((u64)phys_to_virt(memblock_start_of_DRAM()) ^ (u64)(high_memory - 1)); - if (kva_msb == (vabits_actual - 1)) { - /* - * No space in the address, let's compute the mask so - * that it covers (vabits_actual - 1) bits, and the region - * bit. The tag stays set to zero. - */ - va_mask = BIT(vabits_actual - 1) - 1; - va_mask |= hyp_va_msb; - } else { - /* - * We do have some free bits to insert a random tag. - * Hyp VAs are now created from kernel linear map VAs - * using the following formula (with V == vabits_actual): - * - * 63 ... V | V-1 | V-2 .. tag_lsb | tag_lsb - 1 .. 0 - * --------------------------------------------------------- - * | 0000000 | hyp_va_msb | random tag | kern linear VA | - */ - tag_lsb = kva_msb; - va_mask = GENMASK_ULL(tag_lsb - 1, 0); - tag_val = get_random_long() & GENMASK_ULL(vabits_actual - 2, tag_lsb); - tag_val |= hyp_va_msb; - tag_val >>= tag_lsb; + va_mask = GENMASK_ULL(tag_lsb - 1, 0); + tag_val = hyp_va_msb; + + if (tag_lsb != (vabits_actual - 1)) { + /* We have some free bits to insert a random tag. */ + tag_val |= get_random_long() & GENMASK_ULL(vabits_actual - 2, tag_lsb); } + tag_val >>= tag_lsb; } static u32 compute_instruction(int n, u32 rd, u32 rn) @@ -117,11 +111,11 @@ void __init kvm_update_va_mask(struct alt_instr *alt, * VHE doesn't need any address translation, let's NOP * everything. * - * Alternatively, if we don't have any spare bits in - * the address, NOP everything after masking that - * kernel VA. + * Alternatively, if the tag is zero (because the layout + * dictates it and we don't have any spare bits in the + * address), NOP everything after masking the kernel VA. */ - if (has_vhe() || (!tag_lsb && i > 0)) { + if (has_vhe() || (!tag_val && i > 0)) { updptr[i] = cpu_to_le32(aarch64_insn_gen_nop()); continue; } From c3e35409b54e8833ab936e667e3e7fcb8bdace00 Mon Sep 17 00:00:00 2001 From: Shannon Zhao Date: Mon, 2 Dec 2019 15:42:11 +0800 Subject: [PATCH 048/204] KVM: ARM: Call hyp_cpu_pm_exit at the right place There is no need to call hyp_cpu_pm_exit() in init_hyp_mode() when an error occurs. hyp_cpu_pm_exit() only needs to be called in kvm_arch_init() if init_subsystems() fails. So move hyp_cpu_pm_exit() out from teardown_hyp_mode() and call it directly in kvm_arch_init(). 
Signed-off-by: Shannon Zhao Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/1575272531-3204-1-git-send-email-shannon.zhao@linux.alibaba.com --- virt/kvm/arm/arm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 8de4daf25097d..b5d57ed6443ce 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -1537,7 +1537,6 @@ static void teardown_hyp_mode(void) free_hyp_pgds(); for_each_possible_cpu(cpu) free_page(per_cpu(kvm_arm_hyp_stack_page, cpu)); - hyp_cpu_pm_exit(); } /** @@ -1751,6 +1750,7 @@ int kvm_arch_init(void *opaque) return 0; out_hyp: + hyp_cpu_pm_exit(); if (!in_hyp_mode) teardown_hyp_mode(); out_err: From de9375634b1ef49091004d08e5cd4f68695adf0f Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 13 Nov 2019 09:40:45 +0800 Subject: [PATCH 049/204] KVM: arm: Remove duplicate include Remove duplicate header which is included twice. Signed-off-by: YueHaibing Signed-off-by: Marc Zyngier Reviewed-by: Steven Price Link: https://lore.kernel.org/r/20191113014045.15276-1-yuehaibing@huawei.com --- virt/kvm/arm/arm.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index b5d57ed6443ce..efda376ab3c51 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -20,8 +20,6 @@ #include #include #include -#include -#include #define CREATE_TRACE_POINTS #include "trace.h" From 1559b7583ff6ed018c5320d1503fa80b435775f0 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 17 Dec 2019 12:38:09 +0000 Subject: [PATCH 050/204] KVM: arm/arm64: Re-check VMA on detecting a poisoned page When we check for a poisoned page, we use the VMA to tell userspace about the looming disaster. But we pass a pointer to this VMA after having released the mmap_sem, which isn't a good idea. Instead, stash the shift value that goes with this pfn while we are holding the mmap_sem. 
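The bug class is generic: a pointer into lock-protected state must not outlive the lock, while a scalar copied out under the lock may. A user-space analogue of the fixed pattern (all names here are illustrative, not KVM code):

	#include <pthread.h>
	#include <stdio.h>

	struct vma_like { int page_shift; };

	static pthread_mutex_t map_lock = PTHREAD_MUTEX_INITIALIZER;
	static struct vma_like *the_vma;

	static int lookup_shift(void)
	{
		int shift;

		pthread_mutex_lock(&map_lock);
		shift = the_vma->page_shift;	/* stash the value while protected */
		pthread_mutex_unlock(&map_lock);
		/* the_vma may now be freed concurrently; shift stays valid */
		return shift;
	}

	int main(void)
	{
		struct vma_like vma = { .page_shift = 12 };

		the_vma = &vma;
		printf("shift=%d\n", lookup_shift());
		return 0;
	}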
Reported-by: Marc Zyngier Signed-off-by: James Morse Signed-off-by: Marc Zyngier Reviewed-by: Christoffer Dall Link: https://lore.kernel.org/r/20191211165651.7889-3-maz@kernel.org Link: https://lore.kernel.org/r/20191217123809.197392-1-james.morse@arm.com --- virt/kvm/arm/mmu.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index 0b32a904a1bb3..e3ad950131920 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -1596,16 +1596,8 @@ static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size) __invalidate_icache_guest_page(pfn, size); } -static void kvm_send_hwpoison_signal(unsigned long address, - struct vm_area_struct *vma) +static void kvm_send_hwpoison_signal(unsigned long address, short lsb) { - short lsb; - - if (is_vm_hugetlb_page(vma)) - lsb = huge_page_shift(hstate_vma(vma)); - else - lsb = PAGE_SHIFT; - send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current); } @@ -1678,6 +1670,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm *kvm = vcpu->kvm; struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; struct vm_area_struct *vma; + short vma_shift; kvm_pfn_t pfn; pgprot_t mem_type = PAGE_S2; bool logging_active = memslot_is_logging(memslot); @@ -1701,7 +1694,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, return -EFAULT; } - vma_pagesize = vma_kernel_pagesize(vma); + if (is_vm_hugetlb_page(vma)) + vma_shift = huge_page_shift(hstate_vma(vma)); + else + vma_shift = PAGE_SHIFT; + + vma_pagesize = 1ULL << vma_shift; if (logging_active || (vma->vm_flags & VM_PFNMAP) || !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) { @@ -1741,7 +1739,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable); if (pfn == KVM_PFN_ERR_HWPOISON) { - kvm_send_hwpoison_signal(hva, vma); + kvm_send_hwpoison_signal(hva, vma_shift); return 0; } if (is_error_noslot_pfn(pfn)) From a425372e733177eb0779748956bc16c85167af48 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 8 Jan 2020 13:43:22 +0000 Subject: [PATCH 051/204] KVM: arm64: Correct PSTATE on exception entry When KVM injects an exception into a guest, it generates the PSTATE value from scratch, configuring PSTATE.{M[4:0],DAIF}, and setting all other bits to zero. This isn't correct, as the architecture specifies that some PSTATE bits are (conditionally) cleared or set upon an exception, and others are unchanged from the original context. This patch adds logic to match the architectural behaviour. To make this simple to follow/audit/extend, documentation references are provided, and bits are configured in order of their layout in SPSR_EL2. This layout can be seen in the diagram on ARM DDI 0487E.a page C5-429. 
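The core of the new behaviour fits in a few lines. A toy model follows (except64_pstate() is a hypothetical stand-in for get_except64_pstate() in the diff below, ignoring the SCTLR_EL1-dependent PAN/SSBS handling): NZCV and DIT are inherited from the old PSTATE, DAIF is masked, and the mode is forced to EL1h.

	#include <stdint.h>
	#include <stdio.h>

	#define PSR_MODE_EL1h	0x05ULL
	#define PSR_F_BIT	(1ULL << 6)
	#define PSR_I_BIT	(1ULL << 7)
	#define PSR_A_BIT	(1ULL << 8)
	#define PSR_D_BIT	(1ULL << 9)
	#define PSR_DIT_BIT	(1ULL << 24)
	#define PSR_NZCV_MASK	(0xfULL << 28)

	static uint64_t except64_pstate(uint64_t old)
	{
		uint64_t new = old & (PSR_NZCV_MASK | PSR_DIT_BIT);	/* inherited */

		new |= PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT;	/* DAIF masked */
		return new | PSR_MODE_EL1h;				/* target mode */
	}

	int main(void)
	{
		/* N and Z set before the exception survive it; DAIF and mode are forced. */
		printf("%#llx\n", (unsigned long long)except64_pstate(0xc0000000));
		return 0;
	}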
Signed-off-by: Mark Rutland Signed-off-by: Marc Zyngier Reviewed-by: Alexandru Elisei Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20200108134324.46500-2-mark.rutland@arm.com --- arch/arm64/include/uapi/asm/ptrace.h | 1 + arch/arm64/kvm/inject_fault.c | 70 ++++++++++++++++++++++++++-- 2 files changed, 66 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/uapi/asm/ptrace.h b/arch/arm64/include/uapi/asm/ptrace.h index 7ed9294e20045..d1bb5b69f1ce4 100644 --- a/arch/arm64/include/uapi/asm/ptrace.h +++ b/arch/arm64/include/uapi/asm/ptrace.h @@ -49,6 +49,7 @@ #define PSR_SSBS_BIT 0x00001000 #define PSR_PAN_BIT 0x00400000 #define PSR_UAO_BIT 0x00800000 +#define PSR_DIT_BIT 0x01000000 #define PSR_V_BIT 0x10000000 #define PSR_C_BIT 0x20000000 #define PSR_Z_BIT 0x40000000 diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c index ccdb6a051ab21..6aafc2825c1cb 100644 --- a/arch/arm64/kvm/inject_fault.c +++ b/arch/arm64/kvm/inject_fault.c @@ -14,9 +14,6 @@ #include #include -#define PSTATE_FAULT_BITS_64 (PSR_MODE_EL1h | PSR_A_BIT | PSR_F_BIT | \ - PSR_I_BIT | PSR_D_BIT) - #define CURRENT_EL_SP_EL0_VECTOR 0x0 #define CURRENT_EL_SP_ELx_VECTOR 0x200 #define LOWER_EL_AArch64_VECTOR 0x400 @@ -50,6 +47,69 @@ static u64 get_except_vector(struct kvm_vcpu *vcpu, enum exception_type type) return vcpu_read_sys_reg(vcpu, VBAR_EL1) + exc_offset + type; } +/* + * When an exception is taken, most PSTATE fields are left unchanged in the + * handler. However, some are explicitly overridden (e.g. M[4:0]). Luckily all + * of the inherited bits have the same position in the AArch64/AArch32 SPSR_ELx + * layouts, so we don't need to shuffle these for exceptions from AArch32 EL0. + * + * For the SPSR_ELx layout for AArch64, see ARM DDI 0487E.a page C5-429. + * For the SPSR_ELx layout for AArch32, see ARM DDI 0487E.a page C5-426. + * + * Here we manipulate the fields in order of the AArch64 SPSR_ELx layout, from + * MSB to LSB. + */ +static unsigned long get_except64_pstate(struct kvm_vcpu *vcpu) +{ + unsigned long sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1); + unsigned long old, new; + + old = *vcpu_cpsr(vcpu); + new = 0; + + new |= (old & PSR_N_BIT); + new |= (old & PSR_Z_BIT); + new |= (old & PSR_C_BIT); + new |= (old & PSR_V_BIT); + + // TODO: TCO (if/when ARMv8.5-MemTag is exposed to guests) + + new |= (old & PSR_DIT_BIT); + + // PSTATE.UAO is set to zero upon any exception to AArch64 + // See ARM DDI 0487E.a, page D5-2579. + + // PSTATE.PAN is unchanged unless SCTLR_ELx.SPAN == 0b0 + // SCTLR_ELx.SPAN is RES1 when ARMv8.1-PAN is not implemented + // See ARM DDI 0487E.a, page D5-2578. + new |= (old & PSR_PAN_BIT); + if (!(sctlr & SCTLR_EL1_SPAN)) + new |= PSR_PAN_BIT; + + // PSTATE.SS is set to zero upon any exception to AArch64 + // See ARM DDI 0487E.a, page D2-2452. + + // PSTATE.IL is set to zero upon any exception to AArch64 + // See ARM DDI 0487E.a, page D1-2306. + + // PSTATE.SSBS is set to SCTLR_ELx.DSSBS upon any exception to AArch64 + // See ARM DDI 0487E.a, page D13-3258 + if (sctlr & SCTLR_ELx_DSSBS) + new |= PSR_SSBS_BIT; + + // PSTATE.BTYPE is set to zero upon any exception to AArch64 + // See ARM DDI 0487E.a, pages D1-2293 to D1-2294. 
+ + new |= PSR_D_BIT; + new |= PSR_A_BIT; + new |= PSR_I_BIT; + new |= PSR_F_BIT; + + new |= PSR_MODE_EL1h; + + return new; +} + static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr) { unsigned long cpsr = *vcpu_cpsr(vcpu); @@ -59,7 +119,7 @@ static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr vcpu_write_elr_el1(vcpu, *vcpu_pc(vcpu)); *vcpu_pc(vcpu) = get_except_vector(vcpu, except_type_sync); - *vcpu_cpsr(vcpu) = PSTATE_FAULT_BITS_64; + *vcpu_cpsr(vcpu) = get_except64_pstate(vcpu); vcpu_write_spsr(vcpu, cpsr); vcpu_write_sys_reg(vcpu, addr, FAR_EL1); @@ -94,7 +154,7 @@ static void inject_undef64(struct kvm_vcpu *vcpu) vcpu_write_elr_el1(vcpu, *vcpu_pc(vcpu)); *vcpu_pc(vcpu) = get_except_vector(vcpu, except_type_sync); - *vcpu_cpsr(vcpu) = PSTATE_FAULT_BITS_64; + *vcpu_cpsr(vcpu) = get_except64_pstate(vcpu); vcpu_write_spsr(vcpu, cpsr); /* From 3c2483f15499b877ccb53250d88addb8c91da147 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 8 Jan 2020 13:43:23 +0000 Subject: [PATCH 052/204] KVM: arm/arm64: Correct CPSR on exception entry When KVM injects an exception into a guest, it generates the CPSR value from scratch, configuring CPSR.{M,A,I,T,E}, and setting all other bits to zero. This isn't correct, as the architecture specifies that some CPSR bits are (conditionally) cleared or set upon an exception, and others are unchanged from the original context. This patch adds logic to match the architectural behaviour. To make this simple to follow/audit/extend, documentation references are provided, and bits are configured in order of their layout in SPSR_EL2. This layout can be seen in the diagram on ARM DDI 0487E.a page C5-426. Note that this code is used by both arm and arm64, and is intended to function with the SPSR_EL2 and SPSR_HYP layouts.
Signed-off-by: Mark Rutland Signed-off-by: Marc Zyngier Reviewed-by: Alexandru Elisei Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20200108134324.46500-3-mark.rutland@arm.com --- arch/arm/include/asm/kvm_emulate.h | 12 ++++ arch/arm64/include/asm/ptrace.h | 1 + virt/kvm/arm/aarch32.c | 111 ++++++++++++++++++++++++++--- 3 files changed, 114 insertions(+), 10 deletions(-) diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h index fe55d8737a117..c488c629e6c88 100644 --- a/arch/arm/include/asm/kvm_emulate.h +++ b/arch/arm/include/asm/kvm_emulate.h @@ -14,13 +14,25 @@ #include /* arm64 compatibility macros */ +#define PSR_AA32_MODE_FIQ FIQ_MODE +#define PSR_AA32_MODE_SVC SVC_MODE #define PSR_AA32_MODE_ABT ABT_MODE #define PSR_AA32_MODE_UND UND_MODE #define PSR_AA32_T_BIT PSR_T_BIT +#define PSR_AA32_F_BIT PSR_F_BIT #define PSR_AA32_I_BIT PSR_I_BIT #define PSR_AA32_A_BIT PSR_A_BIT #define PSR_AA32_E_BIT PSR_E_BIT #define PSR_AA32_IT_MASK PSR_IT_MASK +#define PSR_AA32_GE_MASK 0x000f0000 +#define PSR_AA32_DIT_BIT 0x00200000 +#define PSR_AA32_PAN_BIT 0x00400000 +#define PSR_AA32_SSBS_BIT 0x00800000 +#define PSR_AA32_Q_BIT PSR_Q_BIT +#define PSR_AA32_V_BIT PSR_V_BIT +#define PSR_AA32_C_BIT PSR_C_BIT +#define PSR_AA32_Z_BIT PSR_Z_BIT +#define PSR_AA32_N_BIT PSR_N_BIT unsigned long *vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num); diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index fbebb411ae202..bf57308fcd635 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -62,6 +62,7 @@ #define PSR_AA32_I_BIT 0x00000080 #define PSR_AA32_A_BIT 0x00000100 #define PSR_AA32_E_BIT 0x00000200 +#define PSR_AA32_PAN_BIT 0x00400000 #define PSR_AA32_SSBS_BIT 0x00800000 #define PSR_AA32_DIT_BIT 0x01000000 #define PSR_AA32_Q_BIT 0x08000000 diff --git a/virt/kvm/arm/aarch32.c b/virt/kvm/arm/aarch32.c index c4c57ba99e908..773cf14390813 100644 --- a/virt/kvm/arm/aarch32.c +++ b/virt/kvm/arm/aarch32.c @@ -10,6 +10,7 @@ * Author: Christoffer Dall */ +#include #include #include #include @@ -28,22 +29,112 @@ static const u8 return_offsets[8][2] = { [7] = { 4, 4 }, /* FIQ, unused */ }; +/* + * When an exception is taken, most CPSR fields are left unchanged in the + * handler. However, some are explicitly overridden (e.g. M[4:0]). + * + * The SPSR/SPSR_ELx layouts differ, and the below is intended to work with + * either format. Note: SPSR.J bit doesn't exist in SPSR_ELx, but this bit was + * obsoleted by the ARMv7 virtualization extensions and is RES0. + * + * For the SPSR layout seen from AArch32, see: + * - ARM DDI 0406C.d, page B1-1148 + * - ARM DDI 0487E.a, page G8-6264 + * + * For the SPSR_ELx layout for AArch32 seen from AArch64, see: + * - ARM DDI 0487E.a, page C5-426 + * + * Here we manipulate the fields in order of the AArch32 SPSR_ELx layout, from + * MSB to LSB. 
+ */ +static unsigned long get_except32_cpsr(struct kvm_vcpu *vcpu, u32 mode) +{ + u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR); + unsigned long old, new; + + old = *vcpu_cpsr(vcpu); + new = 0; + + new |= (old & PSR_AA32_N_BIT); + new |= (old & PSR_AA32_Z_BIT); + new |= (old & PSR_AA32_C_BIT); + new |= (old & PSR_AA32_V_BIT); + new |= (old & PSR_AA32_Q_BIT); + + // CPSR.IT[7:0] are set to zero upon any exception + // See ARM DDI 0487E.a, section G1.12.3 + // See ARM DDI 0406C.d, section B1.8.3 + + new |= (old & PSR_AA32_DIT_BIT); + + // CPSR.SSBS is set to SCTLR.DSSBS upon any exception + // See ARM DDI 0487E.a, page G8-6244 + if (sctlr & BIT(31)) + new |= PSR_AA32_SSBS_BIT; + + // CPSR.PAN is unchanged unless SCTLR.SPAN == 0b0 + // SCTLR.SPAN is RES1 when ARMv8.1-PAN is not implemented + // See ARM DDI 0487E.a, page G8-6246 + new |= (old & PSR_AA32_PAN_BIT); + if (!(sctlr & BIT(23))) + new |= PSR_AA32_PAN_BIT; + + // SS does not exist in AArch32, so ignore + + // CPSR.IL is set to zero upon any exception + // See ARM DDI 0487E.a, page G1-5527 + + new |= (old & PSR_AA32_GE_MASK); + + // CPSR.IT[7:0] are set to zero upon any exception + // See prior comment above + + // CPSR.E is set to SCTLR.EE upon any exception + // See ARM DDI 0487E.a, page G8-6245 + // See ARM DDI 0406C.d, page B4-1701 + if (sctlr & BIT(25)) + new |= PSR_AA32_E_BIT; + + // CPSR.A is unchanged upon an exception to Undefined, Supervisor + // CPSR.A is set upon an exception to other modes + // See ARM DDI 0487E.a, pages G1-5515 to G1-5516 + // See ARM DDI 0406C.d, page B1-1182 + new |= (old & PSR_AA32_A_BIT); + if (mode != PSR_AA32_MODE_UND && mode != PSR_AA32_MODE_SVC) + new |= PSR_AA32_A_BIT; + + // CPSR.I is set upon any exception + // See ARM DDI 0487E.a, pages G1-5515 to G1-5516 + // See ARM DDI 0406C.d, page B1-1182 + new |= PSR_AA32_I_BIT; + + // CPSR.F is set upon an exception to FIQ + // CPSR.F is unchanged upon an exception to other modes + // See ARM DDI 0487E.a, pages G1-5515 to G1-5516 + // See ARM DDI 0406C.d, page B1-1182 + new |= (old & PSR_AA32_F_BIT); + if (mode == PSR_AA32_MODE_FIQ) + new |= PSR_AA32_F_BIT; + + // CPSR.T is set to SCTLR.TE upon any exception + // See ARM DDI 0487E.a, page G8-5514 + // See ARM DDI 0406C.d, page B1-1181 + if (sctlr & BIT(30)) + new |= PSR_AA32_T_BIT; + + new |= mode; + + return new; +} + static void prepare_fault32(struct kvm_vcpu *vcpu, u32 mode, u32 vect_offset) { - unsigned long cpsr; unsigned long new_spsr_value = *vcpu_cpsr(vcpu); bool is_thumb = (new_spsr_value & PSR_AA32_T_BIT); u32 return_offset = return_offsets[vect_offset >> 2][is_thumb]; u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR); - cpsr = mode | PSR_AA32_I_BIT; - - if (sctlr & (1 << 30)) - cpsr |= PSR_AA32_T_BIT; - if (sctlr & (1 << 25)) - cpsr |= PSR_AA32_E_BIT; - - *vcpu_cpsr(vcpu) = cpsr; + *vcpu_cpsr(vcpu) = get_except32_cpsr(vcpu, mode); /* Note: These now point to the banked copies */ vcpu_write_spsr(vcpu, new_spsr_value); @@ -84,7 +175,7 @@ static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt, fsr = &vcpu_cp15(vcpu, c5_DFSR); } - prepare_fault32(vcpu, PSR_AA32_MODE_ABT | PSR_AA32_A_BIT, vect_offset); + prepare_fault32(vcpu, PSR_AA32_MODE_ABT, vect_offset); *far = addr; From 1cfbb484de158e378e8971ac40f3082e53ecca55 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 8 Jan 2020 13:43:24 +0000 Subject: [PATCH 053/204] KVM: arm/arm64: Correct AArch32 SPSR on exception entry Confusingly, there are three SPSR layouts that a kernel may need to deal with: (1) An AArch64 SPSR_ELx view of an AArch64 pstate (2) An 
AArch64 SPSR_ELx view of an AArch32 pstate (3) An AArch32 SPSR_* view of an AArch32 pstate When the KVM AArch32 support code deals with SPSR_{EL2,HYP}, it's either dealing with #2 or #3 consistently. On arm64 the PSR_AA32_* definitions match the AArch64 SPSR_ELx view, and on arm the PSR_AA32_* definitions match the AArch32 SPSR_* view. However, when we inject an exception into an AArch32 guest, we have to synthesize the AArch32 SPSR_* that the guest will see. Thus, an AArch64 host needs to synthesize layout #3 from layout #2. This patch adds a new host_spsr_to_spsr32() helper for this, and makes use of it in the KVM AArch32 support code. For arm64 we need to shuffle the DIT bit around, and remove the SS bit, while for arm we can use the value as-is. I've open-coded the bit manipulation for now to avoid having to rework the existing PSR_* definitions into PSR64_AA32_* and PSR32_AA32_* definitions. I hope to perform a more thorough refactoring in future so that we can handle pstate view manipulation more consistently across the kernel tree. Signed-off-by: Mark Rutland Signed-off-by: Marc Zyngier Reviewed-by: Alexandru Elisei Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20200108134324.46500-4-mark.rutland@arm.com --- arch/arm/include/asm/kvm_emulate.h | 5 +++++ arch/arm64/include/asm/kvm_emulate.h | 32 ++++++++++++++++++++++++++++ virt/kvm/arm/aarch32.c | 6 +++--- 3 files changed, 40 insertions(+), 3 deletions(-) diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h index c488c629e6c88..08d9805f613b6 100644 --- a/arch/arm/include/asm/kvm_emulate.h +++ b/arch/arm/include/asm/kvm_emulate.h @@ -53,6 +53,11 @@ static inline void vcpu_write_spsr(struct kvm_vcpu *vcpu, unsigned long v) *__vcpu_spsr(vcpu) = v; } +static inline unsigned long host_spsr_to_spsr32(unsigned long spsr) +{ + return spsr; +} + static inline unsigned long vcpu_get_reg(struct kvm_vcpu *vcpu, u8 reg_num) { diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index f407b6bdad2eb..53ea7637b7b23 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -219,6 +219,38 @@ static inline void vcpu_write_spsr(struct kvm_vcpu *vcpu, unsigned long v) vcpu_gp_regs(vcpu)->spsr[KVM_SPSR_EL1] = v; } +/* + * The layout of SPSR for an AArch32 state is different when observed from an + * AArch64 SPSR_ELx or an AArch32 SPSR_*. This function generates the AArch32 + * view given an AArch64 view. + * + * In ARM DDI 0487E.a see: + * + * - The AArch64 view (SPSR_EL2) in section C5.2.18, page C5-426 + * - The AArch32 view (SPSR_abt) in section G8.2.126, page G8-6256 + * - The AArch32 view (SPSR_und) in section G8.2.132, page G8-6280 + * + * Which show the following differences: + * + * | Bit | AA64 | AA32 | Notes | + * +-----+------+------+-----------------------------| + * | 24 | DIT | J | J is RES0 in ARMv8 | + * | 21 | SS | DIT | SS doesn't exist in AArch32 | + * + * ... and all other bits are (currently) common. 
+ */ +static inline unsigned long host_spsr_to_spsr32(unsigned long spsr) +{ + const unsigned long overlap = BIT(24) | BIT(21); + unsigned long dit = !!(spsr & PSR_AA32_DIT_BIT); + + spsr &= ~overlap; + + spsr |= dit << 21; + + return spsr; +} + static inline bool vcpu_mode_priv(const struct kvm_vcpu *vcpu) { u32 mode; diff --git a/virt/kvm/arm/aarch32.c b/virt/kvm/arm/aarch32.c index 773cf14390813..631d397ac81b7 100644 --- a/virt/kvm/arm/aarch32.c +++ b/virt/kvm/arm/aarch32.c @@ -129,15 +129,15 @@ static unsigned long get_except32_cpsr(struct kvm_vcpu *vcpu, u32 mode) static void prepare_fault32(struct kvm_vcpu *vcpu, u32 mode, u32 vect_offset) { - unsigned long new_spsr_value = *vcpu_cpsr(vcpu); - bool is_thumb = (new_spsr_value & PSR_AA32_T_BIT); + unsigned long spsr = *vcpu_cpsr(vcpu); + bool is_thumb = (spsr & PSR_AA32_T_BIT); u32 return_offset = return_offsets[vect_offset >> 2][is_thumb]; u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR); *vcpu_cpsr(vcpu) = get_except32_cpsr(vcpu, mode); /* Note: These now point to the banked copies */ - vcpu_write_spsr(vcpu, new_spsr_value); + vcpu_write_spsr(vcpu, host_spsr_to_spsr32(spsr)); *vcpu_reg32(vcpu, 14) = *vcpu_pc(vcpu) + return_offset; /* Branch to exception vector */ From 821c10c2ae0bac5a8503cc7e961e7af90ea676eb Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Tue, 14 Jan 2020 19:22:12 +0800 Subject: [PATCH 054/204] KVM: arm/arm64: vgic-its: Properly check the unmapped coll in DISCARD handler Discard is supposed to fail if the collection is not mapped to any target redistributor. We currently check if the collection is mapped by "ite->collection" but this is incomplete (e.g., mapping an LPI to an unmapped collection also results in a non-NULL ite->collection). What actually needs to be checked is its_is_collection_mapped(), so switch to using it. Also take this chance to remove an extra blank line. Signed-off-by: Zenghui Yu Signed-off-by: Marc Zyngier Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20200114112212.1411-1-yuzenghui@huawei.com --- virt/kvm/arm/vgic/vgic-its.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c index 17920d1b350a2..d53d34a33e35d 100644 --- a/virt/kvm/arm/vgic/vgic-its.c +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -839,9 +839,8 @@ static int vgic_its_cmd_handle_discard(struct kvm *kvm, struct vgic_its *its, u32 event_id = its_cmd_get_id(its_cmd); struct its_ite *ite; - ite = find_ite(its, device_id, event_id); - if (ite && ite->collection) { + if (ite && its_is_collection_mapped(ite->collection)) { /* * Though the spec talks about removing the pending state, we * don't bother here since we clear the ITTE anyway and the From 31a9b0b11b1c5264433a4fa1e1e1e8aa03954b1c Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Sun, 19 Jan 2020 17:06:04 +0800 Subject: [PATCH 055/204] KVM: arm/arm64: vgic: Drop the kvm_vgic_register_mmio_region() kvm_vgic_register_mmio_region() was introduced in commit 4493b1c4866a ("KVM: arm/arm64: vgic-new: Add MMIO handling framework") but it was never used, and in fact never implemented. Remove it to avoid confusing readers.
Reported-by: Haibin Wang Signed-off-by: Zenghui Yu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200119090604.398-1-yuzenghui@huawei.com --- virt/kvm/arm/vgic/vgic-mmio.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/virt/kvm/arm/vgic/vgic-mmio.h b/virt/kvm/arm/vgic/vgic-mmio.h index 836f418f1ee80..5af2aefad4359 100644 --- a/virt/kvm/arm/vgic/vgic-mmio.h +++ b/virt/kvm/arm/vgic/vgic-mmio.h @@ -98,11 +98,6 @@ extern struct kvm_io_device_ops kvm_io_gic_ops; .uaccess_write = uwr, \ } -int kvm_vgic_register_mmio_region(struct kvm *kvm, struct kvm_vcpu *vcpu, - struct vgic_register_region *reg_desc, - struct vgic_io_device *region, - int nr_irqs, bool offset_private); - unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len); void vgic_data_host_to_mmio_bus(void *buf, unsigned int len, From 1e9e2622a149e88bd636c9f8fb346a6e6aefeae0 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 21 Nov 2019 11:17:11 +0800 Subject: [PATCH 056/204] KVM: VMX: FIXED+PHYSICAL mode single target IPI fastpath MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In our production observations, writes to the ICR and TSCDEADLINE MSRs cause the bulk of MSR-write vmexits, and multicast IPIs are not as common as unicast IPIs such as RESCHEDULE_VECTOR and CALL_FUNCTION_SINGLE_VECTOR. This patch introduces a mechanism to handle certain performance-critical WRMSRs at a very early stage of the KVM VMExit handler. The mechanism is specifically used to accelerate writes to the x2APIC ICR that attempt to send a virtual IPI with physical destination mode, fixed delivery mode and a single target, which was found to be one of the main causes of VMExits for Linux workloads. The mechanism significantly reduces the latency of such virtual IPIs by sending the physical IPI to the target vCPU at a very early stage of the KVM VMExit handler, before host interrupts are enabled and before expensive operations such as reacquiring KVM’s SRCU lock. Latency is reduced even more when KVM is able to use the APICv posted-interrupt mechanism, which delivers the virtual IPI directly to the target vCPU without needing to kick it on the host. Testing on a Xeon Skylake server shows the virtual IPI latency, from send on the source vCPU to receipt on the target, drops by more than 200 CPU cycles.
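For reference, a sketch of the guest-side write that this fastpath targets (illustrative only; 0x830 is the architectural x2APIC ICR MSR, i.e. APIC_BASE_MSR + (APIC_ICR >> 4), and target_apic_id/vector are placeholder variables):

    /* Fixed delivery mode, physical destination mode, single target vCPU. */
    u64 icr_val = ((u64)target_apic_id << 32) |  /* destination in ICR[63:32] */
                  APIC_DEST_PHYSICAL |           /* physical destination mode */
                  APIC_DM_FIXED |                /* fixed delivery mode */
                  vector;                        /* interrupt vector */
    wrmsrl(0x830, icr_val);

Writes that do not match this shape (logical destination, non-fixed delivery, broadcasts) fall back to the full emulation path via EXIT_FASTPATH_NONE.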
Reviewed-by: Liran Alon Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Sean Christopherson Cc: Vitaly Kuznetsov Cc: Liran Alon Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 11 ++++++-- arch/x86/kvm/svm.c | 15 ++++++++--- arch/x86/kvm/vmx/vmx.c | 14 +++++++--- arch/x86/kvm/x86.c | 48 +++++++++++++++++++++++++++++++-- arch/x86/kvm/x86.h | 1 + 5 files changed, 78 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 159a28512e4c3..e2b7934772431 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -175,6 +175,11 @@ enum { VCPU_SREG_LDTR, }; +enum exit_fastpath_completion { + EXIT_FASTPATH_NONE, + EXIT_FASTPATH_SKIP_EMUL_INS, +}; + #include #define KVM_NR_MEM_OBJS 40 @@ -1095,7 +1100,8 @@ struct kvm_x86_ops { void (*tlb_flush_gva)(struct kvm_vcpu *vcpu, gva_t addr); void (*run)(struct kvm_vcpu *vcpu); - int (*handle_exit)(struct kvm_vcpu *vcpu); + int (*handle_exit)(struct kvm_vcpu *vcpu, + enum exit_fastpath_completion exit_fastpath); int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu); @@ -1145,7 +1151,8 @@ struct kvm_x86_ops { int (*check_intercept)(struct kvm_vcpu *vcpu, struct x86_instruction_info *info, enum x86_intercept_stage stage); - void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu); + void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu, + enum exit_fastpath_completion *exit_fastpath); bool (*mpx_supported)(void); bool (*xsaves_supported)(void); bool (*umip_emulated)(void); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 8f1b715dfde83..9583ae7ae218f 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4935,7 +4935,8 @@ static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) *info2 = control->exit_info_2; } -static int handle_exit(struct kvm_vcpu *vcpu) +static int handle_exit(struct kvm_vcpu *vcpu, + enum exit_fastpath_completion exit_fastpath) { struct vcpu_svm *svm = to_svm(vcpu); struct kvm_run *kvm_run = vcpu->run; @@ -4993,7 +4994,10 @@ static int handle_exit(struct kvm_vcpu *vcpu) __func__, svm->vmcb->control.exit_int_info, exit_code); - if (exit_code >= ARRAY_SIZE(svm_exit_handlers) + if (exit_fastpath == EXIT_FASTPATH_SKIP_EMUL_INS) { + kvm_skip_emulated_instruction(vcpu); + return 1; + } else if (exit_code >= ARRAY_SIZE(svm_exit_handlers) || !svm_exit_handlers[exit_code]) { vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%x\n", exit_code); dump_vmcb(vcpu); @@ -6186,9 +6190,12 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, return ret; } -static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu) +static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu, + enum exit_fastpath_completion *exit_fastpath) { - + if (!is_guest_mode(vcpu) && + to_svm(vcpu)->vmcb->control.exit_code == EXIT_REASON_MSR_WRITE) + *exit_fastpath = handle_fastpath_set_msr_irqoff(vcpu); } static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index b5a0c2e05825e..48a3af8fac0f2 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5814,7 +5814,8 @@ void dump_vmcs(void) * The guest has exited. See if we can fix it or if we need userspace * assistance. 
*/ -static int vmx_handle_exit(struct kvm_vcpu *vcpu) +static int vmx_handle_exit(struct kvm_vcpu *vcpu, + enum exit_fastpath_completion exit_fastpath) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 exit_reason = vmx->exit_reason; @@ -5900,7 +5901,10 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) } } - if (exit_reason < kvm_vmx_max_exit_handlers + if (exit_fastpath == EXIT_FASTPATH_SKIP_EMUL_INS) { + kvm_skip_emulated_instruction(vcpu); + return 1; + } else if (exit_reason < kvm_vmx_max_exit_handlers && kvm_vmx_exit_handlers[exit_reason]) { #ifdef CONFIG_RETPOLINE if (exit_reason == EXIT_REASON_MSR_WRITE) @@ -6248,7 +6252,8 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) } STACK_FRAME_NON_STANDARD(handle_external_interrupt_irqoff); -static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) +static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu, + enum exit_fastpath_completion *exit_fastpath) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6256,6 +6261,9 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) handle_external_interrupt_irqoff(vcpu); else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI) handle_exception_nmi_irqoff(vmx); + else if (!is_guest_mode(vcpu) && + vmx->exit_reason == EXIT_REASON_MSR_WRITE) + *exit_fastpath = handle_fastpath_set_msr_irqoff(vcpu); } static bool vmx_has_emulated_msr(int index) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 93bbbce67a03e..33e165c0351d0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1525,6 +1525,49 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr); +/* + * The fast path for frequent and performance sensitive wrmsr emulation, + * i.e. the sending of IPI, sending IPI early in the VM-Exit flow reduces + * the latency of virtual IPI by avoiding the expensive bits of transitioning + * from guest to host, e.g. reacquiring KVM's SRCU lock. In contrast to the + * other cases which must be called after interrupts are enabled on the host. 
+ */ +static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data) +{ + if (lapic_in_kernel(vcpu) && apic_x2apic_mode(vcpu->arch.apic) && + ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) && + ((data & APIC_MODE_MASK) == APIC_DM_FIXED)) { + + kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32)); + return kvm_lapic_reg_write(vcpu->arch.apic, APIC_ICR, (u32)data); + } + + return 1; +} + +enum exit_fastpath_completion handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) +{ + u32 msr = kvm_rcx_read(vcpu); + u64 data = kvm_read_edx_eax(vcpu); + int ret = 0; + + switch (msr) { + case APIC_BASE_MSR + (APIC_ICR >> 4): + ret = handle_fastpath_set_x2apic_icr_irqoff(vcpu, data); + break; + default: + return EXIT_FASTPATH_NONE; + } + + if (!ret) { + trace_kvm_msr_write(msr, data); + return EXIT_FASTPATH_SKIP_EMUL_INS; + } + + return EXIT_FASTPATH_NONE; +} +EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff); + /* * Adapt set_msr() to msr_io()'s calling convention */ @@ -7995,6 +8038,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) bool req_int_win = dm_request_for_irq_injection(vcpu) && kvm_cpu_accept_dm_intr(vcpu); + enum exit_fastpath_completion exit_fastpath = EXIT_FASTPATH_NONE; bool req_immediate_exit = false; @@ -8241,7 +8285,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); - kvm_x86_ops->handle_exit_irqoff(vcpu); + kvm_x86_ops->handle_exit_irqoff(vcpu, &exit_fastpath); /* * Consume any pending interrupts, including the possible source of @@ -8285,7 +8329,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_lapic_sync_from_vapic(vcpu); vcpu->arch.gpa_available = false; - r = kvm_x86_ops->handle_exit(vcpu); + r = kvm_x86_ops->handle_exit(vcpu, exit_fastpath); return r; cancel_injection: diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index cab5e71f0f0fd..9805cf2c6b351 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -291,6 +291,7 @@ bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, bool kvm_vector_hashing_enabled(void); int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int emulation_type, void *insn, int insn_len); +enum exit_fastpath_completion handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu); #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ From dfd146fcae8974d40ef6dcfc047d7a1631e064d9 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 21 Nov 2019 11:17:12 +0800 Subject: [PATCH 057/204] KVM: LAPIC: micro-optimize fixed mode ipi delivery This patch optimizes the redundancy logic that runs before a fixed-mode IPI is delivered in the fast path: broadcasts have to take the slow path anyway, so the delivery-mode repair can be deferred until just before the slow path.
Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/irq_comm.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 9d711c2451c79..79afa0bb5f410 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -52,15 +52,15 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, unsigned long dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)]; unsigned int dest_vcpus = 0; + if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map)) + return r; + if (irq->dest_mode == APIC_DEST_PHYSICAL && irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) { printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n"); irq->delivery_mode = APIC_DM_FIXED; } - if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map)) - return r; - memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap)); kvm_for_each_vcpu(i, vcpu, kvm) { From cad23e72b7742578fad2e4ec8856d376ec8db923 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 7 Dec 2019 17:25:22 +0800 Subject: [PATCH 058/204] KVM: x86: check kvm_pit outside kvm_vm_ioctl_reinject() Check kvm_pit outside kvm_vm_ioctl_reinject() to keep the code style consistent with the other kvm_pit functions and to prepare for further cleanups. Signed-off-by: Miaohe Lin Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 33e165c0351d0..e62d2688bfec9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4703,9 +4703,6 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, { struct kvm_pit *pit = kvm->arch.vpit; - if (!pit) - return -ENXIO; - /* pit->pit_state.lock was overloaded to prevent userspace from getting * an inconsistent state after running multiple KVM_REINJECT_CONTROL * ioctls in parallel. Use a separate lock if that ioctl isn't rare. */ @@ -5072,6 +5069,9 @@ long kvm_arch_vm_ioctl(struct file *filp, r = -EFAULT; if (copy_from_user(&control, argp, sizeof(control))) goto out; + r = -ENXIO; + if (!kvm->arch.vpit) + goto out; r = kvm_vm_ioctl_reinject(kvm, &control); break; } From 668effb63de8962e931196e4ebeae8387bfe6d3c Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 11 Dec 2019 14:26:20 +0800 Subject: [PATCH 059/204] KVM: Fix some wrong function names in comment Fix some wrong function names in comments: mmu_check_roots is a typo for mmu_check_root, vmcs_read_any should be vmcs12_read_any, and so on. Signed-off-by: Miaohe Lin Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmcs_shadow_fields.h | 2 +- virt/kvm/kvm_main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/vmcs_shadow_fields.h b/arch/x86/kvm/vmx/vmcs_shadow_fields.h index eb1ecd16fd220..5f3d95c18c397 100644 --- a/arch/x86/kvm/vmx/vmcs_shadow_fields.h +++ b/arch/x86/kvm/vmx/vmcs_shadow_fields.h @@ -28,7 +28,7 @@ BUILD_BUG_ON(1) /* * Keeping the fields ordered by size is an attempt at improving - * branch prediction in vmcs_read_any and vmcs_write_any. + * branch prediction in vmcs12_read_any and vmcs12_write_any.
*/ /* 16-bits */ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 3aa21bec028d6..63df3586f0629 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1117,7 +1117,7 @@ int __kvm_set_memory_region(struct kvm *kvm, * * validation of sp->gfn happens in: * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) - * - kvm_is_visible_gfn (mmu_check_roots) + * - kvm_is_visible_gfn (mmu_check_root) */ kvm_arch_flush_shadow_memslot(kvm, slot); From 4d516fe7d38555e8fa08de5d68168b011a144fcb Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 11 Dec 2019 14:26:21 +0800 Subject: [PATCH 060/204] KVM: Fix some out-dated function names in comment Since commit b1346ab2afbe ("KVM: nVMX: Rename prepare_vmcs02_*_full to prepare_vmcs02_*_rare"), prepare_vmcs02_full has been renamed to prepare_vmcs02_rare. nested_vmx_merge_msr_bitmap is renamed to nested_vmx_prepare_msr_bitmap since commit c992384bde84 ("KVM: vmx: speed up MSR bitmap merge"). Signed-off-by: Miaohe Lin Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmcs_shadow_fields.h | 2 +- arch/x86/kvm/vmx/vmx.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/vmx/vmcs_shadow_fields.h b/arch/x86/kvm/vmx/vmcs_shadow_fields.h index 5f3d95c18c397..cad128d1657be 100644 --- a/arch/x86/kvm/vmx/vmcs_shadow_fields.h +++ b/arch/x86/kvm/vmx/vmcs_shadow_fields.h @@ -23,7 +23,7 @@ BUILD_BUG_ON(1) * * When adding or removing fields here, note that shadowed * fields must always be synced by prepare_vmcs02, not just - * prepare_vmcs02_full. + * prepare_vmcs02_rare. */ /* diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 48a3af8fac0f2..a2e61ae007dde 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2008,7 +2008,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * * For nested: * The handling of the MSR bitmap for L2 guests is done in - * nested_vmx_merge_msr_bitmap. We should not touch the + * nested_vmx_prepare_msr_bitmap. We should not touch the * vmcs02.msr_bitmap here since it gets completely overwritten * in the merging. We update the vmcs01 here for L1 as well * since it will end up touching the MSR anyway now. @@ -2044,7 +2044,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * * For nested: * The handling of the MSR bitmap for L2 guests is done in - * nested_vmx_merge_msr_bitmap. We should not touch the + * nested_vmx_prepare_msr_bitmap. We should not touch the * vmcs02.msr_bitmap here since it gets completely overwritten * in the merging. */ From 67b0ae43df179fb095f32a011446e7a883758877 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 11 Dec 2019 14:26:22 +0800 Subject: [PATCH 061/204] KVM: Fix some comment typos and missing parentheses Fix some typos and add missing parentheses in the comments. Signed-off-by: Miaohe Lin Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 2 +- arch/x86/kvm/lapic.c | 2 +- arch/x86/kvm/vmx/nested.c | 2 +- arch/x86/kvm/vmx/vmx.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index c7d4640b7b1c1..a48d5708f1f86 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1122,7 +1122,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) return 1; /* - * Clear apic_assist portion of f(struct hv_vp_assist_page + * Clear apic_assist portion of struct hv_vp_assist_page * only, there can be valuable data in the rest which needs * to be preserved e.g. on migration. 
*/ diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 679692b55f6d8..ea402e741bd53 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -969,7 +969,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, * - For single-destination interrupts, handle it in posted mode * - Else if vector hashing is enabled and it is a lowest-priority * interrupt, handle it in posted mode and use the following mechanism - * to find the destinaiton vCPU. + * to find the destination vCPU. * 1. For lowest-priority interrupts, store all the possible * destination vCPUs in an array. * 2. Use "guest vector % max number of destination vCPUs" to find diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 7b01ef1d87e66..63ab49de324db 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3427,7 +3427,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) /* * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date - * because L2 may have changed some cr0 bits directly (CRO_GUEST_HOST_MASK). + * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK). * This function returns the new value we should put in vmcs12.guest_cr0. * It's not enough to just return the vmcs02 GUEST_CR0. Rather, * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index a2e61ae007dde..6f7774f54f1c1 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6720,7 +6720,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) * If PML is turned on, failure on enabling PML just results in failure * of creating the vcpu, therefore we can simplify PML logic (by * avoiding dealing with cases, such as enabling PML partially on vcpus - * for the guest, etc. + * for the guest), etc. */ if (enable_pml) { vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); From 00116795aa87ca309a4cf1eaa3d82614807c8668 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 11 Dec 2019 14:26:23 +0800 Subject: [PATCH 062/204] KVM: Fix some grammar mistakes Fix some grammar mistakes in the comments. Signed-off-by: Miaohe Lin Signed-off-by: Paolo Bonzini --- arch/x86/kvm/ioapic.c | 2 +- arch/x86/kvm/lapic.c | 2 +- virt/kvm/kvm_main.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 77538fd77dc25..7312aab33298e 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -189,7 +189,7 @@ static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq, /* * Return 0 for coalesced interrupts; for edge-triggered interrupts, * this only happens if a previous edge has not been delivered due - * do masking. For level interrupts, the remote_irr field tells + * to masking. For level interrupts, the remote_irr field tells * us if the interrupt is waiting for an EOI. 
* * RTC is special: it is edge-triggered, but userspace likes to know diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index ea402e741bd53..88c3c0c6d1e34 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -964,7 +964,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, } /* - * This routine tries to handler interrupts in posted mode, here is how + * This routine tries to handle interrupts in posted mode, here is how * it deals with different cases: * - For single-destination interrupts, handle it in posted mode * - Else if vector hashing is enabled and it is a lowest-priority diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 63df3586f0629..f0501272268f2 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -964,7 +964,7 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm, /* * Increment the new memslot generation a second time, dropping the - * update in-progress flag and incrementing then generation based on + * update in-progress flag and incrementing the generation based on * the number of address spaces. This provides a unique and easily * identifiable generation number while the memslots are in flux. */ From 2f9f5cddb29b4fbdf2d328c7a6326d53227e6329 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 11 Dec 2019 14:26:24 +0800 Subject: [PATCH 063/204] KVM: hyperv: Fix some typos in vcpu unimpl info Fix some typos in vcpu unimpl info. It should be unhandled rather than uhandled. Signed-off-by: Miaohe Lin Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index a48d5708f1f86..b255b9e865e51 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1059,7 +1059,7 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, return 1; break; default: - vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", + vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n", msr, data); return 1; } @@ -1179,7 +1179,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) return 1; break; default: - vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", + vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n", msr, data); return 1; } From 311497e0c5565e7d9cf7b0987d17626b228b8fec Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 11 Dec 2019 14:26:25 +0800 Subject: [PATCH 064/204] KVM: Fix some writing mistakes Fix some writing mistakes in the comments. Signed-off-by: Miaohe Lin Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/vmx/vmx.c | 2 +- virt/kvm/kvm_main.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e2b7934772431..0b5c280644e58 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -611,7 +611,7 @@ struct kvm_vcpu_arch { * Paging state of an L2 guest (used for nested npt) * * This context will save all necessary information to walk page tables - * of the an L2 guest. This context is only initialized for page table + * of an L2 guest. This context is only initialized for page table * walking and not for faulting since we never handle l2 page faults on * the host. 
*/ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 6f7774f54f1c1..c2ced79aee3e1 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1914,7 +1914,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } /* - * Writes msr value into into the appropriate "register". + * Writes msr value into the appropriate "register". * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. */ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index f0501272268f2..1a6d5ebd5c422 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1519,7 +1519,7 @@ static inline int check_user_page_hwpoison(unsigned long addr) /* * The fast path to get the writable pfn which will be stored in @pfn, * true indicates success, otherwise false is returned. It's also the - * only part that runs if we can are in atomic context. + * only part that runs if we can in atomic context. */ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault, bool *writable, kvm_pfn_t *pfn) From fe6ed369fca98e99df55c932b85782a5687526b5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 10 Dec 2019 15:24:32 -0800 Subject: [PATCH 065/204] KVM: VMX: Add non-canonical check on writes to RTIT address MSRs Reject writes to RTIT address MSRs if the data being written is a non-canonical address as the MSRs are subject to canonical checks, e.g. KVM will trigger an unchecked #GP when loading the values to hardware during pt_guest_enter(). Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c2ced79aee3e1..aea4fa957fd2a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2144,6 +2144,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_num_address_ranges))) return 1; + if (is_noncanonical_address(data, vcpu)) + return 1; if (index % 2) vmx->pt_desc.guest.addr_b[index / 2] = data; else From e348ac7c9e34bb2109facaafd3eea65f47314a9d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 10 Dec 2019 15:24:33 -0800 Subject: [PATCH 066/204] KVM: VMX: Add helper to consolidate up PT/RTIT WRMSR fault logic Add a helper to consolidate the common checks for writing PT MSRs, and opportunistically clean up the formatting of the affected code. No functional change intended. 
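As background for the non-canonical check introduced in the previous patch, a minimal sketch of what such a test boils down to for 48-bit virtual addresses (illustrative only; KVM's actual helper also consults vcpu_virt_addr_bits() so that LA57's 57-bit canonical form is honored):

    static inline bool noncanonical_48(u64 la)
    {
        /* Canonical iff bits 63:47 are a sign-extension of bit 47. */
        return ((s64)la << 16 >> 16) != (s64)la;
    }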
Cc: Chao Peng Cc: Luwei Kang Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 55 ++++++++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 24 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index aea4fa957fd2a..857dd0898e5f1 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1057,6 +1057,12 @@ static unsigned long segment_base(u16 selector) } #endif +static inline bool pt_can_write_msr(struct vcpu_vmx *vmx) +{ + return (pt_mode == PT_MODE_HOST_GUEST) && + !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN); +} + static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range) { u32 i; @@ -2102,47 +2108,48 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) pt_update_intercept_for_msr(vmx); break; case MSR_IA32_RTIT_STATUS: - if ((pt_mode != PT_MODE_HOST_GUEST) || - (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) || - (data & MSR_IA32_RTIT_STATUS_MASK)) + if (!pt_can_write_msr(vmx)) + return 1; + if (data & MSR_IA32_RTIT_STATUS_MASK) return 1; vmx->pt_desc.guest.status = data; break; case MSR_IA32_RTIT_CR3_MATCH: - if ((pt_mode != PT_MODE_HOST_GUEST) || - (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) || - !intel_pt_validate_cap(vmx->pt_desc.caps, - PT_CAP_cr3_filtering)) + if (!pt_can_write_msr(vmx)) + return 1; + if (!intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_cr3_filtering)) return 1; vmx->pt_desc.guest.cr3_match = data; break; case MSR_IA32_RTIT_OUTPUT_BASE: - if ((pt_mode != PT_MODE_HOST_GUEST) || - (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) || - (!intel_pt_validate_cap(vmx->pt_desc.caps, - PT_CAP_topa_output) && - !intel_pt_validate_cap(vmx->pt_desc.caps, - PT_CAP_single_range_output)) || - (data & MSR_IA32_RTIT_OUTPUT_BASE_MASK)) + if (!pt_can_write_msr(vmx)) + return 1; + if (!intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_topa_output) && + !intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_single_range_output)) + return 1; + if (data & MSR_IA32_RTIT_OUTPUT_BASE_MASK) return 1; vmx->pt_desc.guest.output_base = data; break; case MSR_IA32_RTIT_OUTPUT_MASK: - if ((pt_mode != PT_MODE_HOST_GUEST) || - (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) || - (!intel_pt_validate_cap(vmx->pt_desc.caps, - PT_CAP_topa_output) && - !intel_pt_validate_cap(vmx->pt_desc.caps, - PT_CAP_single_range_output))) + if (!pt_can_write_msr(vmx)) + return 1; + if (!intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_topa_output) && + !intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_single_range_output)) return 1; vmx->pt_desc.guest.output_mask = data; break; case MSR_IA32_RTIT_ADDR0_A ... 
MSR_IA32_RTIT_ADDR3_B: + if (!pt_can_write_msr(vmx)) + return 1; index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; - if ((pt_mode != PT_MODE_HOST_GUEST) || - (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) || - (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps, - PT_CAP_num_address_ranges))) + if (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_num_address_ranges)) return 1; if (is_noncanonical_address(data, vcpu)) return 1; From b11306b53b2540c6ba068c4deddb6a17d9f8d95b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 10 Dec 2019 14:44:13 -0800 Subject: [PATCH 067/204] KVM: x86: Don't let userspace set host-reserved cr4 bits Calculate the host-reserved cr4 bits at runtime based on the system's capabilities (using logic similar to __do_cpuid_func()), and use the dynamically generated mask for the reserved bit check in kvm_set_cr4() instead of using the static CR4_RESERVED_BITS define. This prevents userspace from "enabling" features in cr4 that are not supported by the system, e.g. by ignoring KVM_GET_SUPPORTED_CPUID and specifying a bogus CPUID for the vCPU. Allowing userspace to set unsupported bits in cr4 can lead to a variety of undesirable behavior, e.g. failed VM-Enter, and in general increases KVM's attack surface. A crafty userspace can even abuse CR4.LA57 to induce an unchecked #GP on a WRMSR. On a platform without LA57 support: KVM_SET_CPUID2 // CPUID_7_0_ECX.LA57 = 1 KVM_SET_SREGS // CR4.LA57 = 1 KVM_SET_MSRS // KERNEL_GS_BASE = 0x0004000000000000 KVM_RUN leads to a #GP when writing KERNEL_GS_BASE into hardware: unchecked MSR access error: WRMSR to 0xc0000102 (tried to write 0x0004000000000000) at rIP: 0xffffffffa00f239a (vmx_prepare_switch_to_guest+0x10a/0x1d0 [kvm_intel]) Call Trace: kvm_arch_vcpu_ioctl_run+0x671/0x1c70 [kvm] kvm_vcpu_ioctl+0x36b/0x5d0 [kvm] do_vfs_ioctl+0xa1/0x620 ksys_ioctl+0x66/0x70 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x4c/0x170 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7fc08133bf47 Note, the above sequence fails VM-Enter due to invalid guest state. Userspace can allow VM-Enter to succeed (after the WRMSR #GP) by adding a KVM_SET_SREGS w/ CR4.LA57=0 after KVM_SET_MSRS, in which case KVM will technically leak the host's KERNEL_GS_BASE into the guest. But, as KERNEL_GS_BASE is a userspace-defined value/address, the leak is largely benign as a malicious userspace would simply be exposing its own data to the guest, and attacking a benevolent userspace would require multiple bugs in the userspace VMM. Cc: stable@vger.kernel.org Cc: Jun Nakajima Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e62d2688bfec9..8a907cd7b1e18 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -93,6 +93,8 @@ u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA)); static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); #endif +static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS; + #define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ #define VCPU_STAT(x, ...)
offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ @@ -879,9 +881,38 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) } EXPORT_SYMBOL_GPL(kvm_set_xcr); +static u64 kvm_host_cr4_reserved_bits(struct cpuinfo_x86 *c) +{ + u64 reserved_bits = CR4_RESERVED_BITS; + + if (!cpu_has(c, X86_FEATURE_XSAVE)) + reserved_bits |= X86_CR4_OSXSAVE; + + if (!cpu_has(c, X86_FEATURE_SMEP)) + reserved_bits |= X86_CR4_SMEP; + + if (!cpu_has(c, X86_FEATURE_SMAP)) + reserved_bits |= X86_CR4_SMAP; + + if (!cpu_has(c, X86_FEATURE_FSGSBASE)) + reserved_bits |= X86_CR4_FSGSBASE; + + if (!cpu_has(c, X86_FEATURE_PKU)) + reserved_bits |= X86_CR4_PKE; + + if (!cpu_has(c, X86_FEATURE_LA57) && + !(cpuid_ecx(0x7) & bit(X86_FEATURE_LA57))) + reserved_bits |= X86_CR4_LA57; + + if (!cpu_has(c, X86_FEATURE_UMIP) && !kvm_x86_ops->umip_emulated()) + reserved_bits |= X86_CR4_UMIP; + + return reserved_bits; +} + static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - if (cr4 & CR4_RESERVED_BITS) + if (cr4 & cr4_reserved_bits) return -EINVAL; if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE)) @@ -9400,6 +9431,8 @@ int kvm_arch_hardware_setup(void) if (r != 0) return r; + cr4_reserved_bits = kvm_host_cr4_reserved_bits(&boot_cpu_data); + if (kvm_has_tsc_control) { /* * Make sure the user can only configure tsc_khz values that From f1cdecf5807b1a91829a2dc4f254bfe6bafd4776 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 10 Dec 2019 14:44:14 -0800 Subject: [PATCH 068/204] KVM: x86: Ensure all logical CPUs have consistent reserved cr4 bits Check the current CPU's reserved cr4 bits against the mask calculated for the boot CPU to ensure consistent behavior across all CPUs. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8a907cd7b1e18..960b886e1e436 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9461,6 +9461,13 @@ void kvm_arch_hardware_unsetup(void) int kvm_arch_check_processor_compat(void) { + struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); + + WARN_ON(!irqs_disabled()); + + if (kvm_host_cr4_reserved_bits(c) != cr4_reserved_bits) + return -EIO; + return kvm_x86_ops->check_processor_compatibility(); } From 96be4e069c938e4a5fc7125de7e1cc7089b1adef Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 10 Dec 2019 14:44:15 -0800 Subject: [PATCH 069/204] KVM: x86: Drop special XSAVE handling from guest_cpuid_has() Now that KVM prevents setting host-reserved CR4 bits, drop the dedicated XSAVE check in guest_cpuid_has() in favor of open coding similar checks in the SVM/VMX XSAVES enabling flows. Note, checking boot_cpu_has(X86_FEATURE_XSAVE) in the XSAVES flows is technically redundant with respect to the CR4 reserved bit checks, e.g. XSAVES #UDs if CR4.OSXSAVE=0 and arch.xsaves_enabled is consumed if and only if CR4.OSXSAVE=1 in the guest. Keep (add?) the explicit boot_cpu_has() checks to help document KVM's usage of arch.xsaves_enabled. No functional change intended.
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.h | 4 ---- arch/x86/kvm/svm.c | 1 + arch/x86/kvm/vmx/vmx.c | 1 + 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index d78a61408243f..b82ae4d3dc71e 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -93,10 +93,6 @@ static __always_inline bool guest_cpuid_has(struct kvm_vcpu *vcpu, unsigned x86_ { int *reg; - if (x86_feature == X86_FEATURE_XSAVE && - !static_cpu_has(X86_FEATURE_XSAVE)) - return false; - reg = guest_cpuid_get_register(vcpu, x86_feature); if (!reg) return false; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 9583ae7ae218f..d399eb7bbff3e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5917,6 +5917,7 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && + boot_cpu_has(X86_FEATURE_XSAVE) && boot_cpu_has(X86_FEATURE_XSAVES); /* Update nrips enabled cache */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 857dd0898e5f1..3e732d092c401 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4049,6 +4049,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) if (vmx_xsaves_supported()) { /* Exposing XSAVES only when XSAVE is exposed */ bool xsaves_enabled = + boot_cpu_has(X86_FEATURE_XSAVE) && guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && guest_cpuid_has(vcpu, X86_FEATURE_XSAVES); From 345599f9a292899bf5474651f3cea9b7a0576436 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 10 Dec 2019 14:44:16 -0800 Subject: [PATCH 070/204] KVM: x86: Add macro to ensure reserved cr4 bits checks stay in sync Add a helper macro to generate the set of reserved cr4 bits for both host and guest to ensure that adding a check on guest capabilities is also added for host capabilities, and vice versa. 
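A macro, rather than a helper function, is what makes the sharing work: cpu_has() takes a struct cpuinfo_x86 * while guest_cpuid_has() takes a struct kvm_vcpu *, so the feature-check predicate and its argument can only be substituted textually at each use site, as in the two instantiations below:

    __cr4_reserved_bits(cpu_has, c);             /* host view  */
    __cr4_reserved_bits(guest_cpuid_has, vcpu);  /* guest view */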
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 65 ++++++++++++++++++---------------------------- 1 file changed, 25 insertions(+), 40 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 960b886e1e436..456fc131c95ef 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -881,31 +881,34 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) } EXPORT_SYMBOL_GPL(kvm_set_xcr); +#define __cr4_reserved_bits(__cpu_has, __c) \ +({ \ + u64 __reserved_bits = CR4_RESERVED_BITS; \ + \ + if (!__cpu_has(__c, X86_FEATURE_XSAVE)) \ + __reserved_bits |= X86_CR4_OSXSAVE; \ + if (!__cpu_has(__c, X86_FEATURE_SMEP)) \ + __reserved_bits |= X86_CR4_SMEP; \ + if (!__cpu_has(__c, X86_FEATURE_SMAP)) \ + __reserved_bits |= X86_CR4_SMAP; \ + if (!__cpu_has(__c, X86_FEATURE_FSGSBASE)) \ + __reserved_bits |= X86_CR4_FSGSBASE; \ + if (!__cpu_has(__c, X86_FEATURE_PKU)) \ + __reserved_bits |= X86_CR4_PKE; \ + if (!__cpu_has(__c, X86_FEATURE_LA57)) \ + __reserved_bits |= X86_CR4_LA57; \ + __reserved_bits; \ +}) + static u64 kvm_host_cr4_reserved_bits(struct cpuinfo_x86 *c) { - u64 reserved_bits = CR4_RESERVED_BITS; - - if (!cpu_has(c, X86_FEATURE_XSAVE)) - reserved_bits |= X86_CR4_OSXSAVE; - - if (!cpu_has(c, X86_FEATURE_SMEP)) - reserved_bits |= X86_CR4_SMEP; - - if (!cpu_has(c, X86_FEATURE_SMAP)) - reserved_bits |= X86_CR4_SMAP; - - if (!cpu_has(c, X86_FEATURE_FSGSBASE)) - reserved_bits |= X86_CR4_FSGSBASE; + u64 reserved_bits = __cr4_reserved_bits(cpu_has, c); - if (!cpu_has(c, X86_FEATURE_PKU)) - reserved_bits |= X86_CR4_PKE; + if (cpuid_ecx(0x7) & bit(X86_FEATURE_LA57)) + reserved_bits &= ~X86_CR4_LA57; - if (!cpu_has(c, X86_FEATURE_LA57) && - !(cpuid_ecx(0x7) & bit(X86_FEATURE_LA57))) - reserved_bits |= X86_CR4_LA57; - - if (!cpu_has(c, X86_FEATURE_UMIP) && !kvm_x86_ops->umip_emulated()) - reserved_bits |= X86_CR4_UMIP; + if (kvm_x86_ops->umip_emulated()) + reserved_bits &= ~X86_CR4_UMIP; return reserved_bits; } @@ -915,25 +918,7 @@ static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (cr4 & cr4_reserved_bits) return -EINVAL; - if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE)) - return -EINVAL; - - if (!guest_cpuid_has(vcpu, X86_FEATURE_SMEP) && (cr4 & X86_CR4_SMEP)) - return -EINVAL; - - if (!guest_cpuid_has(vcpu, X86_FEATURE_SMAP) && (cr4 & X86_CR4_SMAP)) - return -EINVAL; - - if (!guest_cpuid_has(vcpu, X86_FEATURE_FSGSBASE) && (cr4 & X86_CR4_FSGSBASE)) - return -EINVAL; - - if (!guest_cpuid_has(vcpu, X86_FEATURE_PKU) && (cr4 & X86_CR4_PKE)) - return -EINVAL; - - if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57)) - return -EINVAL; - - if (!guest_cpuid_has(vcpu, X86_FEATURE_UMIP) && (cr4 & X86_CR4_UMIP)) + if (cr4 & __cr4_reserved_bits(guest_cpuid_has, vcpu)) return -EINVAL; return 0; From 5ae78e95ed0c771935d0d24291d221312524830c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 17 Dec 2019 13:32:38 -0800 Subject: [PATCH 071/204] KVM: x86: Add dedicated emulator helpers for querying CPUID features Add feature-specific helpers for querying guest CPUID support from the emulator instead of having the emulator do a full CPUID and perform its own bit tests. The primary motivation is to eliminate the emulator's usage of bit() so that future patches can add more extensive build-time assertions on the usage of bit() without having to expose yet more code to the emulator. 
Note, providing a generic guest_cpuid_has() to the emulator doesn't work due to the existing build-time assertions in guest_cpuid_has(), which require the feature being checked to be a compile-time constant. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_emulate.h | 4 ++++ arch/x86/kvm/emulate.c | 21 +++------------------ arch/x86/kvm/x86.c | 18 ++++++++++++++++++ 3 files changed, 25 insertions(+), 18 deletions(-) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 77cf6c11f66bd..03946eb3e2b9e 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -222,6 +222,10 @@ struct x86_emulate_ops { bool (*get_cpuid)(struct x86_emulate_ctxt *ctxt, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool check_limit); + bool (*guest_has_long_mode)(struct x86_emulate_ctxt *ctxt); + bool (*guest_has_movbe)(struct x86_emulate_ctxt *ctxt); + bool (*guest_has_fxsr)(struct x86_emulate_ctxt *ctxt); + void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked); unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 952d1a4f4d7ef..e9833e345a5c5 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2348,12 +2348,7 @@ static int em_lseg(struct x86_emulate_ctxt *ctxt) static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt) { #ifdef CONFIG_X86_64 - u32 eax, ebx, ecx, edx; - - eax = 0x80000001; - ecx = 0; - ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false); - return edx & bit(X86_FEATURE_LM); + return ctxt->ops->guest_has_long_mode(ctxt); #else return false; #endif @@ -3618,18 +3613,11 @@ static int em_mov(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -#define FFL(x) bit(X86_FEATURE_##x) - static int em_movbe(struct x86_emulate_ctxt *ctxt) { - u32 ebx, ecx, edx, eax = 1; u16 tmp; - /* - * Check MOVBE is set in the guest-visible CPUID leaf.
- */ - ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false); - if (!(ecx & FFL(MOVBE))) + if (!ctxt->ops->guest_has_movbe(ctxt)) return emulate_ud(ctxt); switch (ctxt->op_bytes) { @@ -4027,10 +4015,7 @@ static int em_movsxd(struct x86_emulate_ctxt *ctxt) static int check_fxsr(struct x86_emulate_ctxt *ctxt) { - u32 eax = 1, ebx, ecx = 0, edx; - - ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false); - if (!(edx & FFL(FXSR))) + if (!ctxt->ops->guest_has_fxsr(ctxt)) return emulate_ud(ctxt); if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 456fc131c95ef..60b0d69af0f17 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6245,6 +6245,21 @@ static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, check_limit); } +static bool emulator_guest_has_long_mode(struct x86_emulate_ctxt *ctxt) +{ + return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_LM); +} + +static bool emulator_guest_has_movbe(struct x86_emulate_ctxt *ctxt) +{ + return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_MOVBE); +} + +static bool emulator_guest_has_fxsr(struct x86_emulate_ctxt *ctxt) +{ + return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR); +} + static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg) { return kvm_register_read(emul_to_vcpu(ctxt), reg); @@ -6322,6 +6337,9 @@ static const struct x86_emulate_ops emulate_ops = { .fix_hypercall = emulator_fix_hypercall, .intercept = emulator_intercept, .get_cpuid = emulator_get_cpuid, + .guest_has_long_mode = emulator_guest_has_long_mode, + .guest_has_movbe = emulator_guest_has_movbe, + .guest_has_fxsr = emulator_guest_has_fxsr, .set_nmi_mask = emulator_set_nmi_mask, .get_hflags = emulator_get_hflags, .set_hflags = emulator_set_hflags, From a0a2260c12d8658e522f21ed8ece72bbdede58fd Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 17 Dec 2019 13:32:39 -0800 Subject: [PATCH 072/204] KVM: x86: Move bit() helper to cpuid.h Move bit() to cpuid.h in preparation for incorporating the reverse_cpuid array in bit() build-time assertions. Opportunistically use the BIT() macro instead of open-coding the shift. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.h | 5 +++++ arch/x86/kvm/x86.h | 5 ----- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index b82ae4d3dc71e..f908f76216bf6 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -55,6 +55,11 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_7_EDX] = { 7, 0, CPUID_EDX}, }; +static inline u32 bit(int bitno) +{ + return BIT(bitno & 31); +} + static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature) { unsigned x86_leaf = x86_feature / 32; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 9805cf2c6b351..ab715cee36533 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -144,11 +144,6 @@ static inline bool is_pae_paging(struct kvm_vcpu *vcpu) return !is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu); } -static inline u32 bit(int bitno) -{ - return 1 << (bitno & 31); -} - static inline u8 vcpu_virt_addr_bits(struct kvm_vcpu *vcpu) { return kvm_read_cr4_bits(vcpu, X86_CR4_LA57) ? 
57 : 48; From daa0d8c3a48732e5f64c69cca4c597cab1dfd455 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 17 Dec 2019 13:32:40 -0800 Subject: [PATCH 073/204] KVM: x86: Add CPUID_7_1_EAX to the reverse CPUID table Add an entry for CPUID_7_1_EAX in the reverse_cpuid array in preparation for incorporating the array in bit() build-time assertions, specifically to avoid an assertion on F(AVX512_BF16) in do_cpuid_7_mask(). No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index f908f76216bf6..5fc424d2e1fa6 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -53,6 +53,7 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_7_ECX] = { 7, 0, CPUID_ECX}, [CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX}, [CPUID_7_EDX] = { 7, 0, CPUID_EDX}, + [CPUID_7_1_EAX] = { 7, 1, CPUID_EAX}, }; static inline u32 bit(int bitno) From a7c48c3f56db8d18d15ee6d8c2c5e2da447c77cc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 17 Dec 2019 13:32:41 -0800 Subject: [PATCH 074/204] KVM: x86: Expand build-time assertion on reverse CPUID usage Add build-time checks to ensure KVM isn't trying to do a reverse CPUID lookup on Linux-defined feature bits, along with comments to explain the gory details of X86_FEATUREs and bit(). No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 3 ++- arch/x86/kvm/cpuid.h | 33 ++++++++++++++++++++++++++++----- 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index cf55629ff0ff6..ca20951fac2b4 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -281,8 +281,9 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, return r; } -static void cpuid_mask(u32 *word, int wordnum) +static __always_inline void cpuid_mask(u32 *word, int wordnum) { + reverse_cpuid_check(wordnum); *word &= boot_cpu_data.x86_capability[wordnum]; } diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 5fc424d2e1fa6..fb3c44b019ecb 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -56,18 +56,41 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_7_1_EAX] = { 7, 1, CPUID_EAX}, }; -static inline u32 bit(int bitno) +/* + * Reverse CPUID and its derivatives can only be used for hardware-defined + * feature words, i.e. words whose bits directly correspond to a CPUID leaf. + * Retrieving a feature bit or masking guest CPUID from a Linux-defined word + * is nonsensical as the bit number/mask is an arbitrary software-defined value + * and can't be used by KVM to query/control guest capabilities. And obviously + * the leaf being queried must have an entry in the lookup table. + */ +static __always_inline void reverse_cpuid_check(unsigned x86_leaf) { - return BIT(bitno & 31); + BUILD_BUG_ON(x86_leaf == CPUID_LNX_1); + BUILD_BUG_ON(x86_leaf == CPUID_LNX_2); + BUILD_BUG_ON(x86_leaf == CPUID_LNX_3); + BUILD_BUG_ON(x86_leaf == CPUID_LNX_4); + BUILD_BUG_ON(x86_leaf >= ARRAY_SIZE(reverse_cpuid)); + BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0); +} + +/* + * Retrieve the bit mask from an X86_FEATURE_* definition. Features contain + * the hardware defined bit number (stored in bits 4:0) and a software defined + * "word" (stored in bits 31:5). The word is used to index into arrays of + * bit masks that hold the per-cpu feature capabilities, e.g. this_cpu_has().
+ */ +static __always_inline u32 bit(int x86_feature) +{ + reverse_cpuid_check(x86_feature / 32); + return 1 << (x86_feature & 31); } static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature) { unsigned x86_leaf = x86_feature / 32; - BUILD_BUG_ON(x86_leaf >= ARRAY_SIZE(reverse_cpuid)); - BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0); - + reverse_cpuid_check(x86_leaf); return reverse_cpuid[x86_leaf]; } From 87382003e3555926017228452dae7e7064b0f915 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 17 Dec 2019 13:32:42 -0800 Subject: [PATCH 075/204] KVM: x86: Refactor and rename bit() to feature_bit() macro Rename bit() to __feature_bit() to give it a more descriptive name, and add a macro, feature_bit(), to stuff the X86_FEATURE_ prefix to keep line lengths manageable for code that hardcodes the bit to be retrieved. No functional change intended. Cc: Jim Mattson Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 2 +- arch/x86/kvm/cpuid.h | 8 +++++--- arch/x86/kvm/svm.c | 4 ++-- arch/x86/kvm/vmx/vmx.c | 42 +++++++++++++++++++++--------------------- arch/x86/kvm/x86.c | 2 +- 5 files changed, 30 insertions(+), 28 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index ca20951fac2b4..74a4d9b4e61f5 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -62,7 +62,7 @@ u64 kvm_supported_xcr0(void) return xcr0; } -#define F(x) bit(X86_FEATURE_##x) +#define F feature_bit int kvm_update_cpuid(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index fb3c44b019ecb..7366c618aa04e 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -80,12 +80,14 @@ static __always_inline void reverse_cpuid_check(unsigned x86_leaf) * "word" (stored in bits 31:5). The word is used to index into arrays of * bit masks that hold the per-cpu feature capabilities, e.g. this_cpu_has().
*/ -static __always_inline u32 bit(int x86_feature) +static __always_inline u32 __feature_bit(int x86_feature) { reverse_cpuid_check(x86_feature / 32); return 1 << (x86_feature & 31); } +#define feature_bit(name) __feature_bit(X86_FEATURE_##name) + static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature) { unsigned x86_leaf = x86_feature / 32; @@ -126,7 +128,7 @@ static __always_inline bool guest_cpuid_has(struct kvm_vcpu *vcpu, unsigned x86_ if (!reg) return false; - return *reg & bit(x86_feature); + return *reg & __feature_bit(x86_feature); } static __always_inline void guest_cpuid_clear(struct kvm_vcpu *vcpu, unsigned x86_feature) @@ -135,7 +137,7 @@ static __always_inline void guest_cpuid_clear(struct kvm_vcpu *vcpu, unsigned x8 reg = guest_cpuid_get_register(vcpu, x86_feature); if (reg) - *reg &= ~bit(x86_feature); + *reg &= ~__feature_bit(x86_feature); } static inline bool guest_cpuid_is_amd(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d399eb7bbff3e..31d6a98f5edac 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5929,14 +5929,14 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu) guest_cpuid_clear(vcpu, X86_FEATURE_X2APIC); } -#define F(x) bit(X86_FEATURE_##x) +#define F feature_bit static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) { switch (func) { case 0x1: if (avic) - entry->ecx &= ~bit(X86_FEATURE_X2APIC); + entry->ecx &= ~F(X2APIC); break; case 0x80000001: if (nested) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 3e732d092c401..bdbf27e92851d 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6989,28 +6989,28 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) } while (0) entry = kvm_find_cpuid_entry(vcpu, 0x1, 0); - cr4_fixed1_update(X86_CR4_VME, edx, bit(X86_FEATURE_VME)); - cr4_fixed1_update(X86_CR4_PVI, edx, bit(X86_FEATURE_VME)); - cr4_fixed1_update(X86_CR4_TSD, edx, bit(X86_FEATURE_TSC)); - cr4_fixed1_update(X86_CR4_DE, edx, bit(X86_FEATURE_DE)); - cr4_fixed1_update(X86_CR4_PSE, edx, bit(X86_FEATURE_PSE)); - cr4_fixed1_update(X86_CR4_PAE, edx, bit(X86_FEATURE_PAE)); - cr4_fixed1_update(X86_CR4_MCE, edx, bit(X86_FEATURE_MCE)); - cr4_fixed1_update(X86_CR4_PGE, edx, bit(X86_FEATURE_PGE)); - cr4_fixed1_update(X86_CR4_OSFXSR, edx, bit(X86_FEATURE_FXSR)); - cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, bit(X86_FEATURE_XMM)); - cr4_fixed1_update(X86_CR4_VMXE, ecx, bit(X86_FEATURE_VMX)); - cr4_fixed1_update(X86_CR4_SMXE, ecx, bit(X86_FEATURE_SMX)); - cr4_fixed1_update(X86_CR4_PCIDE, ecx, bit(X86_FEATURE_PCID)); - cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, bit(X86_FEATURE_XSAVE)); + cr4_fixed1_update(X86_CR4_VME, edx, feature_bit(VME)); + cr4_fixed1_update(X86_CR4_PVI, edx, feature_bit(VME)); + cr4_fixed1_update(X86_CR4_TSD, edx, feature_bit(TSC)); + cr4_fixed1_update(X86_CR4_DE, edx, feature_bit(DE)); + cr4_fixed1_update(X86_CR4_PSE, edx, feature_bit(PSE)); + cr4_fixed1_update(X86_CR4_PAE, edx, feature_bit(PAE)); + cr4_fixed1_update(X86_CR4_MCE, edx, feature_bit(MCE)); + cr4_fixed1_update(X86_CR4_PGE, edx, feature_bit(PGE)); + cr4_fixed1_update(X86_CR4_OSFXSR, edx, feature_bit(FXSR)); + cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM)); + cr4_fixed1_update(X86_CR4_VMXE, ecx, feature_bit(VMX)); + cr4_fixed1_update(X86_CR4_SMXE, ecx, feature_bit(SMX)); + cr4_fixed1_update(X86_CR4_PCIDE, ecx, feature_bit(PCID)); + cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, feature_bit(XSAVE)); entry = kvm_find_cpuid_entry(vcpu, 0x7, 0); - 
cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, bit(X86_FEATURE_FSGSBASE)); - cr4_fixed1_update(X86_CR4_SMEP, ebx, bit(X86_FEATURE_SMEP)); - cr4_fixed1_update(X86_CR4_SMAP, ebx, bit(X86_FEATURE_SMAP)); - cr4_fixed1_update(X86_CR4_PKE, ecx, bit(X86_FEATURE_PKU)); - cr4_fixed1_update(X86_CR4_UMIP, ecx, bit(X86_FEATURE_UMIP)); - cr4_fixed1_update(X86_CR4_LA57, ecx, bit(X86_FEATURE_LA57)); + cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, feature_bit(FSGSBASE)); + cr4_fixed1_update(X86_CR4_SMEP, ebx, feature_bit(SMEP)); + cr4_fixed1_update(X86_CR4_SMAP, ebx, feature_bit(SMAP)); + cr4_fixed1_update(X86_CR4_PKE, ecx, feature_bit(PKU)); + cr4_fixed1_update(X86_CR4_UMIP, ecx, feature_bit(UMIP)); + cr4_fixed1_update(X86_CR4_LA57, ecx, feature_bit(LA57)); #undef cr4_fixed1_update } @@ -7144,7 +7144,7 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) { if (func == 1 && nested) - entry->ecx |= bit(X86_FEATURE_VMX); + entry->ecx |= feature_bit(VMX); } static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 60b0d69af0f17..3e70af42f65b3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -904,7 +904,7 @@ static u64 kvm_host_cr4_reserved_bits(struct cpuinfo_x86 *c) { u64 reserved_bits = __cr4_reserved_bits(cpu_has, c); - if (cpuid_ecx(0x7) & bit(X86_FEATURE_LA57)) + if (cpuid_ecx(0x7) & feature_bit(LA57)) reserved_bits &= ~X86_CR4_LA57; if (kvm_x86_ops->umip_emulated()) From d8010a779a2f3f92fb10d617b8aa1ddd772e987c Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 14 Dec 2019 14:48:45 +0800 Subject: [PATCH 076/204] KVM: vmx: delete meaningless nested_vmx_prepare_msr_bitmap() declaration The nested_vmx_prepare_msr_bitmap() forward declaration appears below the function's implementation, so it serves no purpose and should be removed. Signed-off-by: Miaohe Lin Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 63ab49de324db..e038a331583c7 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3048,9 +3048,6 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) return 0; } -static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12); - static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); From 52918ed5fcf05d97d257f4131e19479da18f5d16 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Thu, 9 Jan 2020 17:42:16 -0600 Subject: [PATCH 077/204] KVM: SVM: Override default MMIO mask if memory encryption is enabled The KVM MMIO support uses bit 51 as the reserved bit to cause nested page faults when a guest performs MMIO. The AMD memory encryption support uses a CPUID function to define the encryption bit position. Given this, it is possible that these bits can conflict. Use svm_hardware_setup() to override the MMIO mask if memory encryption support is enabled. Various checks are performed to ensure that the mask is properly defined and rsvd_bits() is used to generate the new mask (as was done prior to the change that necessitated this patch).
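As an illustration only (not part of the patch), a minimal userspace model of the mask computation described above; the encryption-bit position (47) and the reduced physical-address width (43) are made-up example values, and the real code additionally ORs in the present bit:

    #include <stdint.h>
    #include <stdio.h>

    /* Same semantics as KVM's rsvd_bits(): a mask covering bits s..e. */
    static uint64_t rsvd_bits(int s, int e)
    {
        return ((1ULL << (e - s + 1)) - 1) << s;
    }

    int main(void)
    {
        unsigned int enc_bit = 47;  /* example C-bit from CPUID 0x8000001f */
        unsigned int mask_bit = 43; /* example reduced x86_phys_bits */
        uint64_t mask;

        if (enc_bit == mask_bit)    /* never trap on the C-bit itself */
            mask_bit++;

        /* Reserved bits exist only below bit 52. */
        mask = mask_bit < 52 ? rsvd_bits(mask_bit, 51) : 0;
        printf("MMIO trap mask: %#llx\n", (unsigned long long)mask);
        return 0;
    }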
Fixes: 28a1f3ac1d0c ("kvm: x86: Set highest physical address bits in non-present/reserved SPTEs") Suggested-by: Sean Christopherson Reviewed-by: Sean Christopherson Signed-off-by: Tom Lendacky Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 31d6a98f5edac..b7c5369c79989 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1307,6 +1307,47 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu) } } +/* + * The default MMIO mask is a single bit (excluding the present bit), + * which could conflict with the memory encryption bit. Check for + * memory encryption support and override the default MMIO mask if + * memory encryption is enabled. + */ +static __init void svm_adjust_mmio_mask(void) +{ + unsigned int enc_bit, mask_bit; + u64 msr, mask; + + /* If there is no memory encryption support, use existing mask */ + if (cpuid_eax(0x80000000) < 0x8000001f) + return; + + /* If memory encryption is not enabled, use existing mask */ + rdmsrl(MSR_K8_SYSCFG, msr); + if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) + return; + + enc_bit = cpuid_ebx(0x8000001f) & 0x3f; + mask_bit = boot_cpu_data.x86_phys_bits; + + /* Increment the mask bit if it is the same as the encryption bit */ + if (enc_bit == mask_bit) + mask_bit++; + + /* + * If the mask bit location is below 52, then some bits above the + * physical addressing limit will always be reserved, so use the + * rsvd_bits() function to generate the mask. This mask, along with + * the present bit, will be used to generate a page fault with + * PFER.RSV = 1. + * + * If the mask bit location is 52 (or above), then clear the mask. + */ + mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0; + + kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK); +} + static __init int svm_hardware_setup(void) { int cpu; @@ -1361,6 +1402,8 @@ static __init int svm_hardware_setup(void) } } + svm_adjust_mmio_mask(); + for_each_possible_cpu(cpu) { r = svm_cpu_init(cpu); if (r) From f8052a053a7af2a53288901d27d6419e100ad8e2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 9 Jan 2020 15:06:39 -0800 Subject: [PATCH 078/204] KVM: x86/mmu: Reorder the reserved bit check in prefetch_invalid_gpte() Move the !PRESENT and !ACCESSED checks in FNAME(prefetch_invalid_gpte) above the call to is_rsvd_bits_set(). For a well behaved guest, the !PRESENT and !ACCESSED are far more likely to evaluate true than the reserved bit checks, and they do not require additional memory accesses. 
Before: Dump of assembler code for function paging32_prefetch_invalid_gpte: 0x0000000000044240 <+0>: callq 0x44245 0x0000000000044245 <+5>: mov %rcx,%rax 0x0000000000044248 <+8>: shr $0x7,%rax 0x000000000004424c <+12>: and $0x1,%eax 0x000000000004424f <+15>: lea 0x0(,%rax,4),%r8 0x0000000000044257 <+23>: add %r8,%rax 0x000000000004425a <+26>: mov %rcx,%r8 0x000000000004425d <+29>: and 0x120(%rsi,%rax,8),%r8 0x0000000000044265 <+37>: mov 0x170(%rsi),%rax 0x000000000004426c <+44>: shr %cl,%rax 0x000000000004426f <+47>: and $0x1,%eax 0x0000000000044272 <+50>: or %rax,%r8 0x0000000000044275 <+53>: jne 0x4427c 0x0000000000044277 <+55>: test $0x1,%cl 0x000000000004427a <+58>: jne 0x4428a 0x000000000004427c <+60>: mov %rdx,%rsi 0x000000000004427f <+63>: callq 0x44080 0x0000000000044284 <+68>: mov $0x1,%eax 0x0000000000044289 <+73>: retq 0x000000000004428a <+74>: xor %eax,%eax 0x000000000004428c <+76>: and $0x20,%ecx 0x000000000004428f <+79>: jne 0x44289 0x0000000000044291 <+81>: mov %rdx,%rsi 0x0000000000044294 <+84>: callq 0x44080 0x0000000000044299 <+89>: mov $0x1,%eax 0x000000000004429e <+94>: jmp 0x44289 End of assembler dump. After: Dump of assembler code for function paging32_prefetch_invalid_gpte: 0x0000000000044240 <+0>: callq 0x44245 0x0000000000044245 <+5>: test $0x1,%cl 0x0000000000044248 <+8>: je 0x4424f 0x000000000004424a <+10>: test $0x20,%cl 0x000000000004424d <+13>: jne 0x4425d 0x000000000004424f <+15>: mov %rdx,%rsi 0x0000000000044252 <+18>: callq 0x44080 0x0000000000044257 <+23>: mov $0x1,%eax 0x000000000004425c <+28>: retq 0x000000000004425d <+29>: mov %rcx,%rax 0x0000000000044260 <+32>: mov (%rsi),%rsi 0x0000000000044263 <+35>: shr $0x7,%rax 0x0000000000044267 <+39>: and $0x1,%eax 0x000000000004426a <+42>: lea 0x0(,%rax,4),%r8 0x0000000000044272 <+50>: add %r8,%rax 0x0000000000044275 <+53>: mov %rcx,%r8 0x0000000000044278 <+56>: and 0x120(%rsi,%rax,8),%r8 0x0000000000044280 <+64>: mov 0x170(%rsi),%rax 0x0000000000044287 <+71>: shr %cl,%rax 0x000000000004428a <+74>: and $0x1,%eax 0x000000000004428d <+77>: mov %rax,%rcx 0x0000000000044290 <+80>: xor %eax,%eax 0x0000000000044292 <+82>: or %rcx,%r8 0x0000000000044295 <+85>: je 0x4425c 0x0000000000044297 <+87>: mov %rdx,%rsi 0x000000000004429a <+90>: callq 0x44080 0x000000000004429f <+95>: mov $0x1,%eax 0x00000000000442a4 <+100>: jmp 0x4425c End of assembler dump. 
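The shape of that reordering is easy to model outside the kernel. A hedged sketch follows, using simplified stand-in masks and a placeholder for the reserved-bit check rather than KVM's real helpers:

    #include <stdbool.h>
    #include <stdint.h>

    #define PT_PRESENT_MASK  (1ULL << 0)
    #define PT_ACCESSED_MASK (1ULL << 5)

    /* Placeholder for is_rsvd_bits_set(); the real check dereferences
     * per-MMU state and is the comparatively expensive one. */
    static bool rsvd_bits_set(uint64_t gpte)
    {
        return false;
    }

    /* Returns true if the prefetched GPTE must be rejected. */
    static bool prefetch_invalid_gpte(uint64_t gpte)
    {
        /* Register-only tests first; for a well-behaved guest these
         * are the checks most likely to fire... */
        if (!(gpte & PT_PRESENT_MASK))
            return true;
        if (!(gpte & PT_ACCESSED_MASK))
            return true;
        /* ...and the memory-touching reserved-bit check comes last. */
        return rsvd_bits_set(gpte);
    }

    int main(void)
    {
        return prefetch_invalid_gpte(0) ? 0 : 1; /* !present -> rejected */
    }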
Signed-off-by: Sean Christopherson Reviewed-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/paging_tmpl.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index b53bed3c901cb..1fde6a1c506d6 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -175,9 +175,6 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, u64 gpte) { - if (is_rsvd_bits_set(vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) - goto no_present; - if (!FNAME(is_present_gpte)(gpte)) goto no_present; @@ -186,6 +183,9 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, !(gpte & PT_GUEST_ACCESSED_MASK)) goto no_present; + if (is_rsvd_bits_set(vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) + goto no_present; + return false; no_present: From b5c3c1b3c6e95cc67910e27a1e7603d838c2ebed Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 9 Jan 2020 15:06:40 -0800 Subject: [PATCH 079/204] KVM: x86/mmu: Micro-optimize nEPT's bad memtype/XWR checks Rework the handling of nEPT's bad memtype/XWR checks to micro-optimize the checks as much as possible. Move the check to a separate helper, __is_bad_mt_xwr(), which allows the guest_rsvd_check usage in paging_tmpl.h to omit the check entirely for paging32/64 (bad_mt_xwr is always zero for non-nEPT) while retaining the bitwise-OR of the current code for the shadow_zero_check in walk_shadow_page_get_mmio_spte(). Add a comment for the bitwise-OR usage in the mmio spte walk to avoid future attempts to "fix" the code, which is what prompted this optimization in the first place[*]. Opportunistically remove the superfluous '!= 0' and parentheses, and use BIT_ULL() instead of open coding its equivalent. The net effect is that code generation is largely unchanged for walk_shadow_page_get_mmio_spte(), marginally better for ept_prefetch_invalid_gpte(), and significantly improved for paging32/64_prefetch_invalid_gpte(). Note, walk_shadow_page_get_mmio_spte() can't use a templated version of the memtype/XWR checks as it works on the host's shadow PTEs, e.g. checks that KVM hasn't borked its EPT tables. Even if it could be templated, the benefits of having a single implementation far outweigh the few uops that would be saved for NPT or non-TDP paging, e.g. most compilers inline it all the way up to kvm_mmu_page_fault().
[*] https://lkml.kernel.org/r/20200108001859.25254-1-sean.j.christopherson@intel.com Cc: Jim Mattson Cc: David Laight Cc: Arvind Sankar Signed-off-by: Sean Christopherson Reviewed-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 26 ++++++++++++++------------ arch/x86/kvm/mmu/paging_tmpl.h | 19 +++++++++++++++++-- 2 files changed, 31 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 7269130ea5e28..2992ff7b42a7a 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3968,20 +3968,14 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gpa_t vaddr, static bool __is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, u64 pte, int level) { - int bit7 = (pte >> 7) & 1, low6 = pte & 0x3f; + int bit7 = (pte >> 7) & 1; - return (pte & rsvd_check->rsvd_bits_mask[bit7][level-1]) | - ((rsvd_check->bad_mt_xwr & (1ull << low6)) != 0); + return pte & rsvd_check->rsvd_bits_mask[bit7][level-1]; } -static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) +static bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check, u64 pte) { - return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level); -} - -static bool is_shadow_zero_bits_set(struct kvm_mmu *mmu, u64 spte, int level) -{ - return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level); + return rsvd_check->bad_mt_xwr & BIT_ULL(pte & 0x3f); } static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct) @@ -4005,9 +3999,12 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) { struct kvm_shadow_walk_iterator iterator; u64 sptes[PT64_ROOT_MAX_LEVEL], spte = 0ull; + struct rsvd_bits_validate *rsvd_check; int root, leaf; bool reserved = false; + rsvd_check = &vcpu->arch.mmu->shadow_zero_check; + walk_shadow_page_lockless_begin(vcpu); for (shadow_walk_init(&iterator, vcpu, addr), @@ -4022,8 +4019,13 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) if (!is_shadow_present_pte(spte)) break; - reserved |= is_shadow_zero_bits_set(vcpu->arch.mmu, spte, - iterator.level); + /* + * Use a bitwise-OR instead of a logical-OR to aggregate the + * reserved bit and EPT's invalid memtype/XWR checks to avoid + * adding a Jcc in the loop. 
+ */ + reserved |= __is_bad_mt_xwr(rsvd_check, spte) | + __is_rsvd_bits_set(rsvd_check, spte, iterator.level); } walk_shadow_page_lockless_end(vcpu); diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 1fde6a1c506d6..eaa00c4daeb11 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -128,6 +128,21 @@ static inline int FNAME(is_present_gpte)(unsigned long pte) #endif } +static bool FNAME(is_bad_mt_xwr)(struct rsvd_bits_validate *rsvd_check, u64 gpte) +{ +#if PTTYPE != PTTYPE_EPT + return false; +#else + return __is_bad_mt_xwr(rsvd_check, gpte); +#endif +} + +static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level) +{ + return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level) || + FNAME(is_bad_mt_xwr)(&mmu->guest_rsvd_check, gpte); +} + static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, pt_element_t __user *ptep_user, unsigned index, pt_element_t orig_pte, pt_element_t new_pte) @@ -183,7 +198,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, !(gpte & PT_GUEST_ACCESSED_MASK)) goto no_present; - if (is_rsvd_bits_set(vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) + if (FNAME(is_rsvd_bits_set)(vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) goto no_present; return false; @@ -400,7 +415,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, if (unlikely(!FNAME(is_present_gpte)(pte))) goto error; - if (unlikely(is_rsvd_bits_set(mmu, pte, walker->level))) { + if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte, walker->level))) { errcode = PFERR_RSVD_MASK | PFERR_PRESENT_MASK; goto error; } From a4d956b9390418623ae5d07933e2679c68b6f83c Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 28 Dec 2019 14:25:24 +0800 Subject: [PATCH 080/204] KVM: nVMX: vmread should not set rflags to specify success in case of #PF If writing to the vmread destination operand results in a #PF, vmread should not call nested_vmx_succeed() to set rflags to specify success, similar to what is done in VMPTRST (see handle_vmptrst()). Reviewed-by: Liran Alon Signed-off-by: Miaohe Lin Cc: stable@vger.kernel.org Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index e038a331583c7..ef2d53854d158 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4799,8 +4799,10 @@ static int handle_vmread(struct kvm_vcpu *vcpu) instr_info, true, len, &gva)) return 1; /* _system ok, nested_vmx_check_permission has verified cpl=0 */ - if (kvm_write_guest_virt_system(vcpu, gva, &value, len, &e)) + if (kvm_write_guest_virt_system(vcpu, gva, &value, len, &e)) { kvm_inject_page_fault(vcpu, &e); + return 1; + } } return nested_vmx_succeed(vcpu); From e30a7d623dccdb3f880fbcad980b0cb589a1da45 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 Jan 2020 16:12:10 -0800 Subject: [PATCH 081/204] KVM: x86/mmu: Apply max PA check for MMIO sptes to 32-bit KVM Remove the bogus 64-bit only condition from the check that disables MMIO spte optimization when the system supports the max PA, i.e. doesn't have any reserved PA bits. 32-bit KVM always uses PAE paging for the shadow MMU, and per Intel's SDM: PAE paging translates 32-bit linear addresses to 52-bit physical addresses. The kernel's restrictions on max physical addresses are limits on how much memory the kernel can reasonably use, not what physical addresses are supported by hardware.
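A compressed model of the check being fixed (illustrative only; shadow_phys_bits stands in for the value KVM derives from CPUID): once the CONFIG_X86_64 condition is gone, any host running on a 52-PA-bit CPU clears the present bit and forgoes the reserved-bit MMIO trap:

    #include <stdint.h>

    /* Illustrative stand-in for kvm_set_mmio_spte_mask() after the fix. */
    static uint64_t mmio_spte_mask(unsigned int shadow_phys_bits)
    {
        uint64_t mask = (1ULL << 51) | 1ULL; /* reserved bit 51 + present */

        if (shadow_phys_bits == 52) /* bit 51 is a real PA bit: no trap */
            mask &= ~1ULL;          /* clear present, disable fast path */
        return mask;
    }

    int main(void)
    {
        return mmio_spte_mask(52) == (1ULL << 51) ? 0 : 1;
    }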
Fixes: ce88decffd17 ("KVM: MMU: mmio page fault support") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 2992ff7b42a7a..57e4dbddba72f 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -6193,7 +6193,7 @@ static void kvm_set_mmio_spte_mask(void) * If reserved bit is not supported, clear the present bit to disable * mmio page fault. */ - if (IS_ENABLED(CONFIG_X86_64) && shadow_phys_bits == 52) + if (shadow_phys_bits == 52) mask &= ~1ull; kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK); From 09cbcef6c60e77af11c3f27e62ea3f291a5d436c Mon Sep 17 00:00:00 2001 From: Milan Pandurov Date: Fri, 13 Dec 2019 14:07:21 +0100 Subject: [PATCH 082/204] kvm: Refactor handling of VM debugfs files We can store reference to kvm_stats_debugfs_item instead of copying its values to kvm_stat_data. This allows us to remove duplicated code and usage of temporary kvm_stat_data inside vm_stat_get et al. Signed-off-by: Milan Pandurov Reviewed-by: Alexander Graf Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 7 +- virt/kvm/kvm_main.c | 142 +++++++++++++++++++-------------------- 2 files changed, 76 insertions(+), 73 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 528ab7a814ab4..5e2ec7e295dba 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1108,9 +1108,8 @@ enum kvm_stat_kind { }; struct kvm_stat_data { - int offset; - int mode; struct kvm *kvm; + struct kvm_stats_debugfs_item *dbgfs_item; }; struct kvm_stats_debugfs_item { @@ -1119,6 +1118,10 @@ struct kvm_stats_debugfs_item { enum kvm_stat_kind kind; int mode; }; + +#define KVM_DBGFS_GET_MODE(dbgfs_item) \ + ((dbgfs_item)->mode ? (dbgfs_item)->mode : 0644) + extern struct kvm_stats_debugfs_item debugfs_entries[]; extern struct dentry *kvm_debugfs_dir; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1a6d5ebd5c422..483c683408a22 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -113,7 +113,7 @@ struct dentry *kvm_debugfs_dir; EXPORT_SYMBOL_GPL(kvm_debugfs_dir); static int kvm_debugfs_num_entries; -static const struct file_operations *stat_fops_per_vm[]; +static const struct file_operations stat_fops_per_vm; static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, unsigned long arg); @@ -650,11 +650,11 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) return -ENOMEM; stat_data->kvm = kvm; - stat_data->offset = p->offset; - stat_data->mode = p->mode ? p->mode : 0644; + stat_data->dbgfs_item = p; kvm->debugfs_stat_data[p - debugfs_entries] = stat_data; - debugfs_create_file(p->name, stat_data->mode, kvm->debugfs_dentry, - stat_data, stat_fops_per_vm[p->kind]); + debugfs_create_file(p->name, KVM_DBGFS_GET_MODE(p), + kvm->debugfs_dentry, stat_data, + &stat_fops_per_vm); } return 0; } @@ -4010,8 +4010,9 @@ static int kvm_debugfs_open(struct inode *inode, struct file *file, return -ENOENT; if (simple_attr_open(inode, file, get, - stat_data->mode & S_IWUGO ? set : NULL, - fmt)) { + KVM_DBGFS_GET_MODE(stat_data->dbgfs_item) & 0222 + ? 
set : NULL, + fmt)) { kvm_put_kvm(stat_data->kvm); return -ENOMEM; } @@ -4030,105 +4031,111 @@ static int kvm_debugfs_release(struct inode *inode, struct file *file) return 0; } -static int vm_stat_get_per_vm(void *data, u64 *val) +static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val) { - struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; + *val = *(ulong *)((void *)kvm + offset); - *val = *(ulong *)((void *)stat_data->kvm + stat_data->offset); + return 0; +} + +static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset) +{ + *(ulong *)((void *)kvm + offset) = 0; return 0; } -static int vm_stat_clear_per_vm(void *data, u64 val) +static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val) { - struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; + int i; + struct kvm_vcpu *vcpu; - if (val) - return -EINVAL; + *val = 0; - *(ulong *)((void *)stat_data->kvm + stat_data->offset) = 0; + kvm_for_each_vcpu(i, vcpu, kvm) + *val += *(u64 *)((void *)vcpu + offset); return 0; } -static int vm_stat_get_per_vm_open(struct inode *inode, struct file *file) +static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset) { - __simple_attr_check_format("%llu\n", 0ull); - return kvm_debugfs_open(inode, file, vm_stat_get_per_vm, - vm_stat_clear_per_vm, "%llu\n"); -} + int i; + struct kvm_vcpu *vcpu; -static const struct file_operations vm_stat_get_per_vm_fops = { - .owner = THIS_MODULE, - .open = vm_stat_get_per_vm_open, - .release = kvm_debugfs_release, - .read = simple_attr_read, - .write = simple_attr_write, - .llseek = no_llseek, -}; + kvm_for_each_vcpu(i, vcpu, kvm) + *(u64 *)((void *)vcpu + offset) = 0; + + return 0; +} -static int vcpu_stat_get_per_vm(void *data, u64 *val) +static int kvm_stat_data_get(void *data, u64 *val) { - int i; + int r = -EFAULT; struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; - struct kvm_vcpu *vcpu; - - *val = 0; - kvm_for_each_vcpu(i, vcpu, stat_data->kvm) - *val += *(u64 *)((void *)vcpu + stat_data->offset); + switch (stat_data->dbgfs_item->kind) { + case KVM_STAT_VM: + r = kvm_get_stat_per_vm(stat_data->kvm, + stat_data->dbgfs_item->offset, val); + break; + case KVM_STAT_VCPU: + r = kvm_get_stat_per_vcpu(stat_data->kvm, + stat_data->dbgfs_item->offset, val); + break; + } - return 0; + return r; } -static int vcpu_stat_clear_per_vm(void *data, u64 val) +static int kvm_stat_data_clear(void *data, u64 val) { - int i; + int r = -EFAULT; struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; - struct kvm_vcpu *vcpu; if (val) return -EINVAL; - kvm_for_each_vcpu(i, vcpu, stat_data->kvm) - *(u64 *)((void *)vcpu + stat_data->offset) = 0; + switch (stat_data->dbgfs_item->kind) { + case KVM_STAT_VM: + r = kvm_clear_stat_per_vm(stat_data->kvm, + stat_data->dbgfs_item->offset); + break; + case KVM_STAT_VCPU: + r = kvm_clear_stat_per_vcpu(stat_data->kvm, + stat_data->dbgfs_item->offset); + break; + } - return 0; + return r; } -static int vcpu_stat_get_per_vm_open(struct inode *inode, struct file *file) +static int kvm_stat_data_open(struct inode *inode, struct file *file) { __simple_attr_check_format("%llu\n", 0ull); - return kvm_debugfs_open(inode, file, vcpu_stat_get_per_vm, - vcpu_stat_clear_per_vm, "%llu\n"); + return kvm_debugfs_open(inode, file, kvm_stat_data_get, + kvm_stat_data_clear, "%llu\n"); } -static const struct file_operations vcpu_stat_get_per_vm_fops = { - .owner = THIS_MODULE, - .open = vcpu_stat_get_per_vm_open, +static const struct file_operations stat_fops_per_vm = { + .owner = 
THIS_MODULE, + .open = kvm_stat_data_open, .release = kvm_debugfs_release, - .read = simple_attr_read, - .write = simple_attr_write, - .llseek = no_llseek, -}; - -static const struct file_operations *stat_fops_per_vm[] = { - [KVM_STAT_VCPU] = &vcpu_stat_get_per_vm_fops, - [KVM_STAT_VM] = &vm_stat_get_per_vm_fops, + .read = simple_attr_read, + .write = simple_attr_write, + .llseek = no_llseek, }; static int vm_stat_get(void *_offset, u64 *val) { unsigned offset = (long)_offset; struct kvm *kvm; - struct kvm_stat_data stat_tmp = {.offset = offset}; u64 tmp_val; *val = 0; mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { - stat_tmp.kvm = kvm; - vm_stat_get_per_vm((void *)&stat_tmp, &tmp_val); + kvm_get_stat_per_vm(kvm, offset, &tmp_val); *val += tmp_val; } mutex_unlock(&kvm_lock); @@ -4139,15 +4146,13 @@ static int vm_stat_clear(void *_offset, u64 val) { unsigned offset = (long)_offset; struct kvm *kvm; - struct kvm_stat_data stat_tmp = {.offset = offset}; if (val) return -EINVAL; mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { - stat_tmp.kvm = kvm; - vm_stat_clear_per_vm((void *)&stat_tmp, 0); + kvm_clear_stat_per_vm(kvm, offset); } mutex_unlock(&kvm_lock); @@ -4160,14 +4165,12 @@ static int vcpu_stat_get(void *_offset, u64 *val) { unsigned offset = (long)_offset; struct kvm *kvm; - struct kvm_stat_data stat_tmp = {.offset = offset}; u64 tmp_val; *val = 0; mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { - stat_tmp.kvm = kvm; - vcpu_stat_get_per_vm((void *)&stat_tmp, &tmp_val); + kvm_get_stat_per_vcpu(kvm, offset, &tmp_val); *val += tmp_val; } mutex_unlock(&kvm_lock); @@ -4178,15 +4181,13 @@ static int vcpu_stat_clear(void *_offset, u64 val) { unsigned offset = (long)_offset; struct kvm *kvm; - struct kvm_stat_data stat_tmp = {.offset = offset}; if (val) return -EINVAL; mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { - stat_tmp.kvm = kvm; - vcpu_stat_clear_per_vm((void *)&stat_tmp, 0); + kvm_clear_stat_per_vcpu(kvm, offset); } mutex_unlock(&kvm_lock); @@ -4259,9 +4260,8 @@ static void kvm_init_debug(void) kvm_debugfs_num_entries = 0; for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) { - int mode = p->mode ? p->mode : 0644; - debugfs_create_file(p->name, mode, kvm_debugfs_dir, - (void *)(long)p->offset, + debugfs_create_file(p->name, KVM_DBGFS_GET_MODE(p), + kvm_debugfs_dir, (void *)(long)p->offset, stat_fops[p->kind]); } } From 56871d444bc4d7ea66708775e62e2e0926384dbc Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sat, 18 Jan 2020 20:09:03 +0100 Subject: [PATCH 083/204] KVM: x86: fix overlap between SPTE_MMIO_MASK and generation The SPTE_MMIO_MASK overlaps with the bits used to track MMIO generation number. A high enough generation number would overwrite the SPTE_SPECIAL_MASK region and cause the MMIO SPTE to be misinterpreted. Likewise, setting bits 52 and 53 would also cause an incorrect generation number to be read from the PTE, though this was partially mitigated by the (useless if it weren't for the bug) removal of SPTE_SPECIAL_MASK from the spte in get_mmio_spte_generation. Drop that removal, and replace it with a compile-time assertion. 
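The packing after the fix can be sanity-checked in isolation. A hedged model follows, assuming PT64_SECOND_AVAIL_BITS_SHIFT is 54 (implied by the new GEN_HIGH range in the diff below); it shows an 18-bit generation staying clear of bits 52/53:

    #include <assert.h>
    #include <stdint.h>

    /* gen[8:0] -> spte[11:3], gen[17:9] -> spte[62:54]; a model, not KVM code. */
    #define GEN_LOW_BITS   ((uint64_t)0x1ff << 3)
    #define GEN_HIGH_BITS  ((uint64_t)0x1ff << 54)
    #define SPTE_SPECIAL   ((uint64_t)3 << 52)   /* bits 52 and 53 */

    _Static_assert(((GEN_LOW_BITS | GEN_HIGH_BITS) & SPTE_SPECIAL) == 0,
                   "generation bits must not overlap SPTE_SPECIAL_MASK");

    static uint64_t pack_gen(uint64_t gen)   /* gen < 2^18 */
    {
        return ((gen & 0x1ff) << 3) | (((gen >> 9) & 0x1ff) << 54);
    }

    static uint64_t unpack_gen(uint64_t spte)
    {
        return ((spte >> 3) & 0x1ff) | (((spte >> 54) & 0x1ff) << 9);
    }

    int main(void)
    {
        for (uint64_t gen = 0; gen < (1ULL << 18); gen += 4099)
            assert(unpack_gen(pack_gen(gen)) == gen);
        return 0;
    }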
Fixes: 6eeb4ef049e7 ("KVM: x86: assign two bits to track SPTE kinds") Reported-by: Ben Gardon Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 57e4dbddba72f..b9052c7ba43d2 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -418,22 +418,24 @@ static inline bool is_access_track_spte(u64 spte) * requires a full MMU zap). The flag is instead explicitly queried when * checking for MMIO spte cache hits. */ -#define MMIO_SPTE_GEN_MASK GENMASK_ULL(18, 0) +#define MMIO_SPTE_GEN_MASK GENMASK_ULL(17, 0) #define MMIO_SPTE_GEN_LOW_START 3 #define MMIO_SPTE_GEN_LOW_END 11 #define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \ MMIO_SPTE_GEN_LOW_START) -#define MMIO_SPTE_GEN_HIGH_START 52 -#define MMIO_SPTE_GEN_HIGH_END 61 +#define MMIO_SPTE_GEN_HIGH_START PT64_SECOND_AVAIL_BITS_SHIFT +#define MMIO_SPTE_GEN_HIGH_END 62 #define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \ MMIO_SPTE_GEN_HIGH_START) + static u64 generation_mmio_spte_mask(u64 gen) { u64 mask; WARN_ON(gen & ~MMIO_SPTE_GEN_MASK); + BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK); mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK; mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK; @@ -444,8 +446,6 @@ static u64 get_mmio_spte_generation(u64 spte) { u64 gen; - spte &= ~shadow_mmio_mask; - gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START; gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START; return gen; From 5fcf3a55a62afb0760ccb6f391d62f20bce4a42f Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 10 Dec 2019 15:48:29 +1100 Subject: [PATCH 084/204] tools/kvm_stat: Fix kvm_exit filter name The filter name is fixed to "exit_reason" for some kvm_exit events, no matter what architecture we have. Actually, the filter name ("exit_reason") is only applicable to x86, meaning it's broken on other architectures including aarch64. This fixes the issue by providing various kvm_exit filter names, depending on the architecture we're on. Afterwards, the variable filter name is picked and applied through ioctl(fd, SET_FILTER). Reported-by: Andrew Jones Signed-off-by: Gavin Shan Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- tools/kvm/kvm_stat/kvm_stat | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index ad1b9e646c491..4cf93110c2596 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -270,6 +270,7 @@ class ArchX86(Arch): def __init__(self, exit_reasons): self.sc_perf_evt_open = 298 self.ioctl_numbers = IOCTL_NUMBERS + self.exit_reason_field = 'exit_reason' self.exit_reasons = exit_reasons def debugfs_is_child(self, field): @@ -289,6 +290,7 @@ class ArchPPC(Arch): # numbers depend on the wordsize.
char_ptr_size = ctypes.sizeof(ctypes.c_char_p) self.ioctl_numbers['SET_FILTER'] = 0x80002406 | char_ptr_size << 16 + self.exit_reason_field = 'exit_nr' self.exit_reasons = {} def debugfs_is_child(self, field): @@ -300,6 +302,7 @@ class ArchA64(Arch): def __init__(self): self.sc_perf_evt_open = 241 self.ioctl_numbers = IOCTL_NUMBERS + self.exit_reason_field = 'esr_ec' self.exit_reasons = AARCH64_EXIT_REASONS def debugfs_is_child(self, field): @@ -311,6 +314,7 @@ class ArchS390(Arch): def __init__(self): self.sc_perf_evt_open = 331 self.ioctl_numbers = IOCTL_NUMBERS + self.exit_reason_field = None self.exit_reasons = None def debugfs_is_child(self, field): @@ -541,8 +545,8 @@ class TracepointProvider(Provider): """ filters = {} filters['kvm_userspace_exit'] = ('reason', USERSPACE_EXIT_REASONS) - if ARCH.exit_reasons: - filters['kvm_exit'] = ('exit_reason', ARCH.exit_reasons) + if ARCH.exit_reason_field and ARCH.exit_reasons: + filters['kvm_exit'] = (ARCH.exit_reason_field, ARCH.exit_reasons) return filters def _get_available_fields(self): From 99634e3ec0d4e0df28ae465b10f3613a4ceee58b Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 20 Jan 2020 14:22:55 +0100 Subject: [PATCH 085/204] KVM: x86: list MSR_IA32_UCODE_REV as an emulated MSR Even if it's read-only, it can still be written to by userspace. Let them know by adding it to KVM_GET_MSR_INDEX_LIST. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3e70af42f65b3..9f24f5d168545 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1228,6 +1228,7 @@ static const u32 emulated_msrs_all[] = { MSR_MISC_FEATURES_ENABLES, MSR_AMD64_VIRT_SPEC_CTRL, MSR_IA32_POWER_CTL, + MSR_IA32_UCODE_REV, /* * The following list leaves out MSRs whose values are determined From 4425f567b0dd2cb5ad2b16ce03a8951d0ccf935d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 20 Jan 2020 16:14:37 +0100 Subject: [PATCH 086/204] KVM: async_pf: drop kvm_arch_async_page_present wrappers The wrappers make it less clear that the position of the call to kvm_arch_async_page_present depends on the architecture, and that only one of the two call sites will actually be active. Remove them. 
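The IS_ENABLED() idiom adopted below keeps both call sites in plain C and lets the compiler discard the inactive one. A self-contained sketch of the pattern, with a stub IS_ENABLED() since the kernel's version relies on Kconfig-generated defines:

    #include <stdio.h>

    #define CONFIG_PF_SYNC 1 /* pretend Kconfig said "y" */

    /* Stub: the kernel's IS_ENABLED() is fancier and also copes with
     * options that are not defined at all. */
    #define IS_ENABLED(option) (option)

    static void page_present(void)
    {
        puts("page present");
    }

    static void worker_path(void)
    {
        if (IS_ENABLED(CONFIG_PF_SYNC))  /* dead branch is discarded, but */
            page_present();              /* still parsed and type-checked */
    }

    static void completion_path(void)
    {
        if (!IS_ENABLED(CONFIG_PF_SYNC)) /* exactly one site is active */
            page_present();
    }

    int main(void)
    {
        worker_path();
        completion_path();
        return 0;
    }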
Cc: Andy Lutomirski Cc: Christian Borntraeger Signed-off-by: Paolo Bonzini --- virt/kvm/async_pf.c | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index d8ef708a2ef67..15e5b037f92d8 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -17,21 +17,6 @@ #include "async_pf.h" #include -static inline void kvm_async_page_present_sync(struct kvm_vcpu *vcpu, - struct kvm_async_pf *work) -{ -#ifdef CONFIG_KVM_ASYNC_PF_SYNC - kvm_arch_async_page_present(vcpu, work); -#endif -} -static inline void kvm_async_page_present_async(struct kvm_vcpu *vcpu, - struct kvm_async_pf *work) -{ -#ifndef CONFIG_KVM_ASYNC_PF_SYNC - kvm_arch_async_page_present(vcpu, work); -#endif -} - static struct kmem_cache *async_pf_cache; int kvm_async_pf_init(void) @@ -80,7 +65,8 @@ static void async_pf_execute(struct work_struct *work) if (locked) up_read(&mm->mmap_sem); - kvm_async_page_present_sync(vcpu, apf); + if (IS_ENABLED(CONFIG_KVM_ASYNC_PF_SYNC)) + kvm_arch_async_page_present(vcpu, apf); spin_lock(&vcpu->async_pf.lock); list_add_tail(&apf->link, &vcpu->async_pf.done); @@ -157,7 +143,8 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu) spin_unlock(&vcpu->async_pf.lock); kvm_arch_async_page_ready(vcpu, work); - kvm_async_page_present_async(vcpu, work); + if (!IS_ENABLED(CONFIG_KVM_ASYNC_PF_SYNC)) + kvm_arch_async_page_present(vcpu, work); list_del(&work->queue); vcpu->async_pf.queued--; From 0e20f5e25556c00ee813469d373b00abcf298708 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 13 Dec 2019 13:25:25 +0000 Subject: [PATCH 087/204] KVM: arm/arm64: Cleanup MMIO handling Our MMIO handling is a bit odd, in the sense that it uses an intermediate per-vcpu structure to store the various decoded information that describe the access. But the same information is readily available in the HSR/ESR_EL2 field, and we actually use this field to populate the structure. Let's simplify the whole thing by getting rid of the superfluous structure and save a (tiny) bit of space in the vcpu structure. 
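One piece of the simplified code worth seeing in isolation is the sign extension applied to MMIO reads in kvm_handle_mmio_return(); a runnable model of the XOR trick it uses:

    #include <assert.h>

    /* Model of the sign extension in kvm_handle_mmio_return(). */
    static unsigned long sign_extend_mmio(unsigned long data, unsigned int len)
    {
        unsigned long mask;

        if (len < sizeof(unsigned long)) {
            mask = 1UL << (len * 8 - 1);  /* sign bit of the narrow access */
            data = (data ^ mask) - mask;  /* flip the sign bit; the borrow
                                             propagates it into the top bits */
        }
        return data;
    }

    int main(void)
    {
        assert(sign_extend_mmio(0xffUL, 1) == ~0UL);   /* -1 stays -1 */
        assert(sign_extend_mmio(0x7fUL, 1) == 0x7fUL); /* positive unchanged */
        return 0;
    }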
[32bit fix courtesy of Olof Johansson ] Signed-off-by: Marc Zyngier --- arch/arm/include/asm/kvm_emulate.h | 5 +- arch/arm/include/asm/kvm_host.h | 12 +++-- arch/arm/include/asm/kvm_hyp.h | 1 + arch/arm/include/asm/kvm_mmio.h | 28 ----------- arch/arm64/include/asm/kvm_emulate.h | 3 +- arch/arm64/include/asm/kvm_host.h | 12 +++-- arch/arm64/include/asm/kvm_mmio.h | 27 ----------- virt/kvm/arm/mmio.c | 70 +++++++++------------------- virt/kvm/arm/mmu.c | 1 - 9 files changed, 42 insertions(+), 117 deletions(-) delete mode 100644 arch/arm/include/asm/kvm_mmio.h delete mode 100644 arch/arm64/include/asm/kvm_mmio.h diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h index 08d9805f613b6..3944305e81df6 100644 --- a/arch/arm/include/asm/kvm_emulate.h +++ b/arch/arm/include/asm/kvm_emulate.h @@ -9,7 +9,6 @@ #include #include -#include #include #include @@ -220,7 +219,7 @@ static inline bool kvm_vcpu_dabt_is_cm(struct kvm_vcpu *vcpu) } /* Get Access Size from a data abort */ -static inline int kvm_vcpu_dabt_get_as(struct kvm_vcpu *vcpu) +static inline unsigned int kvm_vcpu_dabt_get_as(struct kvm_vcpu *vcpu) { switch ((kvm_vcpu_get_hsr(vcpu) >> 22) & 0x3) { case 0: @@ -231,7 +230,7 @@ static inline int kvm_vcpu_dabt_get_as(struct kvm_vcpu *vcpu) return 4; default: kvm_err("Hardware is weird: SAS 0b11 is reserved\n"); - return -EFAULT; + return 4; } } diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 556cd818eccf3..bd2233805d995 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -14,7 +14,6 @@ #include #include #include -#include #include #include @@ -202,9 +201,6 @@ struct kvm_vcpu_arch { /* Don't run the guest (internal implementation need) */ bool pause; - /* IO related fields */ - struct kvm_decode mmio_decode; - /* Cache some mmu pages needed inside spinlock regions */ struct kvm_mmu_memory_cache mmu_page_cache; @@ -300,6 +296,14 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, static inline void handle_exit_early(struct kvm_vcpu *vcpu, struct kvm_run *run, int exception_index) {} +/* MMIO helpers */ +void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data); +unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len); + +int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run); +int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, + phys_addr_t fault_ipa); + static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr, unsigned long hyp_stack_ptr, unsigned long vector_ptr) diff --git a/arch/arm/include/asm/kvm_hyp.h b/arch/arm/include/asm/kvm_hyp.h index 40e9034db6013..3c1b55ecc5783 100644 --- a/arch/arm/include/asm/kvm_hyp.h +++ b/arch/arm/include/asm/kvm_hyp.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #define __hyp_text __section(.hyp.text) notrace diff --git a/arch/arm/include/asm/kvm_mmio.h b/arch/arm/include/asm/kvm_mmio.h deleted file mode 100644 index 32fbf82e3ebcd..0000000000000 --- a/arch/arm/include/asm/kvm_mmio.h +++ /dev/null @@ -1,28 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2012 - Virtual Open Systems and Columbia University - * Author: Christoffer Dall - */ - -#ifndef __ARM_KVM_MMIO_H__ -#define __ARM_KVM_MMIO_H__ - -#include -#include -#include - -struct kvm_decode { - unsigned long rt; - bool sign_extend; - /* Not used on 32-bit arm */ - bool sixty_four; -}; - -void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data); -unsigned long 
kvm_mmio_read_buf(const void *buf, unsigned int len); - -int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run); -int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, - phys_addr_t fault_ipa); - -#endif /* __ARM_KVM_MMIO_H__ */ diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 53ea7637b7b23..688c63412cc27 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include @@ -341,7 +340,7 @@ static inline bool kvm_vcpu_dabt_is_cm(const struct kvm_vcpu *vcpu) return !!(kvm_vcpu_get_hsr(vcpu) & ESR_ELx_CM); } -static inline int kvm_vcpu_dabt_get_as(const struct kvm_vcpu *vcpu) +static inline unsigned int kvm_vcpu_dabt_get_as(const struct kvm_vcpu *vcpu) { return 1 << ((kvm_vcpu_get_hsr(vcpu) & ESR_ELx_SAS) >> ESR_ELx_SAS_SHIFT); } diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index c61260cf63c58..f6a77ddab956f 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -24,7 +24,6 @@ #include #include #include -#include #include #define __KVM_HAVE_ARCH_INTC_INITIALIZED @@ -325,9 +324,6 @@ struct kvm_vcpu_arch { /* Don't run the guest (internal implementation need) */ bool pause; - /* IO related fields */ - struct kvm_decode mmio_decode; - /* Cache some mmu pages needed inside spinlock regions */ struct kvm_mmu_memory_cache mmu_page_cache; @@ -491,6 +487,14 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, void handle_exit_early(struct kvm_vcpu *vcpu, struct kvm_run *run, int exception_index); +/* MMIO helpers */ +void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data); +unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len); + +int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run); +int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, + phys_addr_t fault_ipa); + int kvm_perf_init(void); int kvm_perf_teardown(void); diff --git a/arch/arm64/include/asm/kvm_mmio.h b/arch/arm64/include/asm/kvm_mmio.h deleted file mode 100644 index b204501a0c391..0000000000000 --- a/arch/arm64/include/asm/kvm_mmio.h +++ /dev/null @@ -1,27 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2012 - Virtual Open Systems and Columbia University - * Author: Christoffer Dall - */ - -#ifndef __ARM64_KVM_MMIO_H__ -#define __ARM64_KVM_MMIO_H__ - -#include -#include - -struct kvm_decode { - unsigned long rt; - bool sign_extend; - /* Witdth of the register accessed by the faulting instruction is 64-bits */ - bool sixty_four; -}; - -void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data); -unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len); - -int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run); -int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, - phys_addr_t fault_ipa); - -#endif /* __ARM64_KVM_MMIO_H__ */ diff --git a/virt/kvm/arm/mmio.c b/virt/kvm/arm/mmio.c index 1bb71acd53f22..aedfcff99ac58 100644 --- a/virt/kvm/arm/mmio.c +++ b/virt/kvm/arm/mmio.c @@ -5,7 +5,6 @@ */ #include -#include #include #include @@ -92,26 +91,23 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run) vcpu->mmio_needed = 0; - if (!run->mmio.is_write) { - len = run->mmio.len; - if (len > sizeof(unsigned long)) - return -EINVAL; - + if (!kvm_vcpu_dabt_iswrite(vcpu)) { + len = kvm_vcpu_dabt_get_as(vcpu); data = 
kvm_mmio_read_buf(run->mmio.data, len); - if (vcpu->arch.mmio_decode.sign_extend && + if (kvm_vcpu_dabt_issext(vcpu) && len < sizeof(unsigned long)) { mask = 1U << ((len * 8) - 1); data = (data ^ mask) - mask; } - if (!vcpu->arch.mmio_decode.sixty_four) + if (!kvm_vcpu_dabt_issf(vcpu)) data = data & 0xffffffff; trace_kvm_mmio(KVM_TRACE_MMIO_READ, len, run->mmio.phys_addr, &data); data = vcpu_data_host_to_guest(vcpu, data, len); - vcpu_set_reg(vcpu, vcpu->arch.mmio_decode.rt, data); + vcpu_set_reg(vcpu, kvm_vcpu_dabt_get_rd(vcpu), data); } /* @@ -123,36 +119,6 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run) return 0; } -static int decode_hsr(struct kvm_vcpu *vcpu, bool *is_write, int *len) -{ - unsigned long rt; - int access_size; - bool sign_extend; - bool sixty_four; - - if (kvm_vcpu_dabt_iss1tw(vcpu)) { - /* page table accesses IO mem: tell guest to fix its TTBR */ - kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu)); - return 1; - } - - access_size = kvm_vcpu_dabt_get_as(vcpu); - if (unlikely(access_size < 0)) - return access_size; - - *is_write = kvm_vcpu_dabt_iswrite(vcpu); - sign_extend = kvm_vcpu_dabt_issext(vcpu); - sixty_four = kvm_vcpu_dabt_issf(vcpu); - rt = kvm_vcpu_dabt_get_rd(vcpu); - - *len = access_size; - vcpu->arch.mmio_decode.sign_extend = sign_extend; - vcpu->arch.mmio_decode.rt = rt; - vcpu->arch.mmio_decode.sixty_four = sixty_four; - - return 0; -} - int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, phys_addr_t fault_ipa) { @@ -164,15 +130,10 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, u8 data_buf[8]; /* - * Prepare MMIO operation. First decode the syndrome data we get - * from the CPU. Then try if some in-kernel emulation feels - * responsible, otherwise let user space do its magic. + * No valid syndrome? Ask userspace for help if it has + * voluntered to do so, and bail out otherwise. */ - if (kvm_vcpu_dabt_isvalid(vcpu)) { - ret = decode_hsr(vcpu, &is_write, &len); - if (ret) - return ret; - } else { + if (!kvm_vcpu_dabt_isvalid(vcpu)) { if (vcpu->kvm->arch.return_nisv_io_abort_to_user) { run->exit_reason = KVM_EXIT_ARM_NISV; run->arm_nisv.esr_iss = kvm_vcpu_dabt_iss_nisv_sanitized(vcpu); @@ -184,7 +145,20 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, return -ENOSYS; } - rt = vcpu->arch.mmio_decode.rt; + /* Page table accesses IO mem: tell guest to fix its TTBR */ + if (kvm_vcpu_dabt_iss1tw(vcpu)) { + kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu)); + return 1; + } + + /* + * Prepare MMIO operation. First decode the syndrome data we get + * from the CPU. Then try if some in-kernel emulation feels + * responsible, otherwise let user space do its magic. + */ + is_write = kvm_vcpu_dabt_iswrite(vcpu); + len = kvm_vcpu_dabt_get_as(vcpu); + rt = kvm_vcpu_dabt_get_rd(vcpu); if (is_write) { data = vcpu_data_guest_to_host(vcpu, vcpu_get_reg(vcpu, rt), diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index e3ad950131920..a4fa81d75e84d 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include From 290a6bb06de9ec24cecbb11bf4be35411d0b2625 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Mon, 20 Jan 2020 14:08:25 +0100 Subject: [PATCH 088/204] arm64: KVM: Add UAPI notes for swapped registers Two UAPI system register IDs do not derive their values from the ARM system register encodings. This is because their values were accidentally swapped. As the IDs are API, they cannot be changed. Add WARNING notes to point them out. 
Suggested-by: Marc Zyngier Signed-off-by: Andrew Jones [maz: turned XXX into WARNING] Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200120130825.28838-1-drjones@redhat.com --- Documentation/virt/kvm/api.txt | 9 +++++++++ arch/arm64/include/uapi/asm/kvm.h | 12 ++++++++++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/Documentation/virt/kvm/api.txt b/Documentation/virt/kvm/api.txt index ebb37b34dcfc6..3a0c819c35732 100644 --- a/Documentation/virt/kvm/api.txt +++ b/Documentation/virt/kvm/api.txt @@ -2196,6 +2196,15 @@ arm64 CCSIDR registers are demultiplexed by CSSELR value: arm64 system registers have the following id bit patterns: 0x6030 0000 0013 +WARNING: + Two system register IDs do not follow the specified pattern. These + are KVM_REG_ARM_TIMER_CVAL and KVM_REG_ARM_TIMER_CNT, which map to + system registers CNTV_CVAL_EL0 and CNTVCT_EL0 respectively. These + two had their values accidentally swapped, which means TIMER_CVAL is + derived from the register encoding for CNTVCT_EL0 and TIMER_CNT is + derived from the register encoding for CNTV_CVAL_EL0. As this is + API, it must remain this way. + arm64 firmware pseudo-registers have the following bit pattern: 0x6030 0000 0014 diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index 820e5751ada71..ba85bb23f0601 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -220,10 +220,18 @@ struct kvm_vcpu_events { #define KVM_REG_ARM_PTIMER_CVAL ARM64_SYS_REG(3, 3, 14, 2, 2) #define KVM_REG_ARM_PTIMER_CNT ARM64_SYS_REG(3, 3, 14, 0, 1) -/* EL0 Virtual Timer Registers */ +/* + * EL0 Virtual Timer Registers + * + * WARNING: + * KVM_REG_ARM_TIMER_CVAL and KVM_REG_ARM_TIMER_CNT are not defined + * with the appropriate register encodings. Their values have been + * accidentally swapped. As this is set API, the definitions here + * must be used, rather than ones derived from the encodings. + */ #define KVM_REG_ARM_TIMER_CTL ARM64_SYS_REG(3, 3, 14, 3, 1) -#define KVM_REG_ARM_TIMER_CNT ARM64_SYS_REG(3, 3, 14, 3, 2) #define KVM_REG_ARM_TIMER_CVAL ARM64_SYS_REG(3, 3, 14, 0, 2) +#define KVM_REG_ARM_TIMER_CNT ARM64_SYS_REG(3, 3, 14, 3, 2) /* KVM-as-firmware specific pseudo-registers */ #define KVM_REG_ARM_FW (0x0014 << KVM_REG_ARM_COPROC_SHIFT) From 6645d8542ef922486b733d415d2bec3b0622c27e Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 20 Jan 2020 12:47:06 +0000 Subject: [PATCH 089/204] arm64: KVM: Annotate guest entry/exit as a single function In an effort to clarify and simplify the annotations of assembly functions in the kernel new macros have been introduced replacing ENTRY and ENDPROC. There are separate annotations SYM_FUNC_ for normal C functions and SYM_CODE_ for other code. Currently __guest_enter and __guest_exit are annotated as standard functions but this is not entirely correct as the former doesn't do a normal return and the latter is not entered in a normal fashion. From the point of view of the hypervisor the guest entry/exit may be viewed as a single function which happens to have an eret in the middle of it so let's annotate it as such. 
Suggested-by: Mark Rutland Signed-off-by: Mark Brown Signed-off-by: Marc Zyngier Acked-by: Will Deacon Link: https://lore.kernel.org/r/20200120124706.8681-1-broonie@kernel.org --- arch/arm64/kvm/hyp/entry.S | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S index e5cc8d66bf537..5b76a89939b17 100644 --- a/arch/arm64/kvm/hyp/entry.S +++ b/arch/arm64/kvm/hyp/entry.S @@ -44,7 +44,7 @@ * u64 __guest_enter(struct kvm_vcpu *vcpu, * struct kvm_cpu_context *host_ctxt); */ -ENTRY(__guest_enter) +SYM_FUNC_START(__guest_enter) // x0: vcpu // x1: host context // x2-x17: clobbered by macros @@ -96,9 +96,8 @@ alternative_else_nop_endif // Do not touch any register after this! eret sb -ENDPROC(__guest_enter) -ENTRY(__guest_exit) +SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL) // x0: return code // x1: vcpu // x2-x29,lr: vcpu regs @@ -192,4 +191,4 @@ abort_guest_exit_end: msr spsr_el2, x4 orr x0, x0, x5 1: ret -ENDPROC(__guest_exit) +SYM_FUNC_END(__guest_enter) From cf2d23e0bac9f6b5cd1cba8898f5f05ead40e530 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 21 Jan 2020 16:56:59 +1100 Subject: [PATCH 090/204] KVM: arm/arm64: Fix young bit from mmu notifier kvm_test_age_hva() is called upon mmu_notifier_test_young(), but the wrong address range has been passed to handle_hva_to_gpa(). With the wrong address range, no young bits will be checked in handle_hva_to_gpa(). It means zero is always returned from mmu_notifier_test_young(). This fixes the issue by passing the correct address range to the underlying function handle_hva_to_gpa(), so that the hardware young (access) bit will be checked. Fixes: 35307b9a5f7e ("arm/arm64: KVM: Implement Stage-2 page aging") Signed-off-by: Gavin Shan Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200121055659.19560-1-gshan@redhat.com --- virt/kvm/arm/mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index a4fa81d75e84d..8a9db95d1e421 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -2144,7 +2144,8 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) if (!kvm->arch.pgd) return 0; trace_kvm_test_age_hva(hva); - return handle_hva_to_gpa(kvm, hva, hva, kvm_test_age_hva_handler, NULL); + return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE, + kvm_test_age_hva_handler, NULL); } void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu) From 018f22f95e8a6c3e27188b7317ef2c70a34cb2cd Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 21 Jan 2020 12:33:55 +0000 Subject: [PATCH 091/204] KVM: arm: Fix DFSR setting for non-LPAE aarch32 guests Beata reports that KVM_SET_VCPU_EVENTS doesn't inject the expected exception to a non-LPAE aarch32 guest. The host intends to inject DFSR.FS=0x14 "IMPLEMENTATION DEFINED fault (Lockdown fault)", but the guest receives DFSR.FS=0x04 "Fault on instruction cache maintenance". This fault is hooked by do_translation_fault() since ARMv6, which goes on to silently 'handle' the exception, and restart the faulting instruction. It turns out that when TTBCR.EAE is clear, DFSR is split, and FS[4] has to shuffle up to DFSR[10]. As KVM only does this in one place, fix up the static values.
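The split is easy to model: a hypothetical helper (not KVM's code) that places a 5-bit fault status into a short-descriptor DFSR reproduces the 0x404 seen in the log below:

    #include <assert.h>
    #include <stdint.h>

    /* Short-descriptor DFSR: FS[3:0] -> DFSR[3:0], FS[4] -> DFSR[10]. */
    static uint32_t dfsr_from_fs(uint32_t fs)
    {
        return (fs & 0xf) | ((fs & 0x10) << 6); /* move bit 4 up to bit 10 */
    }

    int main(void)
    {
        /* FS=0x14 (Lockdown fault) must surface as 0x404, not 0x14. */
        assert(dfsr_from_fs(0x14) == 0x404);
        return 0;
    }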
We now get the expected: | Unhandled fault: lock abort (0x404) at 0x9c800f00 Fixes: 74a64a981662a ("KVM: arm/arm64: Unify 32bit fault injection") Reported-by: Beata Michalska Signed-off-by: James Morse Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200121123356.203000-2-james.morse@arm.com --- virt/kvm/arm/aarch32.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/virt/kvm/arm/aarch32.c b/virt/kvm/arm/aarch32.c index 631d397ac81b7..2da482ca7067f 100644 --- a/virt/kvm/arm/aarch32.c +++ b/virt/kvm/arm/aarch32.c @@ -181,10 +181,12 @@ static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt, /* Give the guest an IMPLEMENTATION DEFINED exception */ is_lpae = (vcpu_cp15(vcpu, c2_TTBCR) >> 31); - if (is_lpae) + if (is_lpae) { *fsr = 1 << 9 | 0x34; - else - *fsr = 0x14; + } else { + /* Surprise! DFSR's FS[4] lives in bit 10 */ + *fsr = BIT(10) | 0x4; /* 0x14 */ + } } void kvm_inject_dabt32(struct kvm_vcpu *vcpu, unsigned long addr) From 21aecdbd7f3ab02c9b82597dc733ee759fb8b274 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 21 Jan 2020 12:33:56 +0000 Subject: [PATCH 092/204] KVM: arm: Make inject_abt32() inject an external abort instead KVM's inject_abt64() injects an external-abort into an aarch64 guest. The KVM_CAP_ARM_INJECT_EXT_DABT is intended to do exactly this, but for an aarch32 guest inject_abt32() injects an implementation-defined exception, 'Lockdown fault'. Change this to external abort. For non-LPAE we now get the documented: | Unhandled fault: external abort on non-linefetch (0x008) at 0x9c800f00 and for LPAE: | Unhandled fault: synchronous external abort (0x210) at 0x9c800f00 Fixes: 74a64a981662a ("KVM: arm/arm64: Unify 32bit fault injection") Reported-by: Beata Michalska Signed-off-by: James Morse Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200121123356.203000-3-james.morse@arm.com --- virt/kvm/arm/aarch32.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/virt/kvm/arm/aarch32.c b/virt/kvm/arm/aarch32.c index 2da482ca7067f..0a356aa91aa1a 100644 --- a/virt/kvm/arm/aarch32.c +++ b/virt/kvm/arm/aarch32.c @@ -15,6 +15,10 @@ #include #include +#define DFSR_FSC_EXTABT_LPAE 0x10 +#define DFSR_FSC_EXTABT_nLPAE 0x08 +#define DFSR_LPAE BIT(9) + /* * Table taken from ARMv8 ARM DDI0487B-B, table G1-10. */ @@ -182,10 +186,10 @@ static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt, /* Give the guest an IMPLEMENTATION DEFINED exception */ is_lpae = (vcpu_cp15(vcpu, c2_TTBCR) >> 31); if (is_lpae) { - *fsr = 1 << 9 | 0x34; + *fsr = DFSR_LPAE | DFSR_FSC_EXTABT_LPAE; } else { - /* Surprise! DFSR's FS[4] lives in bit 10 */ - *fsr = BIT(10) | 0x4; /* 0x14 */ + /* no need to shuffle FS[4] into DFSR[10] as its 0 */ + *fsr = DFSR_FSC_EXTABT_nLPAE; } } From 6441fa6178f5456d1d4b512c08798888f99db185 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 20 Jan 2020 16:33:06 +0100 Subject: [PATCH 093/204] KVM: x86: avoid incorrect writes to host MSR_IA32_SPEC_CTRL If the guest is configured to have SPEC_CTRL but the host does not (which is a nonsensical configuration but these are not explicitly forbidden) then a host-initiated MSR write can write vmx->spec_ctrl (respectively svm->spec_ctrl) and trigger a #GP when KVM tries to restore the host value of the MSR. Add a more comprehensive check for valid bits of SPEC_CTRL, covering host CPUID flags and, since we are at it and it is more correct that way, guest CPUID flags too. 
For AMD, remove the unnecessary is_guest_mode check around setting the MSR interception bitmap, so that the code looks the same as for Intel. Cc: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 9 +++------ arch/x86/kvm/vmx/vmx.c | 7 +++---- arch/x86/kvm/x86.c | 22 ++++++++++++++++++++++ arch/x86/kvm/x86.h | 1 + 4 files changed, 29 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index b7c5369c79989..235a7e51de962 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4324,12 +4324,10 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)) return 1; - /* The STIBP bit doesn't fault even if it's not advertised */ - if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD)) + if (data & ~kvm_spec_ctrl_valid_bits(vcpu)) return 1; svm->spec_ctrl = data; - if (!data) break; @@ -4353,13 +4351,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) if (data & ~PRED_CMD_IBPB) return 1; - + if (!boot_cpu_has(X86_FEATURE_AMD_IBPB)) + return 1; if (!data) break; wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); - if (is_guest_mode(vcpu)) - break; set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1); break; case MSR_AMD64_VIRT_SPEC_CTRL: diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index bdbf27e92851d..112d2314231df 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1998,12 +1998,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) return 1; - /* The STIBP bit doesn't fault even if it's not advertised */ - if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD)) + if (data & ~kvm_spec_ctrl_valid_bits(vcpu)) return 1; vmx->spec_ctrl = data; - if (!data) break; @@ -2037,7 +2035,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (data & ~PRED_CMD_IBPB) return 1; - + if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL)) + return 1; if (!data) break; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9f24f5d168545..b690c0d707931 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10389,6 +10389,28 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_arch_no_poll); +u64 kvm_spec_ctrl_valid_bits(struct kvm_vcpu *vcpu) +{ + uint64_t bits = SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD; + + /* The STIBP bit doesn't fault even if it's not advertised */ + if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) && + !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS)) + bits &= ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP); + if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL) && + !boot_cpu_has(X86_FEATURE_AMD_IBRS)) + bits &= ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP); + + if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL_SSBD) && + !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)) + bits &= ~SPEC_CTRL_SSBD; + if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) && + !boot_cpu_has(X86_FEATURE_AMD_SSBD)) + bits &= ~SPEC_CTRL_SSBD; + + return bits; +} +EXPORT_SYMBOL_GPL(kvm_spec_ctrl_valid_bits); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index ab715cee36533..dd6e34d0a8811 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -367,5 +367,6 @@ static inline bool kvm_pat_valid(u64 data) void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu); void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu); +u64 kvm_spec_ctrl_valid_bits(struct kvm_vcpu *vcpu); #endif From 
1a978d9d3e72ddfa40ac60d26301b154247ee0bc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:54:46 -0800 Subject: [PATCH 094/204] KVM: PPC: Book3S HV: Uninit vCPU if vcore creation fails Call kvm_vcpu_uninit() if vcore creation fails to avoid leaking any resources allocated by kvm_vcpu_init(), i.e. the vcpu->run page. Fixes: 371fefd6f2dc4 ("KVM: PPC: Allow book3s_hv guests to use SMT processor modes") Cc: stable@vger.kernel.org Reviewed-by: Greg Kurz Signed-off-by: Sean Christopherson Acked-by: Paul Mackerras Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/book3s_hv.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 6ff3f896d9081..ef6aa63b071b3 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2368,7 +2368,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, mutex_unlock(&kvm->lock); if (!vcore) - goto free_vcpu; + goto uninit_vcpu; spin_lock(&vcore->lock); ++vcore->num_threads; @@ -2385,6 +2385,8 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, return vcpu; +uninit_vcpu: + kvm_vcpu_uninit(vcpu); free_vcpu: kmem_cache_free(kvm_vcpu_cache, vcpu); out: From cb10bf9194f4d2c5d830eddca861f7ca0fecdbb4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:54:47 -0800 Subject: [PATCH 095/204] KVM: PPC: Book3S PR: Free shared page if mmu initialization fails Explicitly free the shared page if kvmppc_mmu_init() fails during kvmppc_core_vcpu_create(), as the page is freed only in kvmppc_core_vcpu_free(), which is not reached via kvm_vcpu_uninit(). Fixes: 96bc451a15329 ("KVM: PPC: Introduce shared page") Cc: stable@vger.kernel.org Reviewed-by: Greg Kurz Signed-off-by: Sean Christopherson Acked-by: Paul Mackerras Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/book3s_pr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index ce4fcf76e53e9..26ca62b6d7730 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -1806,10 +1806,12 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_pr(struct kvm *kvm, err = kvmppc_mmu_init(vcpu); if (err < 0) - goto uninit_vcpu; + goto free_shared_page; return vcpu; +free_shared_page: + free_page((unsigned long)vcpu->arch.shared); uninit_vcpu: kvm_vcpu_uninit(vcpu); free_shadow_vcpu: From 16be9ddea268ad841457a59109963fff8c9de38d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:54:48 -0800 Subject: [PATCH 096/204] KVM: x86: Free wbinvd_dirty_mask if vCPU creation fails Free the vCPU's wbinvd_dirty_mask if vCPU creation fails after kvm_arch_vcpu_init(), e.g. when installing the vCPU's file descriptor. Do the freeing by calling kvm_arch_vcpu_free() instead of open coding the freeing. This adds a likely superfluous, but ultimately harmless, call to kvmclock_reset(), which only clears vcpu->arch.pv_time_enabled. Using kvm_arch_vcpu_free() allows for additional cleanup in the future. 
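All three of these fixes restore the usual kernel unwind idiom, in which error labels undo work in the reverse order it was done. In schematic form (illustrative names, not taken from any one of the patches):

    /* alloc_thing(), init_thing_state() and free_thing() are hypothetical */
    int create_thing(struct thing **out)
    {
            struct thing *t;
            int err;

            t = alloc_thing();
            if (!t)
                    return -ENOMEM;         /* nothing to undo yet */

            err = init_thing_state(t);
            if (err)
                    goto err_free;          /* undo exactly what was set up */

            *out = t;
            return 0;

    err_free:
            free_thing(t);
            return err;
    }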
Fixes: f5f48ee15c2ee ("KVM: VMX: Execute WBINVD to keep data consistency with assigned devices") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b690c0d707931..a3eeeb5f303e1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9243,7 +9243,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_mmu_unload(vcpu); vcpu_put(vcpu); - kvm_x86_ops->vcpu_free(vcpu); + kvm_arch_vcpu_free(vcpu); } void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) From 034d8e2cb929ed73e32e2dba98cb8067eab85964 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:54:49 -0800 Subject: [PATCH 097/204] KVM: VMX: Allocate VPID after initializing VCPU Do VPID allocation after calling the common kvm_vcpu_init() as a step towards doing vCPU allocation (via kmem_cache_zalloc()) and calling kvm_vcpu_init() back-to-back. Squishing allocation and initialization together will eventually allow the sequence to be moved to arch-agnostic creation code. Note, the VPID is not consumed until KVM_RUN; slightly delaying its allocation should have no real functional impact. VPID allocation was arbitrarily placed in the original patch, commit 2384d2b326408 ("KVM: VMX: Enable Virtual Processor Identification (VPID)"). Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 112d2314231df..42171b4013a87 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6717,14 +6717,14 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) goto free_user_fpu; } - vmx->vpid = allocate_vpid(); - err = kvm_vcpu_init(&vmx->vcpu, kvm, id); if (err) goto free_vcpu; err = -ENOMEM; + vmx->vpid = allocate_vpid(); + /* * If PML is turned on, failure on enabling PML just results in failure * of creating the vcpu, therefore we can simplify PML logic (by @@ -6835,8 +6835,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) vmx_destroy_pml_buffer(vmx); uninit_vcpu: kvm_vcpu_uninit(&vmx->vcpu); -free_vcpu: free_vpid(vmx->vpid); +free_vcpu: kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.guest_fpu); free_user_fpu: kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.user_fpu); From 34109c0476f10c033945b630a58c087e9d0ef8a0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:54:50 -0800 Subject: [PATCH 098/204] KVM: VMX: Use direct vcpu pointer during vCPU create/free Capture the vcpu pointer in a local variable and replace '&vmx->vcpu' references with a direct reference to the pointer in anticipation of moving bits of the code to common x86 and passing the vcpu pointer into vmx_create_vcpu(), i.e. eliminate unnecessary noise from future patches.
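The substitution is safe because 'struct kvm_vcpu' is required to sit at offset 0 of 'struct vcpu_vmx', so '&vmx->vcpu' and the vcpu pointer are the same address, and to_vmx() is simply a container_of() in the other direction. Schematically:

    BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);

    struct kvm_vcpu *vcpu = &vmx->vcpu;     /* numerically equal to vmx */
    struct vcpu_vmx *back = to_vmx(vcpu);   /* container_of() back-cast; back == vmx */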
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 42171b4013a87..e2da9082df89b 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6682,17 +6682,17 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) nested_vmx_free_vcpu(vcpu); free_loaded_vmcs(vmx->loaded_vmcs); kvm_vcpu_uninit(vcpu); - kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.user_fpu); - kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.guest_fpu); + kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); + kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); kmem_cache_free(kvm_vcpu_cache, vmx); } static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) { - int err; + struct kvm_vcpu *vcpu; struct vcpu_vmx *vmx; unsigned long *msr_bitmap; - int i, cpu; + int i, cpu, err; BUILD_BUG_ON_MSG(offsetof(struct vcpu_vmx, vcpu) != 0, "struct kvm_vcpu must be at offset 0 for arch usercopy region"); @@ -6701,23 +6701,25 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (!vmx) return ERR_PTR(-ENOMEM); - vmx->vcpu.arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, - GFP_KERNEL_ACCOUNT); - if (!vmx->vcpu.arch.user_fpu) { + vcpu = &vmx->vcpu; + + vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, + GFP_KERNEL_ACCOUNT); + if (!vcpu->arch.user_fpu) { printk(KERN_ERR "kvm: failed to allocate kvm userspace's fpu\n"); err = -ENOMEM; goto free_partial_vcpu; } - vmx->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, - GFP_KERNEL_ACCOUNT); - if (!vmx->vcpu.arch.guest_fpu) { + vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, + GFP_KERNEL_ACCOUNT); + if (!vcpu->arch.guest_fpu) { printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n"); err = -ENOMEM; goto free_user_fpu; } - err = kvm_vcpu_init(&vmx->vcpu, kvm, id); + err = kvm_vcpu_init(vcpu, kvm, id); if (err) goto free_vcpu; @@ -6789,12 +6791,12 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) vmx->loaded_vmcs = &vmx->vmcs01; cpu = get_cpu(); - vmx_vcpu_load(&vmx->vcpu, cpu); - vmx->vcpu.cpu = cpu; + vmx_vcpu_load(vcpu, cpu); + vcpu->cpu = cpu; init_vmcs(vmx); - vmx_vcpu_put(&vmx->vcpu); + vmx_vcpu_put(vcpu); put_cpu(); - if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) { + if (cpu_need_virtualize_apic_accesses(vcpu)) { err = alloc_apic_access_page(kvm); if (err) goto free_vmcs; @@ -6809,7 +6811,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (nested) nested_vmx_setup_ctls_msrs(&vmx->nested.msrs, vmx_capability.ept, - kvm_vcpu_apicv_active(&vmx->vcpu)); + kvm_vcpu_apicv_active(vcpu)); else memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs)); @@ -6827,19 +6829,19 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) vmx->ept_pointer = INVALID_PAGE; - return &vmx->vcpu; + return vcpu; free_vmcs: free_loaded_vmcs(vmx->loaded_vmcs); free_pml: vmx_destroy_pml_buffer(vmx); uninit_vcpu: - kvm_vcpu_uninit(&vmx->vcpu); + kvm_vcpu_uninit(vcpu); free_vpid(vmx->vpid); free_vcpu: - kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.guest_fpu); + kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); free_user_fpu: - kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.user_fpu); + kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); free_partial_vcpu: kmem_cache_free(kvm_vcpu_cache, vmx); return ERR_PTR(err); From 7f27179a88a693f2d357860fddef65704578edf5 Mon Sep 17 
00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:54:51 -0800 Subject: [PATCH 099/204] KVM: SVM: Use direct vcpu pointer during vCPU create/free Capture the vcpu pointer in a local variable and replace '&svm->vcpu' references with a direct reference to the pointer in anticipation of moving bits of the code to common x86 and passing the vcpu pointer into svm_create_vcpu(), i.e. eliminate unnecessary noise from future patches. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 235a7e51de962..b0d9045cf1150 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2189,6 +2189,7 @@ static int avic_init_vcpu(struct vcpu_svm *svm) static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) { + struct kvm_vcpu *vcpu; struct vcpu_svm *svm; struct page *page; struct page *msrpm_pages; @@ -2204,24 +2205,25 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) err = -ENOMEM; goto out; } + vcpu = &svm->vcpu; - svm->vcpu.arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, - GFP_KERNEL_ACCOUNT); - if (!svm->vcpu.arch.user_fpu) { + vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, + GFP_KERNEL_ACCOUNT); + if (!vcpu->arch.user_fpu) { printk(KERN_ERR "kvm: failed to allocate kvm userspace's fpu\n"); err = -ENOMEM; goto free_partial_svm; } - svm->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, - GFP_KERNEL_ACCOUNT); - if (!svm->vcpu.arch.guest_fpu) { + vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, + GFP_KERNEL_ACCOUNT); + if (!vcpu->arch.guest_fpu) { printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n"); err = -ENOMEM; goto free_user_fpu; } - err = kvm_vcpu_init(&svm->vcpu, kvm, id); + err = kvm_vcpu_init(vcpu, kvm, id); if (err) goto free_svm; @@ -2265,9 +2267,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) svm->asid_generation = 0; init_vmcb(svm); - svm_init_osvw(&svm->vcpu); + svm_init_osvw(vcpu); - return &svm->vcpu; + return vcpu; free_page4: __free_page(hsave_page); @@ -2278,11 +2280,11 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) free_page1: __free_page(page); uninit: - kvm_vcpu_uninit(&svm->vcpu); + kvm_vcpu_uninit(vcpu); free_svm: - kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.guest_fpu); + kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); free_user_fpu: - kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.user_fpu); + kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); free_partial_svm: kmem_cache_free(kvm_vcpu_cache, svm); out: @@ -2313,8 +2315,8 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) __free_page(virt_to_page(svm->nested.hsave)); __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); kvm_vcpu_uninit(vcpu); - kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.user_fpu); - kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.guest_fpu); + kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); + kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); kmem_cache_free(kvm_vcpu_cache, svm); } From a9dd6f09d7e54d3f58be32d7d051196f7a00e69e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:54:52 -0800 Subject: [PATCH 100/204] KVM: x86: Allocate vcpu struct in common x86 code Move allocation of VMX and SVM vcpus to common x86.
Although the struct being allocated is technically a VMX/SVM struct, it can be interpreted directly as a 'struct kvm_vcpu' because of the pre-existing requirement that 'struct kvm_vcpu' be located at offset zero of the arch/vendor vcpu struct. Remove the message from the build-time assertions regarding placement of the struct, as compatibility with the arch usercopy region is no longer the sole dependent on 'struct kvm_vcpu' being at offset zero. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/svm.c | 28 +++++++++------------------- arch/x86/kvm/vmx/vmx.c | 24 ++++++++---------------- arch/x86/kvm/x86.c | 16 ++++++++++++---- 4 files changed, 30 insertions(+), 40 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0b5c280644e58..aa591a77072b6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1050,7 +1050,7 @@ struct kvm_x86_ops { void (*vm_destroy)(struct kvm *kvm); /* Create, but do not attach this VCPU */ - struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); + int (*vcpu_create)(struct kvm *kvm, struct kvm_vcpu *vcpu, unsigned id); void (*vcpu_free)(struct kvm_vcpu *vcpu); void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index b0d9045cf1150..319c487e22223 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2187,9 +2187,9 @@ static int avic_init_vcpu(struct vcpu_svm *svm) return ret; } -static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) +static int svm_create_vcpu(struct kvm *kvm, struct kvm_vcpu *vcpu, + unsigned int id) { - struct kvm_vcpu *vcpu; struct vcpu_svm *svm; struct page *page; struct page *msrpm_pages; @@ -2197,22 +2197,15 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) struct page *nested_msrpm_pages; int err; - BUILD_BUG_ON_MSG(offsetof(struct vcpu_svm, vcpu) != 0, - "struct kvm_vcpu must be at offset 0 for arch usercopy region"); - - svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); - if (!svm) { - err = -ENOMEM; - goto out; - } - vcpu = &svm->vcpu; + BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0); + svm = to_svm(vcpu); vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, GFP_KERNEL_ACCOUNT); if (!vcpu->arch.user_fpu) { printk(KERN_ERR "kvm: failed to allocate kvm userspace's fpu\n"); err = -ENOMEM; - goto free_partial_svm; + goto out; } vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, @@ -2225,7 +2218,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) err = kvm_vcpu_init(vcpu, kvm, id); if (err) - goto free_svm; + goto free_guest_fpu; err = -ENOMEM; page = alloc_page(GFP_KERNEL_ACCOUNT); @@ -2269,7 +2262,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) svm_init_osvw(vcpu); - return vcpu; + return 0; free_page4: __free_page(hsave_page); @@ -2281,14 +2274,12 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) __free_page(page); uninit: kvm_vcpu_uninit(vcpu); -free_svm: +free_guest_fpu: kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); free_user_fpu: kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); -free_partial_svm: - kmem_cache_free(kvm_vcpu_cache, svm); out: - return ERR_PTR(err); + return err; } static void svm_clear_current_vmcb(struct vmcb *vmcb) @@ -2317,7 +2308,6 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) kvm_vcpu_uninit(vcpu); kmem_cache_free(x86_fpu_cache, 
vcpu->arch.user_fpu); kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); - kmem_cache_free(kvm_vcpu_cache, svm); } static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e2da9082df89b..2cbeb0a638aae 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6684,31 +6684,24 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) kvm_vcpu_uninit(vcpu); kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); - kmem_cache_free(kvm_vcpu_cache, vmx); } -static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) +static int vmx_create_vcpu(struct kvm *kvm, struct kvm_vcpu *vcpu, + unsigned int id) { - struct kvm_vcpu *vcpu; struct vcpu_vmx *vmx; unsigned long *msr_bitmap; int i, cpu, err; - BUILD_BUG_ON_MSG(offsetof(struct vcpu_vmx, vcpu) != 0, - "struct kvm_vcpu must be at offset 0 for arch usercopy region"); - - vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); - if (!vmx) - return ERR_PTR(-ENOMEM); - - vcpu = &vmx->vcpu; + BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0); + vmx = to_vmx(vcpu); vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, GFP_KERNEL_ACCOUNT); if (!vcpu->arch.user_fpu) { printk(KERN_ERR "kvm: failed to allocate kvm userspace's fpu\n"); err = -ENOMEM; - goto free_partial_vcpu; + goto out; } vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, @@ -6829,7 +6822,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) vmx->ept_pointer = INVALID_PAGE; - return vcpu; + return 0; free_vmcs: free_loaded_vmcs(vmx->loaded_vmcs); @@ -6842,9 +6835,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); free_user_fpu: kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); -free_partial_vcpu: - kmem_cache_free(kvm_vcpu_cache, vmx); - return ERR_PTR(err); +out: + return err; } #define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a3eeeb5f303e1..cfcefdbe27846 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9172,26 +9172,34 @@ static void fx_init(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) { - void *wbinvd_dirty_mask = vcpu->arch.wbinvd_dirty_mask; - kvmclock_reset(vcpu); kvm_x86_ops->vcpu_free(vcpu); - free_cpumask_var(wbinvd_dirty_mask); + + free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); + kmem_cache_free(kvm_vcpu_cache, vcpu); } struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { struct kvm_vcpu *vcpu; + int r; if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) printk_once(KERN_WARNING "kvm: SMP vm created on host with unstable TSC; " "guest TSC will not be reliable\n"); - vcpu = kvm_x86_ops->vcpu_create(kvm, id); + vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); + if (!vcpu) + return ERR_PTR(-ENOMEM); + r = kvm_x86_ops->vcpu_create(kvm, vcpu, id); + if (r) { + kmem_cache_free(kvm_vcpu_cache, vcpu); + return ERR_PTR(r); + } return vcpu; } From fc6e2a1845abfcfa335aef5ffaac664e104d72ca Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:54:53 -0800 Subject: [PATCH 101/204] KVM: x86: Move FPU allocation to common x86 code The allocation of FPU structs is identical across VMX and SVM, move it to common x86 code. 
Somewhat arbitrarily place the allocation so that it resides directly above the associated initialization via fx_init(), e.g. instead of retaining its position with respect to the overall vcpu creation flow. Although the names kvm_arch_vcpu_create() and kvm_arch_vcpu_init() might suggest otherwise, x86 does not have a clean split between 'create' and 'init'. Allocating the struct immediately prior to the first use arguably improves readability *now*, and will yield even bigger improvements when kvm_arch_vcpu_init() is removed in a future patch. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 25 +------------------------ arch/x86/kvm/vmx/vmx.c | 25 +------------------------ arch/x86/kvm/x86.c | 21 +++++++++++++++++++++ 3 files changed, 23 insertions(+), 48 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 319c487e22223..e8a5cd44dd59c 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2200,25 +2200,9 @@ static int svm_create_vcpu(struct kvm *kvm, struct kvm_vcpu *vcpu, BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0); svm = to_svm(vcpu); - vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, - GFP_KERNEL_ACCOUNT); - if (!vcpu->arch.user_fpu) { - printk(KERN_ERR "kvm: failed to allocate kvm userspace's fpu\n"); - err = -ENOMEM; - goto out; - } - - vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, - GFP_KERNEL_ACCOUNT); - if (!vcpu->arch.guest_fpu) { - printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n"); - err = -ENOMEM; - goto free_user_fpu; - } - err = kvm_vcpu_init(vcpu, kvm, id); if (err) - goto free_guest_fpu; + return err; err = -ENOMEM; page = alloc_page(GFP_KERNEL_ACCOUNT); @@ -2274,11 +2258,6 @@ static int svm_create_vcpu(struct kvm *kvm, struct kvm_vcpu *vcpu, __free_page(page); uninit: kvm_vcpu_uninit(vcpu); -free_guest_fpu: - kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); -free_user_fpu: - kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); -out: return err; } @@ -2306,8 +2285,6 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) __free_page(virt_to_page(svm->nested.hsave)); __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); kvm_vcpu_uninit(vcpu); - kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); - kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); } static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 2cbeb0a638aae..40c47d2709bb7 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6682,8 +6682,6 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) nested_vmx_free_vcpu(vcpu); free_loaded_vmcs(vmx->loaded_vmcs); kvm_vcpu_uninit(vcpu); - kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); - kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); } static int vmx_create_vcpu(struct kvm *kvm, struct kvm_vcpu *vcpu, @@ -6696,25 +6694,9 @@ static int vmx_create_vcpu(struct kvm *kvm, struct kvm_vcpu *vcpu, BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0); vmx = to_vmx(vcpu); - vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, - GFP_KERNEL_ACCOUNT); - if (!vcpu->arch.user_fpu) { - printk(KERN_ERR "kvm: failed to allocate kvm userspace's fpu\n"); - err = -ENOMEM; - goto out; - } - - vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, - GFP_KERNEL_ACCOUNT); - if (!vcpu->arch.guest_fpu) { - printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n"); - err = -ENOMEM; - goto free_user_fpu; - } - err = kvm_vcpu_init(vcpu, kvm, id); if (err) - goto free_vcpu; + return err; err =
-ENOMEM; @@ -6831,11 +6813,6 @@ static int vmx_create_vcpu(struct kvm *kvm, struct kvm_vcpu *vcpu, uninit_vcpu: kvm_vcpu_uninit(vcpu); free_vpid(vmx->vpid); -free_vcpu: - kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); -free_user_fpu: - kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); -out: return err; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cfcefdbe27846..29d058db3207c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9177,6 +9177,8 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) kvm_x86_ops->vcpu_free(vcpu); free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); + kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); + kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); kmem_cache_free(kvm_vcpu_cache, vcpu); } @@ -9543,6 +9545,21 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) goto fail_free_mce_banks; } + vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, + GFP_KERNEL_ACCOUNT); + if (!vcpu->arch.user_fpu) { + pr_err("kvm: failed to allocate userspace's fpu\n"); + r = -ENOMEM; + goto free_wbinvd_dirty_mask; + } + + vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, + GFP_KERNEL_ACCOUNT); + if (!vcpu->arch.guest_fpu) { + pr_err("kvm: failed to allocate vcpu's fpu\n"); + r = -ENOMEM; + goto free_user_fpu; + } fx_init(vcpu); vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; @@ -9561,6 +9578,10 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) return 0; +free_user_fpu: + kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); +free_wbinvd_dirty_mask: + free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); fail_free_mce_banks: kfree(vcpu->arch.mce_banks); fail_free_lapic: From d813a8ba54f94fd6a0276230bdf53c97b36c2101 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:54:54 -0800 Subject: [PATCH 102/204] KVM: x86: Move allocation of pio_data page down a few lines Allocate the pio_data page after creating the MMU and local APIC so that all direct memory allocations are grouped together. This allows setting the return value to -ENOMEM prior to starting the allocations instead of setting it in the fail path for every allocation. The pio_data page is only consumed when KVM_RUN is invoked, i.e. moving its allocation has no real functional impact. 
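The resulting shape is, roughly (condensed from the hunk below):

    r = -ENOMEM;    /* set once, before the whole block of allocations */

    page = alloc_page(GFP_KERNEL | __GFP_ZERO);
    if (!page)
            goto fail_free_lapic;
    vcpu->arch.pio_data = page_address(page);

    vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
                                   GFP_KERNEL_ACCOUNT);
    if (!vcpu->arch.mce_banks)
            goto fail_free_pio_data;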
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 29d058db3207c..50110bca7d57e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9510,18 +9510,11 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) else vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) { - r = -ENOMEM; - goto fail; - } - vcpu->arch.pio_data = page_address(page); - kvm_set_tsc_khz(vcpu, max_tsc_khz); r = kvm_mmu_create(vcpu); if (r < 0) - goto fail_free_pio_data; + return r; if (irqchip_in_kernel(vcpu->kvm)) { vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu->kvm); @@ -9531,25 +9524,27 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) } else static_key_slow_inc(&kvm_no_apic_vcpu); + r = -ENOMEM; + + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) + goto fail_free_lapic; + vcpu->arch.pio_data = page_address(page); + vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, GFP_KERNEL_ACCOUNT); - if (!vcpu->arch.mce_banks) { - r = -ENOMEM; - goto fail_free_lapic; - } + if (!vcpu->arch.mce_banks) + goto fail_free_pio_data; vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, - GFP_KERNEL_ACCOUNT)) { - r = -ENOMEM; + GFP_KERNEL_ACCOUNT)) goto fail_free_mce_banks; - } vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, GFP_KERNEL_ACCOUNT); if (!vcpu->arch.user_fpu) { pr_err("kvm: failed to allocate userspace's fpu\n"); - r = -ENOMEM; goto free_wbinvd_dirty_mask; } @@ -9557,7 +9552,6 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) GFP_KERNEL_ACCOUNT); if (!vcpu->arch.guest_fpu) { pr_err("kvm: failed to allocate vcpu's fpu\n"); - r = -ENOMEM; goto free_user_fpu; } fx_init(vcpu); @@ -9584,13 +9578,12 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); fail_free_mce_banks: kfree(vcpu->arch.mce_banks); +fail_free_pio_data: + free_page((unsigned long)vcpu->arch.pio_data); fail_free_lapic: kvm_free_lapic(vcpu); fail_mmu_destroy: kvm_mmu_destroy(vcpu); -fail_free_pio_data: - free_page((unsigned long)vcpu->arch.pio_data); -fail: return r; } From 987b2594ed5d128c95c5255a9c7755f7480bf407 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:54:55 -0800 Subject: [PATCH 103/204] KVM: x86: Move kvm_vcpu_init() invocation to common code Move the kvm_vcpu_{un}init() calls to common x86 code as an intermediate step to removing kvm_vcpu_{un}init() altogether. Note, VMX's alloc_apic_access_page() and init_rmode_identity_map() are per-VM allocations and are intentionally kept if vCPU creation fails. They are freed by kvm_arch_destroy_vm(). No functional change intended.
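With this change, the common x86 create path becomes, schematically (condensed from the x86.c hunk below):

    vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
    if (!vcpu)
            return ERR_PTR(-ENOMEM);

    r = kvm_vcpu_init(vcpu, kvm, id);       /* common init first */
    if (r)
            goto free_vcpu;

    r = kvm_x86_ops->vcpu_create(vcpu);     /* then the vendor-specific part */
    if (r)
            goto uninit_vcpu;
    return vcpu;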
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/svm.c | 13 +++---------- arch/x86/kvm/vmx/vmx.c | 19 ++++++------------- arch/x86/kvm/x86.c | 20 +++++++++++++++----- 4 files changed, 25 insertions(+), 29 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index aa591a77072b6..fff9ed6956b56 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1050,7 +1050,7 @@ struct kvm_x86_ops { void (*vm_destroy)(struct kvm *kvm); /* Create, but do not attach this VCPU */ - int (*vcpu_create)(struct kvm *kvm, struct kvm_vcpu *vcpu, unsigned id); + int (*vcpu_create)(struct kvm_vcpu *vcpu); void (*vcpu_free)(struct kvm_vcpu *vcpu); void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e8a5cd44dd59c..83257a7a2e376 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2187,8 +2187,7 @@ static int avic_init_vcpu(struct vcpu_svm *svm) return ret; } -static int svm_create_vcpu(struct kvm *kvm, struct kvm_vcpu *vcpu, - unsigned int id) +static int svm_create_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm; struct page *page; @@ -2200,14 +2199,10 @@ static int svm_create_vcpu(struct kvm *kvm, struct kvm_vcpu *vcpu, BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0); svm = to_svm(vcpu); - err = kvm_vcpu_init(vcpu, kvm, id); - if (err) - return err; - err = -ENOMEM; page = alloc_page(GFP_KERNEL_ACCOUNT); if (!page) - goto uninit; + goto out; msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER); if (!msrpm_pages) @@ -2256,8 +2251,7 @@ static int svm_create_vcpu(struct kvm *kvm, struct kvm_vcpu *vcpu, __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER); free_page1: __free_page(page); -uninit: - kvm_vcpu_uninit(vcpu); +out: return err; } @@ -2284,7 +2278,6 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); __free_page(virt_to_page(svm->nested.hsave)); __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); - kvm_vcpu_uninit(vcpu); } static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 40c47d2709bb7..2134726b04429 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6681,11 +6681,9 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) free_vpid(vmx->vpid); nested_vmx_free_vcpu(vcpu); free_loaded_vmcs(vmx->loaded_vmcs); - kvm_vcpu_uninit(vcpu); } -static int vmx_create_vcpu(struct kvm *kvm, struct kvm_vcpu *vcpu, - unsigned int id) +static int vmx_create_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx; unsigned long *msr_bitmap; @@ -6694,10 +6692,6 @@ static int vmx_create_vcpu(struct kvm *kvm, struct kvm_vcpu *vcpu, BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0); vmx = to_vmx(vcpu); - err = kvm_vcpu_init(vcpu, kvm, id); - if (err) - return err; - err = -ENOMEM; vmx->vpid = allocate_vpid(); @@ -6711,7 +6705,7 @@ static int vmx_create_vcpu(struct kvm *kvm, struct kvm_vcpu *vcpu, if (enable_pml) { vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!vmx->pml_pg) - goto uninit_vcpu; + goto free_vpid; } BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) != NR_SHARED_MSRS); @@ -6756,7 +6750,7 @@ static int vmx_create_vcpu(struct kvm *kvm, struct kvm_vcpu *vcpu, vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); 
vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); - if (kvm_cstate_in_guest(kvm)) { + if (kvm_cstate_in_guest(vcpu->kvm)) { vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C1_RES, MSR_TYPE_R); vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R); vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); @@ -6772,13 +6766,13 @@ static int vmx_create_vcpu(struct kvm *kvm, struct kvm_vcpu *vcpu, vmx_vcpu_put(vcpu); put_cpu(); if (cpu_need_virtualize_apic_accesses(vcpu)) { - err = alloc_apic_access_page(kvm); + err = alloc_apic_access_page(vcpu->kvm); if (err) goto free_vmcs; } if (enable_ept && !enable_unrestricted_guest) { - err = init_rmode_identity_map(kvm); + err = init_rmode_identity_map(vcpu->kvm); if (err) goto free_vmcs; } @@ -6810,8 +6804,7 @@ static int vmx_create_vcpu(struct kvm *kvm, struct kvm_vcpu *vcpu, free_loaded_vmcs(vmx->loaded_vmcs); free_pml: vmx_destroy_pml_buffer(vmx); -uninit_vcpu: - kvm_vcpu_uninit(vcpu); +free_vpid: free_vpid(vmx->vpid); return err; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 50110bca7d57e..51292843afcba 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9176,6 +9176,8 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) kvm_x86_ops->vcpu_free(vcpu); + kvm_vcpu_uninit(vcpu); + free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); @@ -9197,12 +9199,20 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, if (!vcpu) return ERR_PTR(-ENOMEM); - r = kvm_x86_ops->vcpu_create(kvm, vcpu, id); - if (r) { - kmem_cache_free(kvm_vcpu_cache, vcpu); - return ERR_PTR(r); - } + r = kvm_vcpu_init(vcpu, kvm, id); + if (r) + goto free_vcpu; + + r = kvm_x86_ops->vcpu_create(vcpu); + if (r) + goto uninit_vcpu; return vcpu; + +uninit_vcpu: + kvm_vcpu_uninit(vcpu); +free_vcpu: + kmem_cache_free(kvm_vcpu_cache, vcpu); + return ERR_PTR(r); } int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) From 3ec8ca29647078db11a820ab22855dd64d9a4897 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:54:56 -0800 Subject: [PATCH 104/204] KVM: PPC: e500mc: Add build-time assert that vcpu is at offset 0 In preparation for moving vcpu allocation to common PPC code, add an explicit, albeit redundant, build-time assert to ensure the vcpu member is located at offset 0. The assert is redundant in the sense that kvmppc_core_vcpu_create_e500() contains a functionally identical assert. The motivation for adding the extra assert is to provide visual confirmation of the correctness of moving vcpu allocation to common code. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/e500mc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c index 318e65c659992..c51f4bb086fdd 100644 --- a/arch/powerpc/kvm/e500mc.c +++ b/arch/powerpc/kvm/e500mc.c @@ -308,6 +308,8 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_e500mc(struct kvm *kvm, struct kvm_vcpu *vcpu; int err; + BUILD_BUG_ON(offsetof(struct kvmppc_vcpu_e500, vcpu) != 0); + vcpu_e500 = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); if (!vcpu_e500) { err = -ENOMEM; From c50bfbdc38ec56cf8e53afb4f9ebb600dfcabd49 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:54:57 -0800 Subject: [PATCH 105/204] KVM: PPC: Allocate vcpu struct in common PPC code Move allocation of all flavors of PPC vCPUs to common PPC code.
All variants either allocate 'struct kvm_vcpu' directly, or require that the embedded 'struct kvm_vcpu' member be located at offset 0, i.e. guarantee that the allocation can be directly interpreted as a 'struct kvm_vcpu' object. Remove the message from the build-time assertion regarding placement of the struct, as compatibility with the arch usercopy region is no longer the sole dependent on 'struct kvm_vcpu' being at offset zero. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/powerpc/include/asm/kvm_ppc.h | 7 ++++--- arch/powerpc/kvm/book3s.c | 5 +++-- arch/powerpc/kvm/book3s_hv.c | 20 +++++--------------- arch/powerpc/kvm/book3s_pr.c | 18 +++++------------- arch/powerpc/kvm/booke.c | 5 +++-- arch/powerpc/kvm/e500.c | 26 +++++++------------------- arch/powerpc/kvm/e500mc.c | 24 ++++++------------------ arch/powerpc/kvm/powerpc.c | 23 ++++++++++++++++++----- 8 files changed, 51 insertions(+), 77 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 3d2f871241a8c..8f77ca5ace6f8 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -119,8 +119,8 @@ extern int kvmppc_xlate(struct kvm_vcpu *vcpu, ulong eaddr, enum xlate_instdata xlid, enum xlate_readwrite xlrw, struct kvmppc_pte *pte); -extern struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, - unsigned int id); +extern int kvmppc_core_vcpu_create(struct kvm *kvm, struct kvm_vcpu *vcpu, + unsigned int id); extern void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu); extern int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu); extern int kvmppc_core_check_processor_compat(void); @@ -274,7 +274,8 @@ struct kvmppc_ops { void (*inject_interrupt)(struct kvm_vcpu *vcpu, int vec, u64 srr1_flags); void (*set_msr)(struct kvm_vcpu *vcpu, u64 msr); int (*vcpu_run)(struct kvm_run *run, struct kvm_vcpu *vcpu); - struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned int id); + int (*vcpu_create)(struct kvm *kvm, struct kvm_vcpu *vcpu, + unsigned int id); void (*vcpu_free)(struct kvm_vcpu *vcpu); int (*check_requests)(struct kvm_vcpu *vcpu); int (*get_dirty_log)(struct kvm *kvm, struct kvm_dirty_log *log); diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 58a59ee998e29..13385656b90d7 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -789,9 +789,10 @@ void kvmppc_decrementer_func(struct kvm_vcpu *vcpu) kvm_vcpu_kick(vcpu); } -struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +int kvmppc_core_vcpu_create(struct kvm *kvm, struct kvm_vcpu *vcpu, + unsigned int id) { - return kvm->arch.kvm_ops->vcpu_create(kvm, id); + return kvm->arch.kvm_ops->vcpu_create(kvm, vcpu, id); } void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index ef6aa63b071b3..a14fb6a9ea5d9 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2271,22 +2271,16 @@ static void debugfs_vcpu_init(struct kvm_vcpu *vcpu, unsigned int id) } #endif /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */ -static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, - unsigned int id) +static int kvmppc_core_vcpu_create_hv(struct kvm *kvm, struct kvm_vcpu *vcpu, + unsigned int id) { - struct kvm_vcpu *vcpu; int err; int core; struct kvmppc_vcore *vcore; - err = -ENOMEM; - vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); - if (!vcpu) - goto out; - err = kvm_vcpu_init(vcpu, kvm, id); if (err) - goto free_vcpu; + return err; 
vcpu->arch.shared = &vcpu->arch.shregs; #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE @@ -2383,14 +2377,11 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, debugfs_vcpu_init(vcpu, id); - return vcpu; + return 0; uninit_vcpu: kvm_vcpu_uninit(vcpu); -free_vcpu: - kmem_cache_free(kvm_vcpu_cache, vcpu); -out: - return ERR_PTR(err); + return err; } static int kvmhv_set_smt_mode(struct kvm *kvm, unsigned long smt_mode, @@ -2445,7 +2436,6 @@ static void kvmppc_core_vcpu_free_hv(struct kvm_vcpu *vcpu) unpin_vpa(vcpu->kvm, &vcpu->arch.vpa); spin_unlock(&vcpu->arch.vpa_update_lock); kvm_vcpu_uninit(vcpu); - kmem_cache_free(kvm_vcpu_cache, vcpu); } static int kvmppc_core_check_requests_hv(struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 26ca62b6d7730..0d7c8a7bcb7b1 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -1744,21 +1744,16 @@ static int kvmppc_set_one_reg_pr(struct kvm_vcpu *vcpu, u64 id, return r; } -static struct kvm_vcpu *kvmppc_core_vcpu_create_pr(struct kvm *kvm, - unsigned int id) +static int kvmppc_core_vcpu_create_pr(struct kvm *kvm, struct kvm_vcpu *vcpu, + unsigned int id) { struct kvmppc_vcpu_book3s *vcpu_book3s; - struct kvm_vcpu *vcpu; int err = -ENOMEM; unsigned long p; - vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); - if (!vcpu) - goto out; - vcpu_book3s = vzalloc(sizeof(struct kvmppc_vcpu_book3s)); if (!vcpu_book3s) - goto free_vcpu; + goto out; vcpu->arch.book3s = vcpu_book3s; #ifdef CONFIG_KVM_BOOK3S_32_HANDLER @@ -1808,7 +1803,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_pr(struct kvm *kvm, if (err < 0) goto free_shared_page; - return vcpu; + return 0; free_shared_page: free_page((unsigned long)vcpu->arch.shared); @@ -1820,10 +1815,8 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_pr(struct kvm *kvm, free_vcpu3s: #endif vfree(vcpu_book3s); -free_vcpu: - kmem_cache_free(kvm_vcpu_cache, vcpu); out: - return ERR_PTR(err); + return err; } static void kvmppc_core_vcpu_free_pr(struct kvm_vcpu *vcpu) @@ -1836,7 +1829,6 @@ static void kvmppc_core_vcpu_free_pr(struct kvm_vcpu *vcpu) kfree(vcpu->arch.shadow_vcpu); #endif vfree(vcpu_book3s); - kmem_cache_free(kvm_vcpu_cache, vcpu); } static int kvmppc_vcpu_run_pr(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index be9a45874194f..047c9f7077043 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -2114,9 +2114,10 @@ int kvmppc_core_init_vm(struct kvm *kvm) return kvm->arch.kvm_ops->init_vm(kvm); } -struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +int kvmppc_core_vcpu_create(struct kvm *kvm, struct kvm_vcpu *vcpu, + unsigned int id) { - return kvm->arch.kvm_ops->vcpu_create(kvm, id); + return kvm->arch.kvm_ops->vcpu_create(kvm, vcpu, id); } void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index 00649ca5fa9a0..f5dd2c7adcd43 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -433,26 +433,18 @@ static int kvmppc_set_one_reg_e500(struct kvm_vcpu *vcpu, u64 id, return r; } -static struct kvm_vcpu *kvmppc_core_vcpu_create_e500(struct kvm *kvm, - unsigned int id) +static int kvmppc_core_vcpu_create_e500(struct kvm *kvm, struct kvm_vcpu *vcpu, + unsigned int id) { struct kvmppc_vcpu_e500 *vcpu_e500; - struct kvm_vcpu *vcpu; int err; - BUILD_BUG_ON_MSG(offsetof(struct kvmppc_vcpu_e500, vcpu) != 0, - "struct kvm_vcpu must be at offset 0 for 
arch usercopy region"); + BUILD_BUG_ON(offsetof(struct kvmppc_vcpu_e500, vcpu) != 0); + vcpu_e500 = to_e500(vcpu); - vcpu_e500 = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); - if (!vcpu_e500) { - err = -ENOMEM; - goto out; - } - - vcpu = &vcpu_e500->vcpu; err = kvm_vcpu_init(vcpu, kvm, id); if (err) - goto free_vcpu; + return err; if (kvmppc_e500_id_table_alloc(vcpu_e500) == NULL) { err = -ENOMEM; @@ -469,7 +461,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_e500(struct kvm *kvm, goto uninit_tlb; } - return vcpu; + return 0; uninit_tlb: kvmppc_e500_tlb_uninit(vcpu_e500); @@ -477,10 +469,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_e500(struct kvm *kvm, kvmppc_e500_id_table_free(vcpu_e500); uninit_vcpu: kvm_vcpu_uninit(vcpu); -free_vcpu: - kmem_cache_free(kvm_vcpu_cache, vcpu_e500); -out: - return ERR_PTR(err); + return err; } static void kvmppc_core_vcpu_free_e500(struct kvm_vcpu *vcpu) @@ -491,7 +480,6 @@ static void kvmppc_core_vcpu_free_e500(struct kvm_vcpu *vcpu) kvmppc_e500_tlb_uninit(vcpu_e500); kvmppc_e500_id_table_free(vcpu_e500); kvm_vcpu_uninit(vcpu); - kmem_cache_free(kvm_vcpu_cache, vcpu_e500); } static int kvmppc_core_init_vm_e500(struct kvm *kvm) diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c index c51f4bb086fdd..7c0d392f667a5 100644 --- a/arch/powerpc/kvm/e500mc.c +++ b/arch/powerpc/kvm/e500mc.c @@ -301,28 +301,21 @@ static int kvmppc_set_one_reg_e500mc(struct kvm_vcpu *vcpu, u64 id, return r; } -static struct kvm_vcpu *kvmppc_core_vcpu_create_e500mc(struct kvm *kvm, - unsigned int id) +static int kvmppc_core_vcpu_create_e500mc(struct kvm *kvm, struct kvm_vcpu *vcpu, + unsigned int id) { struct kvmppc_vcpu_e500 *vcpu_e500; - struct kvm_vcpu *vcpu; int err; BUILD_BUG_ON(offsetof(struct kvmppc_vcpu_e500, vcpu) != 0); - - vcpu_e500 = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); - if (!vcpu_e500) { - err = -ENOMEM; - goto out; - } - vcpu = &vcpu_e500->vcpu; + vcpu_e500 = to_e500(vcpu); /* Invalid PIR value -- this LPID dosn't have valid state on any cpu */ vcpu->arch.oldpir = 0xffffffff; err = kvm_vcpu_init(vcpu, kvm, id); if (err) - goto free_vcpu; + return err; err = kvmppc_e500_tlb_init(vcpu_e500); if (err) @@ -334,17 +327,13 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_e500mc(struct kvm *kvm, goto uninit_tlb; } - return vcpu; + return 0; uninit_tlb: kvmppc_e500_tlb_uninit(vcpu_e500); uninit_vcpu: kvm_vcpu_uninit(vcpu); - -free_vcpu: - kmem_cache_free(kvm_vcpu_cache, vcpu_e500); -out: - return ERR_PTR(err); + return err; } static void kvmppc_core_vcpu_free_e500mc(struct kvm_vcpu *vcpu) @@ -354,7 +343,6 @@ static void kvmppc_core_vcpu_free_e500mc(struct kvm_vcpu *vcpu) free_page((unsigned long)vcpu->arch.shared); kvmppc_e500_tlb_uninit(vcpu_e500); kvm_vcpu_uninit(vcpu); - kmem_cache_free(kvm_vcpu_cache, vcpu_e500); } static int kvmppc_core_init_vm_e500mc(struct kvm *kvm) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 416fb3d2a1d0b..fd978f681b664 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -723,12 +723,23 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { struct kvm_vcpu *vcpu; - vcpu = kvmppc_core_vcpu_create(kvm, id); - if (!IS_ERR(vcpu)) { - vcpu->arch.wqp = &vcpu->wq; - kvmppc_create_vcpu_debugfs(vcpu, id); - } + int err; + + vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); + if (!vcpu) + return ERR_PTR(-ENOMEM); + + err = kvmppc_core_vcpu_create(kvm, vcpu, id); + if (err) + goto free_vcpu; 
+ + vcpu->arch.wqp = &vcpu->wq; + kvmppc_create_vcpu_debugfs(vcpu, id); return vcpu; + +free_vcpu: + kmem_cache_free(kvm_vcpu_cache, vcpu); + return ERR_PTR(err); } void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) @@ -758,6 +769,8 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) } kvmppc_core_vcpu_free(vcpu); + + kmem_cache_free(kvm_vcpu_cache, vcpu); } void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) From d30769522294fbbd182659614bda16b5da231413 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:54:58 -0800 Subject: [PATCH 106/204] KVM: PPC: Book3S PR: Allocate book3s and shadow vcpu after common init Call kvm_vcpu_init() in kvmppc_core_vcpu_create_pr() prior to allocating the book3s and shadow_vcpu objects in preparation of moving said call to common PPC code. Although kvm_vcpu_init() has an arch callback, the callback is empty for Book3S PR, i.e. barring unseen black magic, moving the allocation has no real functional impact. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/book3s_pr.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 0d7c8a7bcb7b1..10c65d412e812 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -1748,12 +1748,18 @@ static int kvmppc_core_vcpu_create_pr(struct kvm *kvm, struct kvm_vcpu *vcpu, unsigned int id) { struct kvmppc_vcpu_book3s *vcpu_book3s; - int err = -ENOMEM; unsigned long p; + int err; + + err = kvm_vcpu_init(vcpu, kvm, id); + if (err) + return err; + + err = -ENOMEM; vcpu_book3s = vzalloc(sizeof(struct kvmppc_vcpu_book3s)); if (!vcpu_book3s) - goto out; + goto uninit_vcpu; vcpu->arch.book3s = vcpu_book3s; #ifdef CONFIG_KVM_BOOK3S_32_HANDLER @@ -1763,14 +1769,9 @@ static int kvmppc_core_vcpu_create_pr(struct kvm *kvm, struct kvm_vcpu *vcpu, goto free_vcpu3s; #endif - err = kvm_vcpu_init(vcpu, kvm, id); - if (err) - goto free_shadow_vcpu; - - err = -ENOMEM; p = __get_free_page(GFP_KERNEL|__GFP_ZERO); if (!p) - goto uninit_vcpu; + goto free_shadow_vcpu; vcpu->arch.shared = (void *)p; #ifdef CONFIG_PPC_BOOK3S_64 /* Always start the shared struct in native endian mode */ @@ -1807,15 +1808,14 @@ static int kvmppc_core_vcpu_create_pr(struct kvm *kvm, struct kvm_vcpu *vcpu, free_shared_page: free_page((unsigned long)vcpu->arch.shared); -uninit_vcpu: - kvm_vcpu_uninit(vcpu); free_shadow_vcpu: #ifdef CONFIG_KVM_BOOK3S_32_HANDLER kfree(vcpu->arch.shadow_vcpu); free_vcpu3s: #endif vfree(vcpu_book3s); -out: +uninit_vcpu: + kvm_vcpu_uninit(vcpu); return err; } From 4dbf6fec78868ef02e8bf32834f16b22f58723f5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:54:59 -0800 Subject: [PATCH 107/204] KVM: PPC: e500mc: Move reset of oldpir below call to kvm_vcpu_init() Move the initialization of oldpir so that the call to kvm_vcpu_init() is at the top of kvmppc_core_vcpu_create_e500mc(). oldpir is only used when loading/putting a vCPU, which currently cannot be done until after kvm_arch_vcpu_create() completes. Reordering the call to kvm_vcpu_init() paves the way for moving the invocation to common PPC code. No functional change intended.
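Together with the next patch, the common PPC create path ends up mirroring the x86 one (a schematic, condensed from the powerpc.c hunks):

    vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
    if (!vcpu)
            return ERR_PTR(-ENOMEM);

    err = kvm_vcpu_init(vcpu, kvm, id);     /* common init */
    if (err)
            goto free_vcpu;

    err = kvmppc_core_vcpu_create(vcpu);    /* flavor-specific create */
    if (err)
            goto uninit_vcpu;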
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/e500mc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c index 7c0d392f667a5..6c782b8bae0d0 100644 --- a/arch/powerpc/kvm/e500mc.c +++ b/arch/powerpc/kvm/e500mc.c @@ -310,13 +310,13 @@ static int kvmppc_core_vcpu_create_e500mc(struct kvm *kvm, struct kvm_vcpu *vcpu BUILD_BUG_ON(offsetof(struct kvmppc_vcpu_e500, vcpu) != 0); vcpu_e500 = to_e500(vcpu); - /* Invalid PIR value -- this LPID dosn't have valid state on any cpu */ - vcpu->arch.oldpir = 0xffffffff; - err = kvm_vcpu_init(vcpu, kvm, id); if (err) return err; + /* Invalid PIR value -- this LPID dosn't have valid state on any cpu */ + vcpu->arch.oldpir = 0xffffffff; + err = kvmppc_e500_tlb_init(vcpu_e500); if (err) goto uninit_vcpu; From ff030fdf55732266c2d35b1a4a0baaf9ce49e9dd Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:00 -0800 Subject: [PATCH 108/204] KVM: PPC: Move kvm_vcpu_init() invocation to common code Move the kvm_vcpu_{un}init() calls to common PPC code as an intermediate step towards removing kvm_vcpu_{un}init() altogether. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/powerpc/include/asm/kvm_ppc.h | 6 ++---- arch/powerpc/kvm/book3s.c | 5 ++--- arch/powerpc/kvm/book3s_hv.c | 17 ++++++----------- arch/powerpc/kvm/book3s_pr.c | 13 +++---------- arch/powerpc/kvm/booke.c | 5 ++--- arch/powerpc/kvm/e500.c | 16 +++------------- arch/powerpc/kvm/e500mc.c | 12 ++---------- arch/powerpc/kvm/powerpc.c | 10 +++++++++- 8 files changed, 29 insertions(+), 55 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 8f77ca5ace6f8..bc2494e5710a2 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -119,8 +119,7 @@ extern int kvmppc_xlate(struct kvm_vcpu *vcpu, ulong eaddr, enum xlate_instdata xlid, enum xlate_readwrite xlrw, struct kvmppc_pte *pte); -extern int kvmppc_core_vcpu_create(struct kvm *kvm, struct kvm_vcpu *vcpu, - unsigned int id); +extern int kvmppc_core_vcpu_create(struct kvm_vcpu *vcpu); extern void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu); extern int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu); extern int kvmppc_core_check_processor_compat(void); @@ -274,8 +273,7 @@ struct kvmppc_ops { void (*inject_interrupt)(struct kvm_vcpu *vcpu, int vec, u64 srr1_flags); void (*set_msr)(struct kvm_vcpu *vcpu, u64 msr); int (*vcpu_run)(struct kvm_run *run, struct kvm_vcpu *vcpu); - int (*vcpu_create)(struct kvm *kvm, struct kvm_vcpu *vcpu, - unsigned int id); + int (*vcpu_create)(struct kvm_vcpu *vcpu); void (*vcpu_free)(struct kvm_vcpu *vcpu); int (*check_requests)(struct kvm_vcpu *vcpu); int (*get_dirty_log)(struct kvm *kvm, struct kvm_dirty_log *log); diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 13385656b90d7..3f7adcb0ff63c 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -789,10 +789,9 @@ void kvmppc_decrementer_func(struct kvm_vcpu *vcpu) kvm_vcpu_kick(vcpu); } -int kvmppc_core_vcpu_create(struct kvm *kvm, struct kvm_vcpu *vcpu, - unsigned int id) +int kvmppc_core_vcpu_create(struct kvm_vcpu *vcpu) { - return kvm->arch.kvm_ops->vcpu_create(kvm, vcpu, id); + return vcpu->kvm->arch.kvm_ops->vcpu_create(vcpu); } void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index
a14fb6a9ea5d9..f4b72cef09d5b 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2271,16 +2271,16 @@ static void debugfs_vcpu_init(struct kvm_vcpu *vcpu, unsigned int id) } #endif /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */ -static int kvmppc_core_vcpu_create_hv(struct kvm *kvm, struct kvm_vcpu *vcpu, - unsigned int id) +static int kvmppc_core_vcpu_create_hv(struct kvm_vcpu *vcpu) { int err; int core; struct kvmppc_vcore *vcore; + struct kvm *kvm; + unsigned int id; - err = kvm_vcpu_init(vcpu, kvm, id); - if (err) - return err; + kvm = vcpu->kvm; + id = vcpu->vcpu_id; vcpu->arch.shared = &vcpu->arch.shregs; #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE @@ -2362,7 +2362,7 @@ static int kvmppc_core_vcpu_create_hv(struct kvm *kvm, struct kvm_vcpu *vcpu, mutex_unlock(&kvm->lock); if (!vcore) - goto uninit_vcpu; + return err; spin_lock(&vcore->lock); ++vcore->num_threads; @@ -2378,10 +2378,6 @@ static int kvmppc_core_vcpu_create_hv(struct kvm *kvm, struct kvm_vcpu *vcpu, debugfs_vcpu_init(vcpu, id); return 0; - -uninit_vcpu: - kvm_vcpu_uninit(vcpu); - return err; } static int kvmhv_set_smt_mode(struct kvm *kvm, unsigned long smt_mode, @@ -2435,7 +2431,6 @@ static void kvmppc_core_vcpu_free_hv(struct kvm_vcpu *vcpu) unpin_vpa(vcpu->kvm, &vcpu->arch.slb_shadow); unpin_vpa(vcpu->kvm, &vcpu->arch.vpa); spin_unlock(&vcpu->arch.vpa_update_lock); - kvm_vcpu_uninit(vcpu); } static int kvmppc_core_check_requests_hv(struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 10c65d412e812..d88f708d5be32 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -1744,22 +1744,17 @@ static int kvmppc_set_one_reg_pr(struct kvm_vcpu *vcpu, u64 id, return r; } -static int kvmppc_core_vcpu_create_pr(struct kvm *kvm, struct kvm_vcpu *vcpu, - unsigned int id) +static int kvmppc_core_vcpu_create_pr(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_book3s *vcpu_book3s; unsigned long p; int err; - err = kvm_vcpu_init(vcpu, kvm, id); - if (err) - return err; - err = -ENOMEM; vcpu_book3s = vzalloc(sizeof(struct kvmppc_vcpu_book3s)); if (!vcpu_book3s) - goto uninit_vcpu; + goto out; vcpu->arch.book3s = vcpu_book3s; #ifdef CONFIG_KVM_BOOK3S_32_HANDLER @@ -1814,8 +1809,7 @@ static int kvmppc_core_vcpu_create_pr(struct kvm *kvm, struct kvm_vcpu *vcpu, free_vcpu3s: #endif vfree(vcpu_book3s); -uninit_vcpu: - kvm_vcpu_uninit(vcpu); +out: return err; } @@ -1824,7 +1818,6 @@ static void kvmppc_core_vcpu_free_pr(struct kvm_vcpu *vcpu) struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); free_page((unsigned long)vcpu->arch.shared & PAGE_MASK); - kvm_vcpu_uninit(vcpu); #ifdef CONFIG_KVM_BOOK3S_32_HANDLER kfree(vcpu->arch.shadow_vcpu); #endif diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 047c9f7077043..d41765157f0e2 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -2114,10 +2114,9 @@ int kvmppc_core_init_vm(struct kvm *kvm) return kvm->arch.kvm_ops->init_vm(kvm); } -int kvmppc_core_vcpu_create(struct kvm *kvm, struct kvm_vcpu *vcpu, - unsigned int id) +int kvmppc_core_vcpu_create(struct kvm_vcpu *vcpu) { - return kvm->arch.kvm_ops->vcpu_create(kvm, vcpu, id); + return vcpu->kvm->arch.kvm_ops->vcpu_create(vcpu); } void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index f5dd2c7adcd43..f2b4feaff6d22 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -433,8 +433,7 @@ static int kvmppc_set_one_reg_e500(struct kvm_vcpu *vcpu, u64 
id, return r; } -static int kvmppc_core_vcpu_create_e500(struct kvm *kvm, struct kvm_vcpu *vcpu, - unsigned int id) +static int kvmppc_core_vcpu_create_e500(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_e500 *vcpu_e500; int err; @@ -442,14 +441,8 @@ static int kvmppc_core_vcpu_create_e500(struct kvm *kvm, struct kvm_vcpu *vcpu, BUILD_BUG_ON(offsetof(struct kvmppc_vcpu_e500, vcpu) != 0); vcpu_e500 = to_e500(vcpu); - err = kvm_vcpu_init(vcpu, kvm, id); - if (err) - return err; - - if (kvmppc_e500_id_table_alloc(vcpu_e500) == NULL) { - err = -ENOMEM; - goto uninit_vcpu; - } + if (kvmppc_e500_id_table_alloc(vcpu_e500) == NULL) + return -ENOMEM; err = kvmppc_e500_tlb_init(vcpu_e500); if (err) @@ -467,8 +460,6 @@ static int kvmppc_core_vcpu_create_e500(struct kvm *kvm, struct kvm_vcpu *vcpu, kvmppc_e500_tlb_uninit(vcpu_e500); uninit_id: kvmppc_e500_id_table_free(vcpu_e500); -uninit_vcpu: - kvm_vcpu_uninit(vcpu); return err; } @@ -479,7 +470,6 @@ static void kvmppc_core_vcpu_free_e500(struct kvm_vcpu *vcpu) free_page((unsigned long)vcpu->arch.shared); kvmppc_e500_tlb_uninit(vcpu_e500); kvmppc_e500_id_table_free(vcpu_e500); - kvm_vcpu_uninit(vcpu); } static int kvmppc_core_init_vm_e500(struct kvm *kvm) diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c index 6c782b8bae0d0..e6b06cb2b92c8 100644 --- a/arch/powerpc/kvm/e500mc.c +++ b/arch/powerpc/kvm/e500mc.c @@ -301,8 +301,7 @@ static int kvmppc_set_one_reg_e500mc(struct kvm_vcpu *vcpu, u64 id, return r; } -static int kvmppc_core_vcpu_create_e500mc(struct kvm *kvm, struct kvm_vcpu *vcpu, - unsigned int id) +static int kvmppc_core_vcpu_create_e500mc(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_e500 *vcpu_e500; int err; @@ -310,16 +309,12 @@ static int kvmppc_core_vcpu_create_e500mc(struct kvm *kvm, struct kvm_vcpu *vcpu BUILD_BUG_ON(offsetof(struct kvmppc_vcpu_e500, vcpu) != 0); vcpu_e500 = to_e500(vcpu); - err = kvm_vcpu_init(vcpu, kvm, id); - if (err) - return err; - /* Invalid PIR value -- this LPID dosn't have valid state on any cpu */ vcpu->arch.oldpir = 0xffffffff; err = kvmppc_e500_tlb_init(vcpu_e500); if (err) - goto uninit_vcpu; + return err; vcpu->arch.shared = (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO); if (!vcpu->arch.shared) { @@ -331,8 +326,6 @@ static int kvmppc_core_vcpu_create_e500mc(struct kvm *kvm, struct kvm_vcpu *vcpu uninit_tlb: kvmppc_e500_tlb_uninit(vcpu_e500); -uninit_vcpu: - kvm_vcpu_uninit(vcpu); return err; } @@ -342,7 +335,6 @@ static void kvmppc_core_vcpu_free_e500mc(struct kvm_vcpu *vcpu) free_page((unsigned long)vcpu->arch.shared); kvmppc_e500_tlb_uninit(vcpu_e500); - kvm_vcpu_uninit(vcpu); } static int kvmppc_core_init_vm_e500mc(struct kvm *kvm) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index fd978f681b664..173f57e0a1b5e 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -729,14 +729,20 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) if (!vcpu) return ERR_PTR(-ENOMEM); - err = kvmppc_core_vcpu_create(kvm, vcpu, id); + err = kvm_vcpu_init(vcpu, kvm, id); if (err) goto free_vcpu; + err = kvmppc_core_vcpu_create(vcpu); + if (err) + goto uninit_vcpu; + vcpu->arch.wqp = &vcpu->wq; kvmppc_create_vcpu_debugfs(vcpu, id); return vcpu; +uninit_vcpu: + kvm_vcpu_uninit(vcpu); free_vcpu: kmem_cache_free(kvm_vcpu_cache, vcpu); return ERR_PTR(err); @@ -770,6 +776,8 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) kvmppc_core_vcpu_free(vcpu); + kvm_vcpu_uninit(vcpu); + kmem_cache_free(kvm_vcpu_cache, vcpu); } From 
5233009fab8e0a037c38a8c2b28ba4b3df203935 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:01 -0800 Subject: [PATCH 109/204] KVM: MIPS: Use kvm_vcpu_cache to allocate vCPUs For reasons unknown, MIPS configures the vCPU allocation cache but allocates vCPUs via kzalloc(). Allocate from the vCPU cache in preparation for moving vCPU allocation to common KVM code. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 1109924560d8c..5f985773417ca 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -286,7 +286,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) void *gebase, *p, *handler, *refill_start, *refill_end; int i; - struct kvm_vcpu *vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL); + struct kvm_vcpu *vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); if (!vcpu) { err = -ENOMEM; @@ -401,7 +401,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) kvm_vcpu_uninit(vcpu); out_free_cpu: - kfree(vcpu); + kmem_cache_free(kvm_vcpu_cache, vcpu); out: return ERR_PTR(err); @@ -418,7 +418,7 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) kvm_mmu_free_memory_caches(vcpu); kfree(vcpu->arch.guest_ebase); kfree(vcpu->arch.kseg0_commpage); - kfree(vcpu); + kmem_cache_free(kvm_vcpu_cache, vcpu); } void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) From 47d51e5eb5fe0bd7440cf7f1217936e2d85c63cf Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:02 -0800 Subject: [PATCH 110/204] KVM: MIPS: Drop kvm_arch_vcpu_free() Remove the superfluous kvm_arch_vcpu_free() as it is no longer called from common KVM code. Note, kvm_arch_vcpu_destroy() *is* called from common code, i.e. choosing which function to whack is not completely arbitrary. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 5f985773417ca..d72bceb10439f 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -156,7 +156,7 @@ void kvm_mips_free_vcpus(struct kvm *kvm) struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) { - kvm_arch_vcpu_free(vcpu); + kvm_arch_vcpu_destroy(vcpu); } mutex_lock(&kvm->lock); @@ -407,7 +407,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) return ERR_PTR(err); } -void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) +void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { hrtimer_cancel(&vcpu->arch.comparecount_timer); @@ -421,11 +421,6 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_mmu_free_memory_caches(vcpu); kfree(vcpu->arch.guest_ebase); kfree(vcpu->arch.kseg0_commpage); kmem_cache_free(kvm_vcpu_cache, vcpu); } -void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) -{ - kvm_arch_vcpu_free(vcpu); -} - int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { From d5279f3a882c3c5b57703b602d872d3bc7ffa51d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:03 -0800 Subject: [PATCH 111/204] KVM: PPC: Drop kvm_arch_vcpu_free() Remove the superfluous kvm_arch_vcpu_free() as it is no longer called from common KVM code. Note, kvm_arch_vcpu_destroy() *is* called from common code, i.e. choosing which function to whack is not completely arbitrary.
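Schematically, each affected architecture had degenerated to the shape below (condensed from the diffs in these patches, not literal code), so the wrapper can simply be collapsed into its caller:

	/* Before: destroy was a thin wrapper around free ... */
	void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
	{
		kvm_arch_vcpu_free(vcpu);
	}

	/* After: the former body of kvm_arch_vcpu_free() *becomes*
	 * kvm_arch_vcpu_destroy(), and the wrapper disappears. */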
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/powerpc.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 173f57e0a1b5e..a2ba708f5cec8 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -475,7 +475,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) #endif kvm_for_each_vcpu(i, vcpu, kvm) - kvm_arch_vcpu_free(vcpu); + kvm_arch_vcpu_destroy(vcpu); mutex_lock(&kvm->lock); for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) @@ -752,7 +752,7 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { } -void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) +void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { /* Make sure we're not using the vcpu anymore */ hrtimer_cancel(&vcpu->arch.dec_timer); @@ -781,11 +781,6 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) kmem_cache_free(kvm_vcpu_cache, vcpu); } -void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) -{ - kvm_arch_vcpu_free(vcpu); -} - int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) { return kvmppc_core_pending_dec(vcpu); From 4b8fff780b07570993a966475b7500b6964cd91e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:04 -0800 Subject: [PATCH 112/204] KVM: arm: Drop kvm_arch_vcpu_free() Remove the superfluous kvm_arch_vcpu_free() as it is no longer called from common KVM code. Note, kvm_arch_vcpu_destroy() *is* called from common code, i.e. choosing which function to whack is not completely arbitrary. Acked-by: Christoffer Dall Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- virt/kvm/arm/arm.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 8de4daf25097d..adae134cec592 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -194,7 +194,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) for (i = 0; i < KVM_MAX_VCPUS; ++i) { if (kvm->vcpus[i]) { - kvm_arch_vcpu_free(kvm->vcpus[i]); + kvm_arch_vcpu_destroy(kvm->vcpus[i]); kvm->vcpus[i] = NULL; } } @@ -321,7 +321,7 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { } -void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) +void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { if (vcpu->arch.has_run_once && unlikely(!irqchip_in_kernel(vcpu->kvm))) static_branch_dec(&userspace_irqchip_in_use); @@ -333,11 +333,6 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kmem_cache_free(kvm_vcpu_cache, vcpu); } -void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) -{ - kvm_arch_vcpu_free(vcpu); -} - int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) { return kvm_timer_is_pending(vcpu); From 9d979c7e6ff43ca3200ffcb74f57415fd633a2da Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:05 -0800 Subject: [PATCH 113/204] KVM: x86: Remove spurious kvm_mmu_unload() from vcpu destruction path x86 does not load its MMU until KVM_RUN, which cannot be invoked until after vCPU creation succeeds. Given that kvm_arch_vcpu_destroy() is called if and only if vCPU creation fails, it is impossible for the MMU to be loaded. Note, the bogus kvm_mmu_unload() call was added during an unrelated refactoring of vCPU allocation, i.e. was presumably added as an opportunistic "fix" for a perceived leak.
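For reference, a condensed sketch of kvm_arch_vcpu_destroy() after the removal (the comment is added here for illustration; it is not in the patch):

	void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
	{
		vcpu->arch.apf.msr_val = 0;

		/*
		 * No kvm_mmu_unload(): the MMU is first loaded via KVM_RUN,
		 * which requires a successfully created vCPU, and this
		 * destructor runs only when creation failed.
		 */
		kvm_arch_vcpu_free(vcpu);
	}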
Fixes: fb3f0f51d92d1 ("KVM: Dynamically allocate vcpus") Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 51292843afcba..b731fc7d0306b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9259,10 +9259,6 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { vcpu->arch.apf.msr_val = 0; - vcpu_load(vcpu); - kvm_mmu_unload(vcpu); - vcpu_put(vcpu); - kvm_arch_vcpu_free(vcpu); } From 208050dac5ef4de5cb83ffcafa78499c94d0b5ad Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:06 -0800 Subject: [PATCH 114/204] KVM: x86: Remove spurious clearing of async #PF MSR Remove a bogus clearing of apf.msr_val from kvm_arch_vcpu_destroy(). apf.msr_val is only set to a non-zero value by kvm_pv_enable_async_pf(), which is only reachable by kvm_set_msr_common(), i.e. by writing MSR_KVM_ASYNC_PF_EN. KVM does not autonomously write said MSR, i.e. it can only be written via KVM_SET_MSRS or KVM_RUN. Since KVM_SET_MSRS and KVM_RUN are vcpu ioctls, they require a valid vcpu file descriptor. kvm_arch_vcpu_destroy() is only called if KVM_CREATE_VCPU fails, and KVM declares KVM_CREATE_VCPU successful once the vcpu fd is installed and thus visible to userspace. Ergo, apf.msr_val cannot be non-zero when kvm_arch_vcpu_destroy() is called. Fixes: 344d9588a9df0 ("KVM: Add PV MSR to enable asynchronous page faults delivery.") Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b731fc7d0306b..0c3633f9559d5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9257,8 +9257,6 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { - vcpu->arch.apf.msr_val = 0; - kvm_arch_vcpu_free(vcpu); } From 50b143e1b3cfb71c38bdd20dd64c98aa3528117f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:07 -0800 Subject: [PATCH 115/204] KVM: x86: Drop kvm_arch_vcpu_free() Remove the superfluous kvm_arch_vcpu_free() as it is no longer called from common KVM code. Note, kvm_arch_vcpu_destroy() *is* called from common code, i.e. choosing which function to whack is not completely arbitrary.
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0c3633f9559d5..8188d6cac5888 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9170,20 +9170,6 @@ static void fx_init(struct kvm_vcpu *vcpu) vcpu->arch.cr0 |= X86_CR0_ET; } -void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) -{ - kvmclock_reset(vcpu); - - kvm_x86_ops->vcpu_free(vcpu); - - kvm_vcpu_uninit(vcpu); - - free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); - kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); - kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); - kmem_cache_free(kvm_vcpu_cache, vcpu); -} - struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { @@ -9257,7 +9243,16 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { - kvm_arch_vcpu_free(vcpu); + kvmclock_reset(vcpu); + + kvm_x86_ops->vcpu_free(vcpu); + + kvm_vcpu_uninit(vcpu); + + free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); + kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); + kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); + kmem_cache_free(kvm_vcpu_cache, vcpu); } void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -9681,7 +9676,7 @@ static void kvm_free_vcpus(struct kvm *kvm) kvm_unload_vcpu_mmu(vcpu); } kvm_for_each_vcpu(i, vcpu, kvm) - kvm_arch_vcpu_free(vcpu); + kvm_arch_vcpu_destroy(vcpu); mutex_lock(&kvm->lock); for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) From fe931f12277186d1a9d38ba6729b42e8edb68988 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:08 -0800 Subject: [PATCH 116/204] KVM: Remove kvm_arch_vcpu_free() declaration Remove KVM's declaration of kvm_arch_vcpu_free() now that the function is gone from all architectures (several architectures were relying on the forward declaration). Acked-by: Christoffer Dall Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5e2ec7e295dba..4f7c8e2f378d5 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -872,7 +872,6 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu); void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu); -void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu); void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu); struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id); From 897cc38eaab96d006ab17edd0f50a2f432f584cf Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:09 -0800 Subject: [PATCH 117/204] KVM: Add kvm_arch_vcpu_precreate() to handle pre-allocation issues Add a pre-allocation arch hook to handle checks that are currently done by arch specific code prior to allocating the vCPU object. This paves the way for moving the allocation to common KVM code. 
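Schematically (a condensed sketch of the kvm_main.c hunk in this patch; locking and the rest of the function are elided), the hook slots in before the vCPU object exists:

	static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
	{
		struct kvm_vcpu *vcpu;
		int r;

		/* ... created_vcpus bookkeeping under kvm->lock ... */

		/* New arch veto point; runs before any allocation. */
		r = kvm_arch_vcpu_precreate(kvm, id);
		if (r)
			goto vcpu_decrement;

		vcpu = kvm_arch_vcpu_create(kvm, id);
		/* ... */
	}

Architectures with nothing to check simply return 0 from their kvm_arch_vcpu_precreate() implementation.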
Acked-by: Christoffer Dall Signed-off-by: Sean Christopherson Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 5 +++++ arch/powerpc/kvm/powerpc.c | 5 +++++ arch/s390/kvm/kvm-s390.c | 12 ++++++++---- arch/x86/kvm/x86.c | 14 +++++++++----- include/linux/kvm_host.h | 1 + virt/kvm/arm/arm.c | 21 +++++++++++---------- virt/kvm/kvm_main.c | 4 ++++ 7 files changed, 43 insertions(+), 19 deletions(-) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index d72bceb10439f..2e14455aec4ea 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -280,6 +280,11 @@ static inline void dump_handler(const char *symbol, void *start, void *end) pr_debug("\tEND(%s)\n", symbol); } +int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) +{ + return 0; +} + struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { int err, size; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index a2ba708f5cec8..998ef60ac4630 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -720,6 +720,11 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, kvmppc_core_flush_memslot(kvm, slot); } +int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) +{ + return 0; +} + struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { struct kvm_vcpu *vcpu; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index d9e6bf3d54f0f..57c6838dff373 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -3035,15 +3035,19 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) return rc; } +int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) +{ + if (!kvm_is_ucontrol(kvm) && !sca_can_add_vcpu(kvm, id)) + return -EINVAL; + return 0; +} + struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { struct kvm_vcpu *vcpu; struct sie_page *sie_page; - int rc = -EINVAL; - - if (!kvm_is_ucontrol(kvm) && !sca_can_add_vcpu(kvm, id)) - goto out; + int rc; rc = -ENOMEM; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8188d6cac5888..661e3c40529f4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9170,17 +9170,21 @@ static void fx_init(struct kvm_vcpu *vcpu) vcpu->arch.cr0 |= X86_CR0_ET; } +int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) +{ + if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) + pr_warn_once("kvm: SMP vm created on host with unstable TSC; " + "guest TSC will not be reliable\n"); + + return 0; +} + struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { struct kvm_vcpu *vcpu; int r; - if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) - printk_once(KERN_WARNING - "kvm: SMP vm created on host with unstable TSC; " - "guest TSC will not be reliable\n"); - vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); if (!vcpu) return ERR_PTR(-ENOMEM); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4f7c8e2f378d5..59ac534233610 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -874,6 +874,7 @@ void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu); void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu); void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu); +int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id); struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id); int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c 
index adae134cec592..af3ce2bb370d7 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -279,21 +279,22 @@ void kvm_arch_free_vm(struct kvm *kvm) vfree(kvm); } +int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) +{ + if (irqchip_in_kernel(kvm) && vgic_initialized(kvm)) + return -EBUSY; + + if (id >= kvm->arch.max_vcpus) + return -EINVAL; + + return 0; +} + struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { int err; struct kvm_vcpu *vcpu; - if (irqchip_in_kernel(kvm) && vgic_initialized(kvm)) { - err = -EBUSY; - goto out; - } - - if (id >= kvm->arch.max_vcpus) { - err = -EINVAL; - goto out; - } - vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); if (!vcpu) { err = -ENOMEM; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 483c683408a22..7b52207f829c9 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2728,6 +2728,10 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) kvm->created_vcpus++; mutex_unlock(&kvm->lock); + r = kvm_arch_vcpu_precreate(kvm, id); + if (r) + goto vcpu_decrement; + vcpu = kvm_arch_vcpu_create(kvm, id); if (IS_ERR(vcpu)) { r = PTR_ERR(vcpu); From 321f8ee559d697d69efa81e8b6d4ea1e487c8bcc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:10 -0800 Subject: [PATCH 118/204] KVM: s390: Move guts of kvm_arch_vcpu_init() into kvm_arch_vcpu_create() Move all of kvm_arch_vcpu_init(), which is invoked at the very end of kvm_vcpu_init(), into kvm_arch_vcpu_create() in preparation of moving the call to kvm_vcpu_init(). Moving kvm_vcpu_init() is itself a preparatory step for moving allocation and initialization to common KVM code. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/s390/kvm/kvm-s390.c | 62 ++++++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 28 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 57c6838dff373..0049b621e56aa 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2705,34 +2705,6 @@ static int sca_can_add_vcpu(struct kvm *kvm, unsigned int id) int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { - vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID; - kvm_clear_async_pf_completion_queue(vcpu); - vcpu->run->kvm_valid_regs = KVM_SYNC_PREFIX | - KVM_SYNC_GPRS | - KVM_SYNC_ACRS | - KVM_SYNC_CRS | - KVM_SYNC_ARCH0 | - KVM_SYNC_PFAULT; - kvm_s390_set_prefix(vcpu, 0); - if (test_kvm_facility(vcpu->kvm, 64)) - vcpu->run->kvm_valid_regs |= KVM_SYNC_RICCB; - if (test_kvm_facility(vcpu->kvm, 82)) - vcpu->run->kvm_valid_regs |= KVM_SYNC_BPBC; - if (test_kvm_facility(vcpu->kvm, 133)) - vcpu->run->kvm_valid_regs |= KVM_SYNC_GSCB; - if (test_kvm_facility(vcpu->kvm, 156)) - vcpu->run->kvm_valid_regs |= KVM_SYNC_ETOKEN; - /* fprs can be synchronized via vrs, even if the guest has no vx. With - * MACHINE_HAS_VX, (load|store)_fpu_regs() will work with vrs format.
- */ - if (MACHINE_HAS_VX) - vcpu->run->kvm_valid_regs |= KVM_SYNC_VRS; - else - vcpu->run->kvm_valid_regs |= KVM_SYNC_FPRS; - - if (kvm_is_ucontrol(vcpu->kvm)) - return __kvm_ucontrol_vcpu_init(vcpu); - return 0; } @@ -3077,11 +3049,45 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, rc = kvm_vcpu_init(vcpu, kvm, id); if (rc) goto out_free_sie_block; + + vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID; + kvm_clear_async_pf_completion_queue(vcpu); + vcpu->run->kvm_valid_regs = KVM_SYNC_PREFIX | + KVM_SYNC_GPRS | + KVM_SYNC_ACRS | + KVM_SYNC_CRS | + KVM_SYNC_ARCH0 | + KVM_SYNC_PFAULT; + kvm_s390_set_prefix(vcpu, 0); + if (test_kvm_facility(vcpu->kvm, 64)) + vcpu->run->kvm_valid_regs |= KVM_SYNC_RICCB; + if (test_kvm_facility(vcpu->kvm, 82)) + vcpu->run->kvm_valid_regs |= KVM_SYNC_BPBC; + if (test_kvm_facility(vcpu->kvm, 133)) + vcpu->run->kvm_valid_regs |= KVM_SYNC_GSCB; + if (test_kvm_facility(vcpu->kvm, 156)) + vcpu->run->kvm_valid_regs |= KVM_SYNC_ETOKEN; + /* fprs can be synchronized via vrs, even if the guest has no vx. With + * MACHINE_HAS_VX, (load|store)_fpu_regs() will work with vrs format. + */ + if (MACHINE_HAS_VX) + vcpu->run->kvm_valid_regs |= KVM_SYNC_VRS; + else + vcpu->run->kvm_valid_regs |= KVM_SYNC_FPRS; + + if (kvm_is_ucontrol(vcpu->kvm)) { + rc = __kvm_ucontrol_vcpu_init(vcpu); + if (rc) + goto out_uninit_vcpu; + } + VM_EVENT(kvm, 3, "create cpu %d at 0x%pK, sie block at 0x%pK", id, vcpu, vcpu->arch.sie_block); trace_kvm_s390_create_vcpu(id, vcpu, vcpu->arch.sie_block); return vcpu; +out_uninit_vcpu: + kvm_vcpu_uninit(vcpu); out_free_sie_block: free_page((unsigned long)(vcpu->arch.sie_block)); out_free_cpu: From a2017f17fa175b812ce7de302316f67e8f2b7db0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:11 -0800 Subject: [PATCH 119/204] KVM: s390: Invoke kvm_vcpu_init() before allocating sie_page Now that s390's implementation of kvm_arch_vcpu_init() is empty, move the call to kvm_vcpu_init() above the allocation of the sie_page. This paves the way for moving vcpu allocation and initialization into common KVM code without any associated functional change. 
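One detail worth noting (a condensed sketch of the diff below, not literal code): because kvm_vcpu_init() now precedes the sie_page allocation, the error-unwind labels swap order so that teardown stays last-in, first-out:

	rc = kvm_vcpu_init(vcpu, kvm, id);
	if (rc)
		goto out_free_cpu;

	sie_page = (struct sie_page *) get_zeroed_page(GFP_KERNEL);
	if (!sie_page)
		goto out_uninit_vcpu;	/* undo kvm_vcpu_init() first */
	/* ... */
out_free_sie_block:
	free_page((unsigned long)(vcpu->arch.sie_block));
out_uninit_vcpu:
	kvm_vcpu_uninit(vcpu);
out_free_cpu:
	kmem_cache_free(kvm_vcpu_cache, vcpu);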
Signed-off-by: Sean Christopherson Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/s390/kvm/kvm-s390.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 0049b621e56aa..1f8ba074cbd60 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -3027,10 +3027,16 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, if (!vcpu) goto out; + rc = kvm_vcpu_init(vcpu, kvm, id); + if (rc) + goto out_free_cpu; + + rc = -ENOMEM; + BUILD_BUG_ON(sizeof(struct sie_page) != 4096); sie_page = (struct sie_page *) get_zeroed_page(GFP_KERNEL); if (!sie_page) - goto out_free_cpu; + goto out_uninit_vcpu; vcpu->arch.sie_block = &sie_page->sie_block; vcpu->arch.sie_block->itdba = (unsigned long) &sie_page->itdb; @@ -3046,10 +3052,6 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, vcpu->arch.sie_block->gd |= GISA_FORMAT1; seqcount_init(&vcpu->arch.cputm_seqcount); - rc = kvm_vcpu_init(vcpu, kvm, id); - if (rc) - goto out_free_sie_block; - vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID; kvm_clear_async_pf_completion_queue(vcpu); vcpu->run->kvm_valid_regs = KVM_SYNC_PREFIX | @@ -3078,7 +3080,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, if (kvm_is_ucontrol(vcpu->kvm)) { rc = __kvm_ucontrol_vcpu_init(vcpu); if (rc) - goto out_uninit_vcpu; + goto out_free_sie_block; } VM_EVENT(kvm, 3, "create cpu %d at 0x%pK, sie block at 0x%pK", id, vcpu, @@ -3086,10 +3088,10 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, trace_kvm_s390_create_vcpu(id, vcpu, vcpu->arch.sie_block); return vcpu; -out_uninit_vcpu: - kvm_vcpu_uninit(vcpu); out_free_sie_block: free_page((unsigned long)(vcpu->arch.sie_block)); +out_uninit_vcpu: + kvm_vcpu_uninit(vcpu); out_free_cpu: kmem_cache_free(kvm_vcpu_cache, vcpu); out: From aaf532c57927982b2523994698f994255a722f5f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:12 -0800 Subject: [PATCH 120/204] KVM: MIPS: Invoke kvm_vcpu_uninit() immediately prior to freeing vcpu Move the call to kvm_vcpu_uninit() in kvm_arch_vcpu_destroy() down a few lines so that it is invoked immediately prior to freeing the vCPU. This paves the way for moving the uninit and free sequence to common KVM code without an associated functional change. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 2e14455aec4ea..73360e021259a 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -416,13 +416,13 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { hrtimer_cancel(&vcpu->arch.comparecount_timer); - kvm_vcpu_uninit(vcpu); - kvm_mips_dump_stats(vcpu); kvm_mmu_free_memory_caches(vcpu); kfree(vcpu->arch.guest_ebase); kfree(vcpu->arch.kseg0_commpage); + + kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, vcpu); } From d2423b347de46657c306d4c58f2e08feba4a19c4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:13 -0800 Subject: [PATCH 121/204] KVM: x86: Invoke kvm_vcpu_uninit() immediately prior to freeing vcpu Move the call to kvm_vcpu_uninit() in kvm_arch_vcpu_destroy() down a few lines so that it is invoked immediately prior to freeing the vCPU. This paves the way for moving the uninit and free sequence to common KVM code without an associated functional change. 
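The resulting teardown (a condensed sketch of the x86.c hunk below) now mirrors creation in reverse, with the common pieces adjacent and ready to be hoisted verbatim into common code:

	void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
	{
		/* arch-specific teardown first ... */
		kvm_x86_ops->vcpu_free(vcpu);
		/* ... cpumask and FPU frees ... */

		/* ... then common uninit immediately before the free. */
		kvm_vcpu_uninit(vcpu);
		kmem_cache_free(kvm_vcpu_cache, vcpu);
	}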
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 661e3c40529f4..335762a17180f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9251,11 +9251,11 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_x86_ops->vcpu_free(vcpu); - kvm_vcpu_uninit(vcpu); - free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); + + kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, vcpu); } From 4543bdc08857e8026475a477e7ba88e461f38271 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:14 -0800 Subject: [PATCH 122/204] KVM: Introduce kvm_vcpu_destroy() Add kvm_vcpu_destroy() and wire up all architectures to call the common function instead of their arch specific implementation. The common destruction function will be used by future patches to move allocation and initialization of vCPUs to common KVM code, i.e. to free resources that are allocated by arch agnostic code. No functional change intended. Acked-by: Christoffer Dall Signed-off-by: Sean Christopherson Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 2 +- arch/powerpc/kvm/powerpc.c | 2 +- arch/s390/kvm/kvm-s390.c | 2 +- arch/x86/kvm/x86.c | 2 +- include/linux/kvm_host.h | 1 + virt/kvm/arm/arm.c | 2 +- virt/kvm/kvm_main.c | 6 ++++++ 7 files changed, 12 insertions(+), 5 deletions(-) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 73360e021259a..8546bc6e09e7f 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -156,7 +156,7 @@ void kvm_mips_free_vcpus(struct kvm *kvm) struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) { - kvm_arch_vcpu_destroy(vcpu); + kvm_vcpu_destroy(vcpu); } mutex_lock(&kvm->lock); diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 998ef60ac4630..e3e2b88d3d8bc 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -475,7 +475,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) #endif kvm_for_each_vcpu(i, vcpu, kvm) - kvm_arch_vcpu_destroy(vcpu); + kvm_vcpu_destroy(vcpu); mutex_lock(&kvm->lock); for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 1f8ba074cbd60..8543d338a06a3 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2541,7 +2541,7 @@ static void kvm_free_vcpus(struct kvm *kvm) struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) - kvm_arch_vcpu_destroy(vcpu); + kvm_vcpu_destroy(vcpu); mutex_lock(&kvm->lock); for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 335762a17180f..42b9149f6b405 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9680,7 +9680,7 @@ static void kvm_free_vcpus(struct kvm *kvm) kvm_unload_vcpu_mmu(vcpu); } kvm_for_each_vcpu(i, vcpu, kvm) - kvm_arch_vcpu_destroy(vcpu); + kvm_vcpu_destroy(vcpu); mutex_lock(&kvm->lock); for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 59ac534233610..432827ab7623b 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -581,6 +581,7 @@ static inline int kvm_vcpu_get_idx(struct kvm_vcpu *vcpu) int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id); void kvm_vcpu_uninit(struct kvm_vcpu *vcpu); +void 
kvm_vcpu_destroy(struct kvm_vcpu *vcpu); void vcpu_load(struct kvm_vcpu *vcpu); void vcpu_put(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index af3ce2bb370d7..0d8fb6973414b 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -194,7 +194,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) for (i = 0; i < KVM_MAX_VCPUS; ++i) { if (kvm->vcpus[i]) { - kvm_arch_vcpu_destroy(kvm->vcpus[i]); + kvm_vcpu_destroy(kvm->vcpus[i]); kvm->vcpus[i] = NULL; } } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 7b52207f829c9..62ba25e441894 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -375,6 +375,12 @@ void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); +void kvm_vcpu_destroy(struct kvm_vcpu *vcpu) +{ + kvm_arch_vcpu_destroy(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_vcpu_destroy); + #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) { From e529ef66e6b53b34f9b8caac55950c8a55c79dac Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:15 -0800 Subject: [PATCH 123/204] KVM: Move vcpu alloc and init invocation to common code Now that all architectures tightly couple vcpu allocation/free with the mandatory calls to kvm_vcpu_{un}init(), move the sequences verbatim to common KVM code. Move both allocation and initialization in a single patch to eliminate thrash in arch specific code. The bisection benefits of moving the two pieces in separate patches are marginal at best, whereas the odds of introducing a transient arch specific bug are non-zero. Acked-by: Christoffer Dall Signed-off-by: Sean Christopherson Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 33 ++++++------------------------ arch/powerpc/kvm/powerpc.c | 27 ++++--------------------- arch/s390/kvm/kvm-s390.c | 41 ++++++++++---------------------------- arch/x86/kvm/x86.c | 28 ++------------------------ include/linux/kvm_host.h | 2 +- virt/kvm/arm/arm.c | 29 ++------------------------- virt/kvm/kvm_main.c | 21 ++++++++++++++++--- 7 files changed, 43 insertions(+), 138 deletions(-) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 8546bc6e09e7f..92c9321b3f959 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -285,25 +285,14 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) return 0; } -struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) +int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) { int err, size; void *gebase, *p, *handler, *refill_start, *refill_end; int i; - struct kvm_vcpu *vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); - - if (!vcpu) { - err = -ENOMEM; - goto out; - } - - err = kvm_vcpu_init(vcpu, kvm, id); - - if (err) - goto out_free_cpu; - - kvm_debug("kvm @ %p: create cpu %d at %p\n", kvm, id, vcpu); + kvm_debug("kvm @ %p: create cpu %d at %p\n", + vcpu->kvm, vcpu->vcpu_id, vcpu); /* * Allocate space for host mode exception handlers that handle @@ -318,7 +307,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) if (!gebase) { err = -ENOMEM; - goto out_uninit_cpu; + goto out; } kvm_debug("Allocated %d bytes for KVM Exception Handlers @ %p\n", ALIGN(size, PAGE_SIZE), gebase); @@ -397,19 +386,12 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) vcpu->arch.last_sched_cpu = -1; vcpu->arch.last_exec_cpu = -1; - return vcpu; + return 0; out_free_gebase: kfree(gebase); -
kvm_vcpu_uninit(vcpu); - -out_free_cpu: - kmem_cache_free(kvm_vcpu_cache, vcpu); - out: - return ERR_PTR(err); + return err; } void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) @@ -421,9 +403,6 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_mmu_free_memory_caches(vcpu); kfree(vcpu->arch.guest_ebase); kfree(vcpu->arch.kseg0_commpage); - - kvm_vcpu_uninit(vcpu); - kmem_cache_free(kvm_vcpu_cache, vcpu); } int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index e3e2b88d3d8bc..fce1b4776e55f 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -725,32 +725,17 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) return 0; } -struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) +int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu; int err; - vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); - if (!vcpu) - return ERR_PTR(-ENOMEM); - - err = kvm_vcpu_init(vcpu, kvm, id); - if (err) - goto free_vcpu; - err = kvmppc_core_vcpu_create(vcpu); if (err) - goto uninit_vcpu; + return err; vcpu->arch.wqp = &vcpu->wq; - kvmppc_create_vcpu_debugfs(vcpu, id); - return vcpu; - -uninit_vcpu: - kvm_vcpu_uninit(vcpu); -free_vcpu: - kmem_cache_free(kvm_vcpu_cache, vcpu); - return ERR_PTR(err); + kvmppc_create_vcpu_debugfs(vcpu, vcpu->vcpu_id); + return 0; } void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) @@ -780,10 +765,6 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) } kvmppc_core_vcpu_free(vcpu); - - kvm_vcpu_uninit(vcpu); - - kmem_cache_free(kvm_vcpu_cache, vcpu); } int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 8543d338a06a3..9cba1e5d033bd 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2530,9 +2530,6 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) if (vcpu->kvm->arch.use_cmma) kvm_s390_vcpu_unsetup_cmma(vcpu); free_page((unsigned long)(vcpu->arch.sie_block)); - - kvm_vcpu_uninit(vcpu); - kmem_cache_free(kvm_vcpu_cache, vcpu); } static void kvm_free_vcpus(struct kvm *kvm) @@ -3014,29 +3011,15 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) return 0; } -struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, - unsigned int id) +int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu; struct sie_page *sie_page; int rc; - rc = -ENOMEM; - - vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); - if (!vcpu) - goto out; - - rc = kvm_vcpu_init(vcpu, kvm, id); - if (rc) - goto out_free_cpu; - - rc = -ENOMEM; - BUILD_BUG_ON(sizeof(struct sie_page) != 4096); sie_page = (struct sie_page *) get_zeroed_page(GFP_KERNEL); if (!sie_page) - goto out_uninit_vcpu; + return -ENOMEM; vcpu->arch.sie_block = &sie_page->sie_block; vcpu->arch.sie_block->itdba = (unsigned long) &sie_page->itdb; @@ -3045,9 +3028,9 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, vcpu->arch.sie_block->mso = 0; vcpu->arch.sie_block->msl = sclp.hamax; - vcpu->arch.sie_block->icpua = id; + vcpu->arch.sie_block->icpua = vcpu->vcpu_id; spin_lock_init(&vcpu->arch.local_int.lock); - vcpu->arch.sie_block->gd = (u32)(u64)kvm->arch.gisa_int.origin; + vcpu->arch.sie_block->gd = (u32)(u64)vcpu->kvm->arch.gisa_int.origin; if (vcpu->arch.sie_block->gd && sclp.has_gisaf) vcpu->arch.sie_block->gd |= GISA_FORMAT1; seqcount_init(&vcpu->arch.cputm_seqcount); @@ -3083,19 +3066,15 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, goto 
out_free_sie_block; } - VM_EVENT(kvm, 3, "create cpu %d at 0x%pK, sie block at 0x%pK", id, vcpu, - vcpu->arch.sie_block); - trace_kvm_s390_create_vcpu(id, vcpu, vcpu->arch.sie_block); + VM_EVENT(vcpu->kvm, 3, "create cpu %d at 0x%pK, sie block at 0x%pK", + vcpu->vcpu_id, vcpu, vcpu->arch.sie_block); + trace_kvm_s390_create_vcpu(vcpu->vcpu_id, vcpu, vcpu->arch.sie_block); + + return 0; - return vcpu; out_free_sie_block: free_page((unsigned long)(vcpu->arch.sie_block)); -out_uninit_vcpu: - kvm_vcpu_uninit(vcpu); -out_free_cpu: - kmem_cache_free(kvm_vcpu_cache, vcpu); -out: - return ERR_PTR(rc); + return rc; } int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 42b9149f6b405..7bbde6f658bfd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9179,30 +9179,9 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) return 0; } -struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, - unsigned int id) +int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu; - int r; - - vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); - if (!vcpu) - return ERR_PTR(-ENOMEM); - - r = kvm_vcpu_init(vcpu, kvm, id); - if (r) - goto free_vcpu; - - r = kvm_x86_ops->vcpu_create(vcpu); - if (r) - goto uninit_vcpu; - return vcpu; - -uninit_vcpu: - kvm_vcpu_uninit(vcpu); -free_vcpu: - kmem_cache_free(kvm_vcpu_cache, vcpu); - return ERR_PTR(r); + return kvm_x86_ops->vcpu_create(vcpu); } int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) @@ -9254,9 +9233,6 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); - - kvm_vcpu_uninit(vcpu); - kmem_cache_free(kvm_vcpu_cache, vcpu); } void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 432827ab7623b..405ea07068f1f 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -876,7 +876,7 @@ void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu); void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu); void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu); int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id); -struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id); +int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu); int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 0d8fb6973414b..a7d661fc56837 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -290,32 +290,9 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) return 0; } -struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) +int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) { - int err; - struct kvm_vcpu *vcpu; - - vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); - if (!vcpu) { - err = -ENOMEM; - goto out; - } - - err = kvm_vcpu_init(vcpu, kvm, id); - if (err) - goto free_vcpu; - - err = create_hyp_mappings(vcpu, vcpu + 1, PAGE_HYP); - if (err) - goto vcpu_uninit; - - return vcpu; -vcpu_uninit: - kvm_vcpu_uninit(vcpu); -free_vcpu: - kmem_cache_free(kvm_vcpu_cache, vcpu); -out: - return ERR_PTR(err); + return create_hyp_mappings(vcpu, vcpu + 1, PAGE_HYP); } void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) @@ -330,8 +307,6 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) 
kvm_mmu_free_memory_caches(vcpu); kvm_timer_vcpu_terminate(vcpu); kvm_pmu_vcpu_destroy(vcpu); - kvm_vcpu_uninit(vcpu); - kmem_cache_free(kvm_vcpu_cache, vcpu); } int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 62ba25e441894..c84df40518c43 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -378,6 +378,9 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); void kvm_vcpu_destroy(struct kvm_vcpu *vcpu) { kvm_arch_vcpu_destroy(vcpu); + + kvm_vcpu_uninit(vcpu); + kmem_cache_free(kvm_vcpu_cache, vcpu); } EXPORT_SYMBOL_GPL(kvm_vcpu_destroy); @@ -2738,12 +2741,20 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) if (r) goto vcpu_decrement; - vcpu = kvm_arch_vcpu_create(kvm, id); - if (IS_ERR(vcpu)) { - r = PTR_ERR(vcpu); + vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); + if (!vcpu) { + r = -ENOMEM; goto vcpu_decrement; } + r = kvm_vcpu_init(vcpu, kvm, id); + if (r) + goto vcpu_free; + + r = kvm_arch_vcpu_create(vcpu); + if (r) + goto vcpu_uninit; + preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); r = kvm_arch_vcpu_setup(vcpu); @@ -2787,6 +2798,10 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) debugfs_remove_recursive(vcpu->debugfs_dentry); vcpu_destroy: kvm_arch_vcpu_destroy(vcpu); +vcpu_uninit: + kvm_vcpu_uninit(vcpu); +vcpu_free: + kmem_cache_free(kvm_vcpu_cache, vcpu); vcpu_decrement: mutex_lock(&kvm->lock); kvm->created_vcpus--; From aaba298c6bca8d8625880a8016e5b80adc8a11af Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:16 -0800 Subject: [PATCH 124/204] KVM: Unexport kvm_vcpu_cache and kvm_vcpu_{un}init() Unexport kvm_vcpu_cache and kvm_vcpu_{un}init() and make them static now that they are referenced only in kvm_main.c. 
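In effect (condensed from the kvm_main.c hunks below), the helpers become private to kvm_main.c:

	static struct kmem_cache *kvm_vcpu_cache;

	static int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm,
				 unsigned id);
	static void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);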
Acked-by: Christoffer Dall Signed-off-by: Sean Christopherson Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 4 ---- virt/kvm/kvm_main.c | 9 +++------ 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 405ea07068f1f..521f17cd2b260 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -157,8 +157,6 @@ static inline bool is_error_page(struct page *page) #define KVM_USERSPACE_IRQ_SOURCE_ID 0 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 -extern struct kmem_cache *kvm_vcpu_cache; - extern struct mutex kvm_lock; extern struct list_head vm_list; @@ -579,8 +577,6 @@ static inline int kvm_vcpu_get_idx(struct kvm_vcpu *vcpu) memslot < slots->memslots + KVM_MEM_SLOTS_NUM && memslot->npages;\ memslot++) -int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id); -void kvm_vcpu_uninit(struct kvm_vcpu *vcpu); void kvm_vcpu_destroy(struct kvm_vcpu *vcpu); void vcpu_load(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index c84df40518c43..44b4e86646435 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -104,8 +104,7 @@ static cpumask_var_t cpus_hardware_enabled; static int kvm_usage_count; static atomic_t hardware_enable_failed; -struct kmem_cache *kvm_vcpu_cache; -EXPORT_SYMBOL_GPL(kvm_vcpu_cache); +static struct kmem_cache *kvm_vcpu_cache; static __read_mostly struct preempt_ops kvm_preempt_ops; @@ -322,7 +321,7 @@ void kvm_reload_remote_mmus(struct kvm *kvm) kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); } -int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) +static int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) { struct page *page; int r; @@ -360,9 +359,8 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) fail: return r; } -EXPORT_SYMBOL_GPL(kvm_vcpu_init); -void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) +static void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) { /* * no need for rcu_read_lock as VCPU_RUN is the only place that @@ -373,7 +371,6 @@ void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) kvm_arch_vcpu_uninit(vcpu); free_page((unsigned long)vcpu->run); } -EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); void kvm_vcpu_destroy(struct kvm_vcpu *vcpu) { From d5c48debcc4fbdd09e3e0871980b7ebf06ec6e07 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:17 -0800 Subject: [PATCH 125/204] KVM: Move initialization of preempt notifier to kvm_vcpu_init() Initialize the preempt notifier immediately in kvm_vcpu_init() to pave the way for removing kvm_arch_vcpu_setup(), i.e. to allow arch specific code to call vcpu_load() during kvm_arch_vcpu_create(). Back when preemption support was added, the location of the call to init the preempt notifier was perfectly sane. The overall vCPU creation flow featured a single arch specific hook and the preempt notifier was used immediately after its initialization (by vcpu_load()). E.g.: vcpu = kvm_arch_ops->vcpu_create(kvm, n); if (IS_ERR(vcpu)) return PTR_ERR(vcpu); preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); vcpu_load(vcpu); r = kvm_mmu_setup(vcpu); vcpu_put(vcpu); if (r < 0) goto free_vcpu; Today, the call to preempt_notifier_init() is sandwiched between two arch specific calls, kvm_arch_vcpu_create() and kvm_arch_vcpu_setup(), which needlessly forces x86 (and possibly others?) to split its vCPU creation flow.
Init the preempt notifier prior to any arch specific call so that each arch can independently decide how best to organize its creation flow. Acked-by: Christoffer Dall Signed-off-by: Sean Christopherson Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 44b4e86646435..1ddb6d4cfbfd0 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -348,6 +348,7 @@ static int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) kvm_vcpu_set_dy_eligible(vcpu, false); vcpu->preempted = false; vcpu->ready = false; + preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); r = kvm_arch_vcpu_init(vcpu); if (r < 0) @@ -2752,8 +2753,6 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) if (r) goto vcpu_uninit; - preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); - r = kvm_arch_vcpu_setup(vcpu); if (r) goto vcpu_destroy; From 5f73db112e597b30efb7f81ab5fee87a9febad3e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:18 -0800 Subject: [PATCH 126/204] KVM: x86: Move guts of kvm_arch_vcpu_setup() into kvm_arch_vcpu_create() Fold setup() into create() now that the two are called back-to-back by common KVM code. This paves the way for removing kvm_arch_vcpu_setup(). Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7bbde6f658bfd..e4a4469903069 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9181,11 +9181,12 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) { - return kvm_x86_ops->vcpu_create(vcpu); -} + int ret; + + ret = kvm_x86_ops->vcpu_create(vcpu); + if (ret) + return ret; -int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) -{ vcpu->arch.arch_capabilities = kvm_get_arch_capabilities(); vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT; kvm_vcpu_mtrr_init(vcpu); @@ -9196,6 +9197,11 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) return 0; } +int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) +{ + return 0; +} + void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { struct msr_data msr; From 5259878432098ffd26cef7294b0a85ab5cfaf556 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:19 -0800 Subject: [PATCH 127/204] KVM: MIPS: Move .vcpu_setup() call to kvm_arch_vcpu_create() Fold setup() into create() now that the two are called back-to-back by common KVM code. This paves the way for removing kvm_arch_vcpu_setup(). Note, there is no unwind function associated with kvm_arch_vcpu_setup(), i.e. no teardown path that also needs to be moved. No functional change intended. 
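In outline, the MIPS fold looks like this (condensed from the mips.c hunk below; the surrounding creation work is elided):

	int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
	{
		int err;

		/* ... existing creation work ... */

		/* Initial guest state; formerly kvm_arch_vcpu_setup(). */
		err = kvm_mips_callbacks->vcpu_setup(vcpu);
		if (err)
			goto out_free_commpage;

		return 0;
	}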
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 92c9321b3f959..b3a4435af66b1 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -386,8 +386,15 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.last_sched_cpu = -1; vcpu->arch.last_exec_cpu = -1; + /* Initial guest state */ + err = kvm_mips_callbacks->vcpu_setup(vcpu); + if (err) + goto out_free_commpage; + return 0; +out_free_commpage: + kfree(vcpu->arch.kseg0_commpage); out_free_gebase: kfree(gebase); out: @@ -1237,10 +1244,9 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, return 0; } -/* Initial guest state */ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) { - return kvm_mips_callbacks->vcpu_setup(vcpu); + return 0; } static void kvm_mips_set_c0_status(void) From ff72bb55cbfd060172cfbafafe4838ce92ab080f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:20 -0800 Subject: [PATCH 128/204] KVM: s390: Manually invoke vcpu setup during kvm_arch_vcpu_create() Rename kvm_arch_vcpu_setup() to kvm_s390_vcpu_setup() and manually call the new function during kvm_arch_vcpu_create(). Define an empty kvm_arch_vcpu_setup() as it's still required for compilation. This is effectively a nop as kvm_arch_vcpu_create() and kvm_arch_vcpu_setup() are called back-to-back by common KVM code. Obsoleting kvm_arch_vcpu_setup() paves the way for its removal. Note, gmap_remove() is now called if setup fails, as s390 was previously freeing it via kvm_arch_vcpu_destroy(), which is called by common KVM code if kvm_arch_vcpu_setup() fails. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/s390/kvm/kvm-s390.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 9cba1e5d033bd..dca3d6aac2bb4 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2932,6 +2932,11 @@ static void kvm_s390_vcpu_setup_model(struct kvm_vcpu *vcpu) } int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) +{ + return 0; +} + +static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu) { int rc = 0; @@ -3070,8 +3075,14 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->vcpu_id, vcpu, vcpu->arch.sie_block); trace_kvm_s390_create_vcpu(vcpu->vcpu_id, vcpu, vcpu->arch.sie_block); + rc = kvm_s390_vcpu_setup(vcpu); + if (rc) + goto out_ucontrol_uninit; return 0; +out_ucontrol_uninit: + if (kvm_is_ucontrol(vcpu->kvm)) + gmap_remove(vcpu->arch.gmap); out_free_sie_block: free_page((unsigned long)(vcpu->arch.sie_block)); return rc; From b3d42c9862e0e2a2d95839d91120ddf8dd8a8af6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:21 -0800 Subject: [PATCH 129/204] KVM: PPC: BookE: Setup vcpu during kvmppc_core_vcpu_create() Fold setup() into create() now that the two are called back-to-back by common KVM code. This paves the way for removing kvm_arch_vcpu_setup(). Note, BookE directly implements kvm_arch_vcpu_setup() and PPC's common kvm_arch_vcpu_create() is responsible for its own cleanup, thus the only cleanup required when directly invoking kvmppc_core_vcpu_setup() is to call .vcpu_free(), which is the BookE specific portion of PPC's kvm_arch_vcpu_destroy() by way of kvmppc_core_vcpu_free(). No functional change intended. 
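A condensed view of the resulting BookE error handling (illustrative, matching the booke.c hunk below):

	int kvmppc_core_vcpu_create(struct kvm_vcpu *vcpu)
	{
		int r;

		r = vcpu->kvm->arch.kvm_ops->vcpu_create(vcpu);
		if (r)
			return r;

		/* ... former kvm_arch_vcpu_setup() body ... */

		r = kvmppc_core_vcpu_setup(vcpu);
		if (r)
			/* Undo only the BookE part; common unwind is the
			 * caller's job. */
			vcpu->kvm->arch.kvm_ops->vcpu_free(vcpu);
		kvmppc_sanity_check(vcpu);
		return r;
	}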
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/booke.c | 60 ++++++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 27 deletions(-) diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index d41765157f0e2..e291c62187fe3 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -1377,34 +1377,9 @@ static void kvmppc_set_tsr(struct kvm_vcpu *vcpu, u32 new_tsr) update_timer_ints(vcpu); } -/* Initial guest state: 16MB mapping 0 -> 0, PC = 0, MSR = 0, R1 = 16MB */ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) { - int i; - int r; - - vcpu->arch.regs.nip = 0; - vcpu->arch.shared->pir = vcpu->vcpu_id; - kvmppc_set_gpr(vcpu, 1, (16<<20) - 8); /* -8 for the callee-save LR slot */ - kvmppc_set_msr(vcpu, 0); - -#ifndef CONFIG_KVM_BOOKE_HV - vcpu->arch.shadow_msr = MSR_USER | MSR_IS | MSR_DS; - vcpu->arch.shadow_pid = 1; - vcpu->arch.shared->msr = 0; -#endif - - /* Eye-catching numbers so we know if the guest takes an interrupt - * before it's programmed its own IVPR/IVORs. */ - vcpu->arch.ivpr = 0x55550000; - for (i = 0; i < BOOKE_IRQPRIO_MAX; i++) - vcpu->arch.ivor[i] = 0x7700 | i * 4; - - kvmppc_init_timing_stats(vcpu); - - r = kvmppc_core_vcpu_setup(vcpu); - kvmppc_sanity_check(vcpu); - return r; + return 0; } int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu) @@ -2116,7 +2091,38 @@ int kvmppc_core_init_vm(struct kvm *kvm) int kvmppc_core_vcpu_create(struct kvm_vcpu *vcpu) { - return vcpu->kvm->arch.kvm_ops->vcpu_create(vcpu); + int i; + int r; + + r = vcpu->kvm->arch.kvm_ops->vcpu_create(vcpu); + if (r) + return r; + + /* Initial guest state: 16MB mapping 0 -> 0, PC = 0, MSR = 0, R1 = 16MB */ + vcpu->arch.regs.nip = 0; + vcpu->arch.shared->pir = vcpu->vcpu_id; + kvmppc_set_gpr(vcpu, 1, (16<<20) - 8); /* -8 for the callee-save LR slot */ + kvmppc_set_msr(vcpu, 0); + +#ifndef CONFIG_KVM_BOOKE_HV + vcpu->arch.shadow_msr = MSR_USER | MSR_IS | MSR_DS; + vcpu->arch.shadow_pid = 1; + vcpu->arch.shared->msr = 0; +#endif + + /* Eye-catching numbers so we know if the guest takes an interrupt + * before it's programmed its own IVPR/IVORs. */ + vcpu->arch.ivpr = 0x55550000; + for (i = 0; i < BOOKE_IRQPRIO_MAX; i++) + vcpu->arch.ivor[i] = 0x7700 | i * 4; + + kvmppc_init_timing_stats(vcpu); + + r = kvmppc_core_vcpu_setup(vcpu); + if (r) + vcpu->kvm->arch.kvm_ops->vcpu_free(vcpu); + kvmppc_sanity_check(vcpu); + return r; } void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) From afede96df55e9cba948c8cc8a682e962244285b4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:22 -0800 Subject: [PATCH 130/204] KVM: Drop kvm_arch_vcpu_setup() Remove kvm_arch_vcpu_setup() now that all arch specific implementations are nops. 
Acked-by: Christoffer Dall Signed-off-by: Sean Christopherson Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/arm/kvm/guest.c | 5 ----- arch/arm64/kvm/guest.c | 5 ----- arch/mips/kvm/mips.c | 5 ----- arch/powerpc/kvm/book3s.c | 5 ----- arch/powerpc/kvm/booke.c | 5 ----- arch/s390/kvm/kvm-s390.c | 5 ----- arch/x86/kvm/x86.c | 5 ----- include/linux/kvm_host.h | 1 - virt/kvm/kvm_main.c | 5 ----- 9 files changed, 41 deletions(-) diff --git a/arch/arm/kvm/guest.c b/arch/arm/kvm/guest.c index 0e6f23504c26b..9f7ae0d8690f9 100644 --- a/arch/arm/kvm/guest.c +++ b/arch/arm/kvm/guest.c @@ -34,11 +34,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { NULL } }; -int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) -{ - return 0; -} - static u64 core_reg_offset_from_id(u64 id) { return id & ~(KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK | KVM_REG_ARM_CORE); diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 2fff06114a8f5..2bd92301d32f4 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -47,11 +47,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { NULL } }; -int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) -{ - return 0; -} - static bool core_reg_offset_is_vreg(u64 off) { return off >= KVM_REG_ARM_CORE_REG(fp_regs.vregs) && diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index b3a4435af66b1..06366e2415a6a 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1244,11 +1244,6 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, return 0; } -int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) -{ - return 0; -} - static void kvm_mips_set_c0_status(void) { u32 status = read_c0_status(); diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 3f7adcb0ff63c..d07a8e12fa156 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -471,11 +471,6 @@ int kvmppc_load_last_inst(struct kvm_vcpu *vcpu, } EXPORT_SYMBOL_GPL(kvmppc_load_last_inst); -int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) -{ - return 0; -} - int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu) { return 0; diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index e291c62187fe3..9cb8257b41187 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -1377,11 +1377,6 @@ static void kvmppc_set_tsr(struct kvm_vcpu *vcpu, u32 new_tsr) update_timer_ints(vcpu); } -int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) -{ - return 0; -} - int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu) { /* setup watchdog timer once */ diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index dca3d6aac2bb4..a1bb47c7ba1e3 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2931,11 +2931,6 @@ static void kvm_s390_vcpu_setup_model(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->fac = (u32)(u64) model->fac_list; } -int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) -{ - return 0; -} - static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu) { int rc = 0; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e4a4469903069..827d5fcba7a5c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9197,11 +9197,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) return 0; } -int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) -{ - return 0; -} - void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { struct msr_data msr; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 521f17cd2b260..87ca40f62b06a 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -873,7 +873,6 @@ void 
kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu); void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu); int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id); int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu); -int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1ddb6d4cfbfd0..8e9d24442d207 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2753,10 +2753,6 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) if (r) goto vcpu_uninit; - r = kvm_arch_vcpu_setup(vcpu); - if (r) - goto vcpu_destroy; - kvm_create_vcpu_debugfs(vcpu); mutex_lock(&kvm->lock); @@ -2792,7 +2788,6 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) unlock_vcpu_destroy: mutex_unlock(&kvm->lock); debugfs_remove_recursive(vcpu->debugfs_dentry); -vcpu_destroy: kvm_arch_vcpu_destroy(vcpu); vcpu_uninit: kvm_vcpu_uninit(vcpu); From 95a0d01eef7a1b97358c25d335c4a28f91345cf9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:23 -0800 Subject: [PATCH 131/204] KVM: x86: Move all vcpu init code into kvm_arch_vcpu_create() Fold init() into create() now that the two are called back-to-back by common KVM code (kvm_vcpu_init() calls kvm_arch_vcpu_init() as its last action, and kvm_vm_ioctl_create_vcpu() calls kvm_arch_vcpu_create() immediately thereafter). This paves the way for removing kvm_arch_vcpu_init() entirely. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 198 +++++++++++++++++++++++---------------------- 1 file changed, 100 insertions(+), 98 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 827d5fcba7a5c..4469617adfd07 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9181,11 +9181,78 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) { - int ret; + struct page *page; + int r; - ret = kvm_x86_ops->vcpu_create(vcpu); - if (ret) - return ret; + vcpu->arch.emulate_ctxt.ops = &emulate_ops; + if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + else + vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; + + kvm_set_tsc_khz(vcpu, max_tsc_khz); + + r = kvm_mmu_create(vcpu); + if (r < 0) + return r; + + if (irqchip_in_kernel(vcpu->kvm)) { + vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu->kvm); + r = kvm_create_lapic(vcpu, lapic_timer_advance_ns); + if (r < 0) + goto fail_mmu_destroy; + } else + static_key_slow_inc(&kvm_no_apic_vcpu); + + r = -ENOMEM; + + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) + goto fail_free_lapic; + vcpu->arch.pio_data = page_address(page); + + vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, + GFP_KERNEL_ACCOUNT); + if (!vcpu->arch.mce_banks) + goto fail_free_pio_data; + vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; + + if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, + GFP_KERNEL_ACCOUNT)) + goto fail_free_mce_banks; + + vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, + GFP_KERNEL_ACCOUNT); + if (!vcpu->arch.user_fpu) { + pr_err("kvm: failed to allocate userspace's fpu\n"); + goto free_wbinvd_dirty_mask; + } + + vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, + GFP_KERNEL_ACCOUNT); + if (!vcpu->arch.guest_fpu) { + pr_err("kvm: failed to allocate vcpu's fpu\n"); + goto free_user_fpu; + } + fx_init(vcpu); 
+ + vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; + + vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); + + vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; + + kvm_async_pf_hash_reset(vcpu); + kvm_pmu_init(vcpu); + + vcpu->arch.pending_external_vector = -1; + vcpu->arch.preempted_in_kernel = false; + + kvm_hv_vcpu_init(vcpu); + + r = kvm_x86_ops->vcpu_create(vcpu); + if (r) + goto free_guest_fpu; vcpu->arch.arch_capabilities = kvm_get_arch_capabilities(); vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT; @@ -9195,6 +9262,22 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) kvm_init_mmu(vcpu, false); vcpu_put(vcpu); return 0; + +free_guest_fpu: + kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); +free_user_fpu: + kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); +free_wbinvd_dirty_mask: + free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); +fail_free_mce_banks: + kfree(vcpu->arch.mce_banks); +fail_free_pio_data: + free_page((unsigned long)vcpu->arch.pio_data); +fail_free_lapic: + kvm_free_lapic(vcpu); +fail_mmu_destroy: + kvm_mmu_destroy(vcpu); + return r; } void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) @@ -9227,6 +9310,8 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { + int idx; + kvmclock_reset(vcpu); kvm_x86_ops->vcpu_free(vcpu); @@ -9234,6 +9319,17 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); + + kvm_hv_vcpu_uninit(vcpu); + kvm_pmu_destroy(vcpu); + kfree(vcpu->arch.mce_banks); + kvm_free_lapic(vcpu); + idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_mmu_destroy(vcpu); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + free_page((unsigned long)vcpu->arch.pio_data); + if (!lapic_in_kernel(vcpu)) + static_key_slow_dec(&kvm_no_apic_vcpu); } void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -9481,106 +9577,12 @@ EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu); int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { - struct page *page; - int r; - - vcpu->arch.emulate_ctxt.ops = &emulate_ops; - if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; - else - vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; - - kvm_set_tsc_khz(vcpu, max_tsc_khz); - - r = kvm_mmu_create(vcpu); - if (r < 0) - return r; - - if (irqchip_in_kernel(vcpu->kvm)) { - vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu->kvm); - r = kvm_create_lapic(vcpu, lapic_timer_advance_ns); - if (r < 0) - goto fail_mmu_destroy; - } else - static_key_slow_inc(&kvm_no_apic_vcpu); - - r = -ENOMEM; - - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) - goto fail_free_lapic; - vcpu->arch.pio_data = page_address(page); - - vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, - GFP_KERNEL_ACCOUNT); - if (!vcpu->arch.mce_banks) - goto fail_free_pio_data; - vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; - - if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, - GFP_KERNEL_ACCOUNT)) - goto fail_free_mce_banks; - - vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, - GFP_KERNEL_ACCOUNT); - if (!vcpu->arch.user_fpu) { - pr_err("kvm: failed to allocate userspace's fpu\n"); - goto free_wbinvd_dirty_mask; - } - - vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, - GFP_KERNEL_ACCOUNT); - if (!vcpu->arch.guest_fpu) { - pr_err("kvm: failed to allocate vcpu's fpu\n"); - goto free_user_fpu; - } - fx_init(vcpu); - 
- vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; - - vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); - - vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; - - kvm_async_pf_hash_reset(vcpu); - kvm_pmu_init(vcpu); - - vcpu->arch.pending_external_vector = -1; - vcpu->arch.preempted_in_kernel = false; - - kvm_hv_vcpu_init(vcpu); - return 0; - -free_user_fpu: - kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); -free_wbinvd_dirty_mask: - free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); -fail_free_mce_banks: - kfree(vcpu->arch.mce_banks); -fail_free_pio_data: - free_page((unsigned long)vcpu->arch.pio_data); -fail_free_lapic: - kvm_free_lapic(vcpu); -fail_mmu_destroy: - kvm_mmu_destroy(vcpu); - return r; } void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) { - int idx; - kvm_hv_vcpu_uninit(vcpu); - kvm_pmu_destroy(vcpu); - kfree(vcpu->arch.mce_banks); - kvm_free_lapic(vcpu); - idx = srcu_read_lock(&vcpu->kvm->srcu); - kvm_mmu_destroy(vcpu); - srcu_read_unlock(&vcpu->kvm->srcu, idx); - free_page((unsigned long)vcpu->arch.pio_data); - if (!lapic_in_kernel(vcpu)) - static_key_slow_dec(&kvm_no_apic_vcpu); } void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) From d11dfed5d700b8973d5742300e04b2aaa9d11217 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:24 -0800 Subject: [PATCH 132/204] KVM: MIPS: Move all vcpu init code into kvm_arch_vcpu_create() Fold init() into create() now that the two are called back-to-back by common KVM code (kvm_vcpu_init() calls kvm_arch_vcpu_init() as its last action, and kvm_vm_ioctl_create_vcpu() calls kvm_arch_vcpu_create() immediately thereafter). Rinse and repeat for kvm_arch_vcpu_uninit() and kvm_arch_vcpu_destroy(). This paves the way for removing kvm_arch_vcpu_{un}init() entirely. No functional change intended. 
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 06366e2415a6a..879a7cbd5b54c 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -294,6 +294,14 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) kvm_debug("kvm @ %p: create cpu %d at %p\n", vcpu->kvm, vcpu->vcpu_id, vcpu); + err = kvm_mips_callbacks->vcpu_init(vcpu); + if (err) + return err; + + hrtimer_init(&vcpu->arch.comparecount_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + vcpu->arch.comparecount_timer.function = kvm_mips_comparecount_wakeup; + /* * Allocate space for host mode exception handlers that handle * guest mode exits @@ -307,7 +315,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) if (!gebase) { err = -ENOMEM; - goto out; + goto out_uninit_vcpu; } kvm_debug("Allocated %d bytes for KVM Exception Handlers @ %p\n", ALIGN(size, PAGE_SIZE), gebase); @@ -397,7 +405,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) kfree(vcpu->arch.kseg0_commpage); out_free_gebase: kfree(gebase); -out: +out_uninit_vcpu: + kvm_mips_callbacks->vcpu_uninit(vcpu); return err; } @@ -410,6 +419,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_mmu_free_memory_caches(vcpu); kfree(vcpu->arch.guest_ebase); kfree(vcpu->arch.kseg0_commpage); + + kvm_mips_callbacks->vcpu_uninit(vcpu); } int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, @@ -1221,21 +1232,12 @@ static enum hrtimer_restart kvm_mips_comparecount_wakeup(struct hrtimer *timer) int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { - int err; - - err = kvm_mips_callbacks->vcpu_init(vcpu); - if (err) - return err; - - hrtimer_init(&vcpu->arch.comparecount_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - vcpu->arch.comparecount_timer.function = kvm_mips_comparecount_wakeup; return 0; } void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) { - kvm_mips_callbacks->vcpu_uninit(vcpu); + } int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, From 39a93a8794719a37a37140460865c4edaff795a6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:25 -0800 Subject: [PATCH 133/204] KVM: ARM: Move all vcpu init code into kvm_arch_vcpu_create() Fold init() into create() now that the two are called back-to-back by common KVM code (kvm_vcpu_init() calls kvm_arch_vcpu_init() as its last action, and kvm_vm_ioctl_create_vcpu() calls kvm_arch_vcpu_create() immediately thereafter). This paves the way for removing kvm_arch_vcpu_{un}init() entirely. Note, there is no associated unwinding in kvm_arch_vcpu_uninit() that needs to be relocated (to kvm_arch_vcpu_destroy()). No functional change intended. 
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- virt/kvm/arm/arm.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index a7d661fc56837..94616725b97e7 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -292,6 +292,25 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) { + int err; + + /* Force users to call KVM_ARM_VCPU_INIT */ + vcpu->arch.target = -1; + bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES); + + /* Set up the timer */ + kvm_timer_vcpu_init(vcpu); + + kvm_pmu_vcpu_init(vcpu); + + kvm_arm_reset_debug_ptr(vcpu); + + kvm_arm_pvtime_vcpu_init(&vcpu->arch); + + err = kvm_vgic_vcpu_init(vcpu); + if (err) + return err; + return create_hyp_mappings(vcpu, vcpu + 1, PAGE_HYP); } @@ -341,20 +360,7 @@ void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { - /* Force users to call KVM_ARM_VCPU_INIT */ - vcpu->arch.target = -1; - bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES); - - /* Set up the timer */ - kvm_timer_vcpu_init(vcpu); - - kvm_pmu_vcpu_init(vcpu); - - kvm_arm_reset_debug_ptr(vcpu); - - kvm_arm_pvtime_vcpu_init(&vcpu->arch); - - return kvm_vgic_vcpu_init(vcpu); + return 0; } void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) From 74ce2e60d4874fc2464e321af1397c6fae984ec9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:26 -0800 Subject: [PATCH 134/204] KVM: PPC: Move all vcpu init code into kvm_arch_vcpu_create() Fold init() into create() now that the two are called back-to-back by common KVM code (kvm_vcpu_init() calls kvm_arch_vcpu_init() as its last action, and kvm_vm_ioctl_create_vcpu() calls kvm_arch_vcpu_create() immediately thereafter). Rinse and repeat for kvm_arch_vcpu_uninit() and kvm_arch_vcpu_destroy(). This paves the way for removing kvm_arch_vcpu_{un}init() entirely. Note, calling kvmppc_mmu_destroy() if kvmppc_core_vcpu_create() fails may or may not be necessary. Move it along with the more obvious call to kvmppc_subarch_vcpu_uninit() so as not to inadvertently introduce a functional change and/or bug. No functional change intended. 
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/powerpc.c | 56 ++++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 24 deletions(-) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index fce1b4776e55f..4fbf8690b8c5e 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -725,17 +725,43 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) return 0; } +static enum hrtimer_restart kvmppc_decrementer_wakeup(struct hrtimer *timer) +{ + struct kvm_vcpu *vcpu; + + vcpu = container_of(timer, struct kvm_vcpu, arch.dec_timer); + kvmppc_decrementer_func(vcpu); + + return HRTIMER_NORESTART; +} + int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) { int err; - err = kvmppc_core_vcpu_create(vcpu); + hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); + vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup; + vcpu->arch.dec_expires = get_tb(); + +#ifdef CONFIG_KVM_EXIT_TIMING + mutex_init(&vcpu->arch.exit_timing_lock); +#endif + err = kvmppc_subarch_vcpu_init(vcpu); if (err) return err; + err = kvmppc_core_vcpu_create(vcpu); + if (err) + goto out_vcpu_uninit; + vcpu->arch.wqp = &vcpu->wq; kvmppc_create_vcpu_debugfs(vcpu, vcpu->vcpu_id); return 0; + +out_vcpu_uninit: + kvmppc_mmu_destroy(vcpu); + kvmppc_subarch_vcpu_uninit(vcpu); + return err; } void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) @@ -765,6 +791,9 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) } kvmppc_core_vcpu_free(vcpu); + + kvmppc_mmu_destroy(vcpu); + kvmppc_subarch_vcpu_uninit(vcpu); } int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) @@ -772,35 +801,14 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) return kvmppc_core_pending_dec(vcpu); } -static enum hrtimer_restart kvmppc_decrementer_wakeup(struct hrtimer *timer) -{ - struct kvm_vcpu *vcpu; - - vcpu = container_of(timer, struct kvm_vcpu, arch.dec_timer); - kvmppc_decrementer_func(vcpu); - - return HRTIMER_NORESTART; -} - int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { - int ret; - - hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); - vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup; - vcpu->arch.dec_expires = get_tb(); - -#ifdef CONFIG_KVM_EXIT_TIMING - mutex_init(&vcpu->arch.exit_timing_lock); -#endif - ret = kvmppc_subarch_vcpu_init(vcpu); - return ret; + return 0; } void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) { - kvmppc_mmu_destroy(vcpu); - kvmppc_subarch_vcpu_uninit(vcpu); + } void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) From 19bcc89eb8a9fa1d4be4bff5b5e7917cb8bbc1f7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:27 -0800 Subject: [PATCH 135/204] KVM: arm64: Free sve_state via arm specific hook Add an arm specific hook to free the arm64-only sve_state. Doing so eliminates the last functional code from kvm_arch_vcpu_uninit() across all architectures and paves the way for removing kvm_arch_vcpu_init() and kvm_arch_vcpu_uninit() entirely. 
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/arm/include/asm/kvm_host.h | 1 + arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/kvm/reset.c | 5 +++++ virt/kvm/arm/arm.c | 2 ++ 4 files changed, 9 insertions(+) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 556cd818eccf3..de81dd897a30c 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -366,6 +366,7 @@ static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {} +static inline void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu) {} static inline void kvm_arm_init_debug(void) {} static inline void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) {} diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index c61260cf63c58..6402b2de1844d 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -53,6 +53,7 @@ int kvm_arm_init_sve(void); int __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); +void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu); int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext); void __extended_idmap_trampoline(phys_addr_t boot_pgd, phys_addr_t idmap_start); diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index f4a8ae9188275..ff3512a0ca979 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -205,6 +205,11 @@ bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu) } void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) +{ + +} + +void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu) { kfree(vcpu->arch.sve_state); } diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 94616725b97e7..937b4c7fb5be9 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -326,6 +326,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_mmu_free_memory_caches(vcpu); kvm_timer_vcpu_terminate(vcpu); kvm_pmu_vcpu_destroy(vcpu); + + kvm_arm_vcpu_destroy(vcpu); } int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) From ddd259c9aaba08244dba8877687ee856f79c4f45 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:28 -0800 Subject: [PATCH 136/204] KVM: Drop kvm_arch_vcpu_init() and kvm_arch_vcpu_uninit() Remove kvm_arch_vcpu_init() and kvm_arch_vcpu_uninit() now that all arch specific implementations are nops. 
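With every arch hook now a nop, the common create/destroy flow reduces to a symmetric pair of arch calls. A simplified sketch of the shape after this patch (allocation flags and error handling trimmed; see the kvm_main.c hunks below for the real code):

    /* creation: common init, then a single arch entry point */
    vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
    r = kvm_vcpu_init(vcpu, kvm, id);       /* common state + vcpu->run page */
    if (!r)
            r = kvm_arch_vcpu_create(vcpu); /* all arch-specific init */

    /* teardown mirrors creation */
    kvm_arch_vcpu_destroy(vcpu);
    kvm_vcpu_uninit(vcpu);
    kmem_cache_free(kvm_vcpu_cache, vcpu);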
Acked-by: Christoffer Dall Signed-off-by: Sean Christopherson Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/arm/include/asm/kvm_host.h | 1 - arch/arm64/include/asm/kvm_host.h | 1 - arch/arm64/kvm/reset.c | 5 ----- arch/mips/kvm/mips.c | 10 ---------- arch/powerpc/kvm/powerpc.c | 10 ---------- arch/s390/include/asm/kvm_host.h | 1 - arch/s390/kvm/kvm-s390.c | 5 ----- arch/x86/kvm/x86.c | 10 ---------- include/linux/kvm_host.h | 3 --- virt/kvm/arm/arm.c | 5 ----- virt/kvm/kvm_main.c | 16 ++-------------- 11 files changed, 2 insertions(+), 65 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index de81dd897a30c..e26cad6d11b33 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -363,7 +363,6 @@ struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr); static inline bool kvm_arch_requires_vhe(void) { return false; } static inline void kvm_arch_hardware_unsetup(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} -static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {} static inline void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu) {} diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 6402b2de1844d..8ab62944e5148 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -54,7 +54,6 @@ int kvm_arm_init_sve(void); int __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu); -void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu); int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext); void __extended_idmap_trampoline(phys_addr_t boot_pgd, phys_addr_t idmap_start); diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index ff3512a0ca979..30b7ea680f66c 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -204,11 +204,6 @@ bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu) return true; } -void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) -{ - -} - void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu) { kfree(vcpu->arch.sve_state); diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 879a7cbd5b54c..2606f3f02b542 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1230,16 +1230,6 @@ static enum hrtimer_restart kvm_mips_comparecount_wakeup(struct hrtimer *timer) return kvm_mips_count_timeout(vcpu); } -int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) -{ - return 0; -} - -void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) -{ - -} - int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, struct kvm_translation *tr) { diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 4fbf8690b8c5e..1af96fb5dc6fb 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -801,16 +801,6 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) return kvmppc_core_pending_dec(vcpu); } -int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) -{ - return 0; -} - -void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) -{ - -} - void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { #ifdef CONFIG_BOOKE diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 02f4c21c57f6a..11ecc4071a29f 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -914,7 +914,6 @@ extern int 
kvm_s390_gisc_unregister(struct kvm *kvm, u32 gisc); static inline void kvm_arch_hardware_disable(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} -static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {} diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index a1bb47c7ba1e3..8646c99217f28 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2700,11 +2700,6 @@ static int sca_can_add_vcpu(struct kvm *kvm, unsigned int id) return rc == 0 && id < KVM_S390_ESCA_CPU_SLOTS; } -int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) -{ - return 0; -} - /* needs disabled preemption to protect from TOD sync and vcpu_load/put */ static void __start_cpu_timer_accounting(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4469617adfd07..985066e1bda53 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9575,16 +9575,6 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) struct static_key kvm_no_apic_vcpu __read_mostly; EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu); -int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) -{ - return 0; -} - -void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) -{ - -} - void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 87ca40f62b06a..a654cf6df0789 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -864,9 +864,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run); int kvm_arch_init(void *opaque); void kvm_arch_exit(void); -int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu); -void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu); - void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu); void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu); diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 937b4c7fb5be9..1cfc108eca1ef 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -360,11 +360,6 @@ void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) preempt_enable(); } -int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) -{ - return 0; -} - void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { int *last_ran; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8e9d24442d207..6b496038cd7fa 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -324,7 +324,6 @@ void kvm_reload_remote_mmus(struct kvm *kvm) static int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) { struct page *page; - int r; mutex_init(&vcpu->mutex); vcpu->cpu = -1; @@ -338,10 +337,8 @@ static int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) INIT_LIST_HEAD(&vcpu->blocked_vcpu_list); page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) { - r = -ENOMEM; - goto fail; - } + if (!page) + return -ENOMEM; vcpu->run = page_address(page); kvm_vcpu_set_in_spin_loop(vcpu, false); @@ -350,15 +347,7 @@ static int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) vcpu->ready = false; preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); - r = kvm_arch_vcpu_init(vcpu); - if (r < 0) - goto fail_free_run; return 0; - -fail_free_run: - free_page((unsigned long)vcpu->run); -fail: - return r; } static void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) @@ -369,7 +358,6 @@ static void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) * 
descriptors are already gone. */ put_pid(rcu_dereference_protected(vcpu->pid, 1)); - kvm_arch_vcpu_uninit(vcpu); free_page((unsigned long)vcpu->run); } From 9941d224fb7c765bc20f3cb6c747786adc5cd002 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:29 -0800 Subject: [PATCH 137/204] KVM: Move putting of vcpu->pid to kvm_vcpu_destroy() Move the putting of vcpu->pid to kvm_vcpu_destroy(). vcpu->pid is guaranteed to be NULL when kvm_vcpu_uninit() is called in the error path of kvm_vm_ioctl_create_vcpu(), e.g. it is explicitly nullified by kvm_vcpu_init() and is only changed by KVM_RUN. No functional change intended. Acked-by: Christoffer Dall Signed-off-by: Sean Christopherson Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 6b496038cd7fa..483b6b8e70a1d 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -352,12 +352,6 @@ static int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) static void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) { - /* - * no need for rcu_read_lock as VCPU_RUN is the only place that - * will change the vcpu->pid pointer and on uninit all file - * descriptors are already gone. - */ - put_pid(rcu_dereference_protected(vcpu->pid, 1)); free_page((unsigned long)vcpu->run); } @@ -365,6 +359,13 @@ void kvm_vcpu_destroy(struct kvm_vcpu *vcpu) { kvm_arch_vcpu_destroy(vcpu); + /* + * No need for rcu_read_lock as VCPU_RUN is the only place that changes + * the vcpu->pid pointer, and at destruction time all file descriptors + * are already gone. + */ + put_pid(rcu_dereference_protected(vcpu->pid, 1)); + kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, vcpu); } From 8bd826d629d6ff4a4e647079db15d64d06346004 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:30 -0800 Subject: [PATCH 138/204] KVM: Move vcpu->run page allocation out of kvm_vcpu_init() Open code the allocation and freeing of the vcpu->run page in kvm_vm_ioctl_create_vcpu() and kvm_vcpu_destroy() respectively. Doing so allows kvm_vcpu_init() to be a pure init function and eliminates kvm_vcpu_uninit() entirely. 
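Background on why vcpu->run occupies a dedicated page rather than living inside struct kvm_vcpu: userspace maps that page directly to exchange run-state with the kernel. A minimal userspace sketch (kvm_fd and vm_fd are assumed to be already-open handles to /dev/kvm and a VM; error checks omitted):

    #include <linux/kvm.h>
    #include <sys/ioctl.h>
    #include <sys/mman.h>

    int vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);
    long sz = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
    struct kvm_run *run = mmap(NULL, sz, PROT_READ | PROT_WRITE,
                               MAP_SHARED, vcpu_fd, 0);

    ioctl(vcpu_fd, KVM_RUN, 0);
    /* run->exit_reason now says why the guest exited */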
Signed-off-by: Sean Christopherson Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 34 +++++++++++++--------------------- 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 483b6b8e70a1d..d21cf86176f0b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -321,10 +321,8 @@ void kvm_reload_remote_mmus(struct kvm *kvm) kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); } -static int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) +static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) { - struct page *page; - mutex_init(&vcpu->mutex); vcpu->cpu = -1; vcpu->kvm = kvm; @@ -336,23 +334,11 @@ static int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) vcpu->pre_pcpu = -1; INIT_LIST_HEAD(&vcpu->blocked_vcpu_list); - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) - return -ENOMEM; - vcpu->run = page_address(page); - kvm_vcpu_set_in_spin_loop(vcpu, false); kvm_vcpu_set_dy_eligible(vcpu, false); vcpu->preempted = false; vcpu->ready = false; preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); - - return 0; -} - -static void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) -{ - free_page((unsigned long)vcpu->run); } void kvm_vcpu_destroy(struct kvm_vcpu *vcpu) @@ -366,7 +352,7 @@ void kvm_vcpu_destroy(struct kvm_vcpu *vcpu) */ put_pid(rcu_dereference_protected(vcpu->pid, 1)); - kvm_vcpu_uninit(vcpu); + free_page((unsigned long)vcpu->run); kmem_cache_free(kvm_vcpu_cache, vcpu); } EXPORT_SYMBOL_GPL(kvm_vcpu_destroy); @@ -2711,6 +2697,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) { int r; struct kvm_vcpu *vcpu; + struct page *page; if (id >= KVM_MAX_VCPU_ID) return -EINVAL; @@ -2734,13 +2721,18 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) goto vcpu_decrement; } - r = kvm_vcpu_init(vcpu, kvm, id); - if (r) + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) { + r = -ENOMEM; goto vcpu_free; + } + vcpu->run = page_address(page); + + kvm_vcpu_init(vcpu, kvm, id); r = kvm_arch_vcpu_create(vcpu); if (r) - goto vcpu_uninit; + goto vcpu_free_run_page; kvm_create_vcpu_debugfs(vcpu); @@ -2778,8 +2770,8 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) mutex_unlock(&kvm->lock); debugfs_remove_recursive(vcpu->debugfs_dentry); kvm_arch_vcpu_destroy(vcpu); -vcpu_uninit: - kvm_vcpu_uninit(vcpu); +vcpu_free_run_page: + free_page((unsigned long)vcpu->run); vcpu_free: kmem_cache_free(kvm_vcpu_cache, vcpu); vcpu_decrement: From a47970ed74a535b1accb4bc73643fd5a93993c3e Mon Sep 17 00:00:00 2001 From: John Allen Date: Thu, 19 Dec 2019 14:17:59 -0600 Subject: [PATCH 139/204] kvm/svm: PKU not currently supported The current SVM implementation does not support PKU. Guests running on a host with future AMD CPUs that support the feature will read garbage from the PKRU register and will hit segmentation faults on boot, as memory that should not be protected gets marked protected. Ensure that the cpuid reported by SVM does not advertise the feature. 
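For context, the bit being masked is CPUID.(EAX=7,ECX=0):ECX.PKU (bit 3), with OSPKE (bit 4) reflecting whether the OS has set CR4.PKE. A guest-side probe looks roughly like this (bit positions per the SDM; cpuid_count() as declared in the kernel's <asm/processor.h>):

    unsigned int eax, ebx, ecx, edx;

    cpuid_count(0x7, 0, &eax, &ebx, &ecx, &edx);
    bool has_pku   = ecx & (1u << 3);   /* protection keys available */
    bool has_ospke = ecx & (1u << 4);   /* OS has enabled CR4.PKE    */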
Signed-off-by: John Allen Cc: stable@vger.kernel.org Fixes: 0556cbdc2fbc ("x86/pkeys: Don't check if PKRU is zero before writing it") Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/cpuid.c | 4 +++- arch/x86/kvm/svm.c | 6 ++++++ arch/x86/kvm/vmx/capabilities.h | 5 +++++ arch/x86/kvm/vmx/vmx.c | 1 + 5 files changed, 16 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fff9ed6956b56..49751cbd6e638 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1157,6 +1157,7 @@ struct kvm_x86_ops { bool (*xsaves_supported)(void); bool (*umip_emulated)(void); bool (*pt_supported)(void); + bool (*pku_supported)(void); int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr); void (*request_immediate_exit)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 74a4d9b4e61f5..b1c469446b072 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -353,6 +353,7 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index) unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0; unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0; unsigned f_la57; + unsigned f_pku = kvm_x86_ops->pku_supported() ? F(PKU) : 0; /* cpuid 7.0.ebx */ const u32 kvm_cpuid_7_0_ebx_x86_features = @@ -364,7 +365,7 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index) /* cpuid 7.0.ecx*/ const u32 kvm_cpuid_7_0_ecx_x86_features = - F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) | + F(AVX512VBMI) | F(LA57) | 0 /*PKU*/ | 0 /*OSPKE*/ | F(RDPID) | F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/; @@ -393,6 +394,7 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index) /* Set LA57 based on hardware capability. */ entry->ecx |= f_la57; entry->ecx |= f_umip; + entry->ecx |= f_pku; /* PKU is not yet implemented for shadow paging. 
*/ if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE)) entry->ecx &= ~F(PKU); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 83257a7a2e376..9dbb990c319a7 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -6008,6 +6008,11 @@ static bool svm_has_wbinvd_exit(void) return true; } +static bool svm_pku_supported(void) +{ + return false; +} + #define PRE_EX(exit) { .exit_code = (exit), \ .stage = X86_ICPT_PRE_EXCEPT, } #define POST_EX(exit) { .exit_code = (exit), \ @@ -7351,6 +7356,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .xsaves_supported = svm_xsaves_supported, .umip_emulated = svm_umip_emulated, .pt_supported = svm_pt_supported, + .pku_supported = svm_pku_supported, .set_supported_cpuid = svm_set_supported_cpuid, diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 7aa69716d5160..283bdb7071af6 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -145,6 +145,11 @@ static inline bool vmx_umip_emulated(void) SECONDARY_EXEC_DESC; } +static inline bool vmx_pku_supported(void) +{ + return boot_cpu_has(X86_FEATURE_PKU); +} + static inline bool cpu_has_vmx_rdtscp(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 2134726b04429..5415cd40678c3 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7849,6 +7849,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .xsaves_supported = vmx_xsaves_supported, .umip_emulated = vmx_umip_emulated, .pt_supported = vmx_pt_supported, + .pku_supported = vmx_pku_supported, .request_immediate_exit = vmx_request_immediate_exit, From 3c9053a2cae7ba2ba73766a34cea41baa70f57f7 Mon Sep 17 00:00:00 2001 From: Marios Pomonis Date: Wed, 11 Dec 2019 12:47:41 -0800 Subject: [PATCH 140/204] KVM: x86: Protect x86_decode_insn from Spectre-v1/L1TF attacks This fixes a Spectre-v1/L1TF vulnerability in x86_decode_insn(). kvm_emulate_instruction() (an ancestor of x86_decode_insn()) is an exported symbol, so KVM should treat it conservatively from a security perspective. Fixes: 045a282ca415 ("KVM: emulator: implement fninit, fnstsw, fnstcw") Signed-off-by: Nick Finco Signed-off-by: Marios Pomonis Reviewed-by: Andrew Honig Cc: stable@vger.kernel.org Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index e9833e345a5c5..2d4faefe8dd43 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -5288,10 +5288,15 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) } break; case Escape: - if (ctxt->modrm > 0xbf) - opcode = opcode.u.esc->high[ctxt->modrm - 0xc0]; - else + if (ctxt->modrm > 0xbf) { + size_t size = ARRAY_SIZE(opcode.u.esc->high); + u32 index = array_index_nospec( + ctxt->modrm - 0xc0, size); + + opcode = opcode.u.esc->high[index]; + } else { opcode = opcode.u.esc->op[(ctxt->modrm >> 3) & 7]; + } break; case InstrDual: if ((ctxt->modrm >> 6) == 3) From 8618793750071d66028584a83ed0b4fa7eb4f607 Mon Sep 17 00:00:00 2001 From: Marios Pomonis Date: Wed, 11 Dec 2019 12:47:42 -0800 Subject: [PATCH 141/204] KVM: x86: Protect kvm_hv_msr_[get|set]_crash_data() from Spectre-v1/L1TF attacks This fixes Spectre-v1/L1TF vulnerabilities in kvm_hv_msr_get_crash_data() and kvm_hv_msr_set_crash_data(). These functions contain index computations that use the (attacker-controlled) MSR number. 
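A brief gloss on the pattern, since it repeats throughout the following patches: the bounds check alone is architecturally correct, but the CPU may speculatively execute the indexed load before the branch resolves, leaking table contents via the cache. array_index_nospec(), from <linux/nospec.h>, masks the index branchlessly so that even the speculative load stays below the bound. Schematically (a generic sketch, not the hyperv.c code itself):

    #include <linux/nospec.h>

    if (index >= size)                       /* stops the architectural access */
            return -EINVAL;
    index = array_index_nospec(index, size); /* stops the speculative one */
    val = table[index];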
Fixes: e7d9513b60e8 ("kvm/x86: added hyper-v crash msrs into kvm hyperv context") Signed-off-by: Nick Finco Signed-off-by: Marios Pomonis Reviewed-by: Andrew Honig Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index b255b9e865e51..4df1c965bf1a9 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -810,11 +810,12 @@ static int kvm_hv_msr_get_crash_data(struct kvm_vcpu *vcpu, u32 index, u64 *pdata) { struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; + size_t size = ARRAY_SIZE(hv->hv_crash_param); - if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param))) + if (WARN_ON_ONCE(index >= size)) return -EINVAL; - *pdata = hv->hv_crash_param[index]; + *pdata = hv->hv_crash_param[array_index_nospec(index, size)]; return 0; } @@ -853,11 +854,12 @@ static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu, u32 index, u64 data) { struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; + size_t size = ARRAY_SIZE(hv->hv_crash_param); - if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param))) + if (WARN_ON_ONCE(index >= size)) return -EINVAL; - hv->hv_crash_param[index] = data; + hv->hv_crash_param[array_index_nospec(index, size)] = data; return 0; } From 14e32321f3606e4b0970200b6e5e47ee6f1e6410 Mon Sep 17 00:00:00 2001 From: Marios Pomonis Date: Wed, 11 Dec 2019 12:47:43 -0800 Subject: [PATCH 142/204] KVM: x86: Refactor picdev_write() to prevent Spectre-v1/L1TF attacks This fixes a Spectre-v1/L1TF vulnerability in picdev_write(). It replaces index computations based on the (attacker-controlled) port number with constants through a minor refactoring. Fixes: 85f455f7ddbe ("KVM: Add support for in-kernel PIC emulation") Signed-off-by: Nick Finco Signed-off-by: Marios Pomonis Reviewed-by: Andrew Honig Cc: stable@vger.kernel.org Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/i8259.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 8b38bb4868a65..629a09ca9860d 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -460,10 +460,14 @@ static int picdev_write(struct kvm_pic *s, switch (addr) { case 0x20: case 0x21: + pic_lock(s); + pic_ioport_write(&s->pics[0], addr, data); + pic_unlock(s); + break; case 0xa0: case 0xa1: pic_lock(s); - pic_ioport_write(&s->pics[addr >> 7], addr, data); + pic_ioport_write(&s->pics[1], addr, data); pic_unlock(s); break; case 0x4d0: From 8c86405f606ca8508b8d9280680166ca26723695 Mon Sep 17 00:00:00 2001 From: Marios Pomonis Date: Wed, 11 Dec 2019 12:47:44 -0800 Subject: [PATCH 143/204] KVM: x86: Protect ioapic_read_indirect() from Spectre-v1/L1TF attacks This fixes a Spectre-v1/L1TF vulnerability in ioapic_read_indirect(). This function contains index computations based on the (attacker-controlled) IOREGSEL register. 
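As background on why IOREGSEL is attacker-controlled: the IOAPIC exposes its registers through a two-register indirect window, so the guest first stores an arbitrary register index in IOREGSEL (offset 0x00) and then accesses that register through IOWIN (offset 0x10). A sketch of the guest-side access pattern that feeds the emulation being hardened (ioapic_base is the guest's MMIO mapping, typically at 0xfec00000):

    u32 low, high;

    /* redirection entry N is register pair 0x10 + 2*N (low dword)
     * and 0x11 + 2*N (high dword) */
    writel(0x10 + 2 * pin, ioapic_base + 0x00);  /* IOREGSEL */
    low  = readl(ioapic_base + 0x10);            /* IOWIN    */
    writel(0x11 + 2 * pin, ioapic_base + 0x00);
    high = readl(ioapic_base + 0x10);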
Fixes: a2c118bfab8b ("KVM: Fix bounds checking in ioapic indirect register reads (CVE-2013-1798)") Signed-off-by: Nick Finco Signed-off-by: Marios Pomonis Reviewed-by: Andrew Honig Cc: stable@vger.kernel.org Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/ioapic.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 7312aab33298e..c5776febb517c 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -68,13 +69,14 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, default: { u32 redir_index = (ioapic->ioregsel - 0x10) >> 1; - u64 redir_content; + u64 redir_content = ~0ULL; - if (redir_index < IOAPIC_NUM_PINS) - redir_content = - ioapic->redirtbl[redir_index].bits; - else - redir_content = ~0ULL; + if (redir_index < IOAPIC_NUM_PINS) { + u32 index = array_index_nospec( + redir_index, IOAPIC_NUM_PINS); + + redir_content = ioapic->redirtbl[index].bits; + } result = (ioapic->ioregsel & 0x1) ? (redir_content >> 32) & 0xffffffff : From 670564559ca35b439c8d8861fc399451ddf95137 Mon Sep 17 00:00:00 2001 From: Marios Pomonis Date: Wed, 11 Dec 2019 12:47:45 -0800 Subject: [PATCH 144/204] KVM: x86: Protect ioapic_write_indirect() from Spectre-v1/L1TF attacks This fixes a Spectre-v1/L1TF vulnerability in ioapic_write_indirect(). This function contains index computations based on the (attacker-controlled) IOREGSEL register. This patch depends on patch "KVM: x86: Protect ioapic_read_indirect() from Spectre-v1/L1TF attacks". Fixes: 70f93dae32ac ("KVM: Use temporary variable to shorten lines.") Signed-off-by: Nick Finco Signed-off-by: Marios Pomonis Reviewed-by: Andrew Honig Cc: stable@vger.kernel.org Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/ioapic.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index c5776febb517c..26aa22cb9b29d 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -297,6 +297,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) if (index >= IOAPIC_NUM_PINS) return; + index = array_index_nospec(index, IOAPIC_NUM_PINS); e = &ioapic->redirtbl[index]; mask_before = e->fields.mask; /* Preserve read-only fields */ From 4bf79cb089f6b1c6c632492c0271054ce52ad766 Mon Sep 17 00:00:00 2001 From: Marios Pomonis Date: Wed, 11 Dec 2019 12:47:46 -0800 Subject: [PATCH 145/204] KVM: x86: Protect kvm_lapic_reg_write() from Spectre-v1/L1TF attacks This fixes a Spectre-v1/L1TF vulnerability in kvm_lapic_reg_write(). This function contains index computations based on the (attacker-controlled) MSR number. 
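The (reg - APIC_LVTT) >> 4 computation being hardened works because the LVT registers sit at consecutive 0x10-spaced offsets in the APIC register page, so the resulting index spans 0..5 and matches ARRAY_SIZE(apic_lvt_mask). For reference (offsets per the SDM):

    #define APIC_LVTT       0x320   /* index 0: timer         */
    #define APIC_LVTTHMR    0x330   /* index 1: thermal       */
    #define APIC_LVTPC      0x340   /* index 2: perf counters */
    #define APIC_LVT0       0x350   /* index 3: LINT0         */
    #define APIC_LVT1       0x360   /* index 4: LINT1         */
    #define APIC_LVTERR     0x370   /* index 5: error         */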
Fixes: 0105d1a52640 ("KVM: x2apic interface to lapic") Signed-off-by: Nick Finco Signed-off-by: Marios Pomonis Reviewed-by: Andrew Honig Cc: stable@vger.kernel.org Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 88c3c0c6d1e34..865edce27a6aa 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1960,15 +1960,20 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_LVTTHMR: case APIC_LVTPC: case APIC_LVT1: - case APIC_LVTERR: + case APIC_LVTERR: { /* TODO: Check vector */ + size_t size; + u32 index; + if (!kvm_apic_sw_enabled(apic)) val |= APIC_LVT_MASKED; - - val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4]; + size = ARRAY_SIZE(apic_lvt_mask); + index = array_index_nospec( + (reg - APIC_LVTT) >> 4, size); + val &= apic_lvt_mask[index]; kvm_lapic_set_reg(apic, reg, val); - break; + } case APIC_LVTT: if (!kvm_apic_sw_enabled(apic)) From 25a5edea71b7c154b6a0b8cec14c711cafa31d26 Mon Sep 17 00:00:00 2001 From: Marios Pomonis Date: Wed, 11 Dec 2019 12:47:47 -0800 Subject: [PATCH 146/204] KVM: x86: Protect MSR-based index computations in fixed_msr_to_seg_unit() from Spectre-v1/L1TF attacks This fixes a Spectre-v1/L1TF vulnerability in fixed_msr_to_seg_unit(). This function contains index computations based on the (attacker-controlled) MSR number. Fixes: de9aef5e1ad6 ("KVM: MTRR: introduce fixed_mtrr_segment table") Signed-off-by: Nick Finco Signed-off-by: Marios Pomonis Reviewed-by: Andrew Honig Cc: stable@vger.kernel.org Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mtrr.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index 25ce3edd1872c..7f0059aa30e16 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -192,11 +192,15 @@ static bool fixed_msr_to_seg_unit(u32 msr, int *seg, int *unit) break; case MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000: *seg = 1; - *unit = msr - MSR_MTRRfix16K_80000; + *unit = array_index_nospec( + msr - MSR_MTRRfix16K_80000, + MSR_MTRRfix16K_A0000 - MSR_MTRRfix16K_80000 + 1); break; case MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000: *seg = 2; - *unit = msr - MSR_MTRRfix4K_C0000; + *unit = array_index_nospec( + msr - MSR_MTRRfix4K_C0000, + MSR_MTRRfix4K_F8000 - MSR_MTRRfix4K_C0000 + 1); break; default: return false; From 13c5183a4e643cc2b03a22d0e582c8e17bb7457d Mon Sep 17 00:00:00 2001 From: Marios Pomonis Date: Wed, 11 Dec 2019 12:47:48 -0800 Subject: [PATCH 147/204] KVM: x86: Protect MSR-based index computations in pmu.h from Spectre-v1/L1TF attacks This fixes a Spectre-v1/L1TF vulnerability in the get_gp_pmc() and get_fixed_pmc() functions. They both contain index computations based on the (attacker-controlled) MSR number. 
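For orientation, get_gp_pmc() and get_fixed_pmc() translate a raw MSR number into a counter array slot relative to a base, and are called with different bases depending on which MSR family the guest touched. The bases involved (values as in the kernel's msr-index.h):

    #define MSR_IA32_PERFCTR0        0x000000c1  /* GP counter values  */
    #define MSR_P6_EVNTSEL0          0x00000186  /* GP event selects   */
    #define MSR_CORE_PERF_FIXED_CTR0 0x00000309  /* fixed-purpose ctrs */

    /* e.g. a guest WRMSR to 0x186 + i resolves to gp_counters[i] via
     * get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0), so msr - base is exactly
     * the index that needs clamping */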
Fixes: 25462f7f5295 ("KVM: x86/vPMU: Define kvm_pmu_ops to support vPMU function dispatch") Signed-off-by: Nick Finco Signed-off-by: Marios Pomonis Reviewed-by: Andrew Honig Cc: stable@vger.kernel.org Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/pmu.h | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 7ebb62326c142..13332984b6d59 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -2,6 +2,8 @@ #ifndef __KVM_X86_PMU_H #define __KVM_X86_PMU_H +#include + #define vcpu_to_pmu(vcpu) (&(vcpu)->arch.pmu) #define pmu_to_vcpu(pmu) (container_of((pmu), struct kvm_vcpu, arch.pmu)) #define pmc_to_pmu(pmc) (&(pmc)->vcpu->arch.pmu) @@ -102,8 +104,12 @@ static inline bool kvm_valid_perf_global_ctrl(struct kvm_pmu *pmu, static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr, u32 base) { - if (msr >= base && msr < base + pmu->nr_arch_gp_counters) - return &pmu->gp_counters[msr - base]; + if (msr >= base && msr < base + pmu->nr_arch_gp_counters) { + u32 index = array_index_nospec(msr - base, + pmu->nr_arch_gp_counters); + + return &pmu->gp_counters[index]; + } return NULL; } @@ -113,8 +119,12 @@ static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr) { int base = MSR_CORE_PERF_FIXED_CTR0; - if (msr >= base && msr < base + pmu->nr_arch_fixed_counters) - return &pmu->fixed_counters[msr - base]; + if (msr >= base && msr < base + pmu->nr_arch_fixed_counters) { + u32 index = array_index_nospec(msr - base, + pmu->nr_arch_fixed_counters); + + return &pmu->fixed_counters[index]; + } return NULL; } From 6ec4c5eee1750d5d17951c4e1960d953376a0dda Mon Sep 17 00:00:00 2001 From: Marios Pomonis Date: Wed, 11 Dec 2019 12:47:49 -0800 Subject: [PATCH 148/204] KVM: x86: Protect MSR-based index computations from Spectre-v1/L1TF attacks in x86.c This fixes a Spectre-v1/L1TF vulnerability in set_msr_mce() and get_msr_mce(). Both functions contain index computations based on the (attacker-controlled) MSR number. 
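The bound passed to array_index_nospec() below is clearer with the MCE MSR layout in mind: each bank owns four consecutive MSRs, so MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL is simply 4 * bank_num, the size of the mce_banks array. From the x86 MSR definitions (values reproduced for reference):

    #define MSR_IA32_MC0_CTL        0x00000400
    #define MSR_IA32_MC0_STATUS     0x00000401
    #define MSR_IA32_MC0_ADDR       0x00000402
    #define MSR_IA32_MC0_MISC       0x00000403
    #define MSR_IA32_MCx_CTL(x)     (MSR_IA32_MC0_CTL + 4*(x))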
Fixes: 890ca9aefa78 ("KVM: Add MCE support") Signed-off-by: Nick Finco Signed-off-by: Marios Pomonis Reviewed-by: Andrew Honig Cc: stable@vger.kernel.org Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 985066e1bda53..913e55f6dca35 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2545,7 +2545,10 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) default: if (msr >= MSR_IA32_MC0_CTL && msr < MSR_IA32_MCx_CTL(bank_num)) { - u32 offset = msr - MSR_IA32_MC0_CTL; + u32 offset = array_index_nospec( + msr - MSR_IA32_MC0_CTL, + MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL); + /* only 0 or all 1s can be written to IA32_MCi_CTL * some Linux kernels though clear bit 10 in bank 4 to * workaround a BIOS/GART TBL issue on AMD K8s, ignore @@ -2986,7 +2989,10 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) default: if (msr >= MSR_IA32_MC0_CTL && msr < MSR_IA32_MCx_CTL(bank_num)) { - u32 offset = msr - MSR_IA32_MC0_CTL; + u32 offset = array_index_nospec( + msr - MSR_IA32_MC0_CTL, + MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL); + data = vcpu->arch.mce_banks[offset]; break; } From 125ffc5e0a56a3eded608dc51e09d5ebf72cf652 Mon Sep 17 00:00:00 2001 From: Marios Pomonis Date: Wed, 11 Dec 2019 12:47:50 -0800 Subject: [PATCH 149/204] KVM: x86: Refactor prefix decoding to prevent Spectre-v1/L1TF attacks This fixes Spectre-v1/L1TF vulnerabilities in vmx_read_guest_seg_selector(), vmx_read_guest_seg_base(), vmx_read_guest_seg_limit() and vmx_read_guest_seg_ar(). When invoked from emulation, these functions contain index computations based on the (attacker-influenced) segment value. Using constants prevents the attack. Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 2d4faefe8dd43..20c0cbdff1be6 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -5195,16 +5195,28 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) ctxt->ad_bytes = def_ad_bytes ^ 6; break; case 0x26: /* ES override */ + has_seg_override = true; + ctxt->seg_override = VCPU_SREG_ES; + break; case 0x2e: /* CS override */ + has_seg_override = true; + ctxt->seg_override = VCPU_SREG_CS; + break; case 0x36: /* SS override */ + has_seg_override = true; + ctxt->seg_override = VCPU_SREG_SS; + break; case 0x3e: /* DS override */ has_seg_override = true; - ctxt->seg_override = (ctxt->b >> 3) & 3; + ctxt->seg_override = VCPU_SREG_DS; break; case 0x64: /* FS override */ + has_seg_override = true; + ctxt->seg_override = VCPU_SREG_FS; + break; case 0x65: /* GS override */ has_seg_override = true; - ctxt->seg_override = ctxt->b & 7; + ctxt->seg_override = VCPU_SREG_GS; break; case 0x40 ... 0x4f: /* REX */ if (mode != X86EMUL_MODE_PROT64) From c926f2f7230b1a29e31914b51db680f8cbf3103f Mon Sep 17 00:00:00 2001 From: Marios Pomonis Date: Wed, 11 Dec 2019 12:47:51 -0800 Subject: [PATCH 150/204] KVM: x86: Protect exit_reason from being used in Spectre-v1/L1TF attacks This fixes a Spectre-v1/L1TF vulnerability in vmx_handle_exit(). While exit_reason is set by the hardware and therefore should not be attacker-influenced, an unknown exit_reason could potentially be used to perform such an attack. 
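One detail worth calling out in the rewrite below: the array_index_nospec() clamp is applied before the NULL test on kvm_vmx_exit_handlers[exit_reason], because that test is itself an array load that could execute speculatively. Reduced to its essentials, the hardened dispatch is:

    if (exit_reason >= kvm_vmx_max_exit_handlers)
            goto unexpected_vmexit;
    exit_reason = array_index_nospec(exit_reason,
                                     kvm_vmx_max_exit_handlers);
    if (!kvm_vmx_exit_handlers[exit_reason])    /* the table has holes */
            goto unexpected_vmexit;
    return kvm_vmx_exit_handlers[exit_reason](vcpu);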
Fixes: 55d2375e58a6 ("KVM: nVMX: Move nested code to dedicated files") Signed-off-by: Marios Pomonis Signed-off-by: Nick Finco Suggested-by: Sean Christopherson Reviewed-by: Andrew Honig Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 57 ++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 25 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 5415cd40678c3..62fb639895c2c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5913,34 +5913,41 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, if (exit_fastpath == EXIT_FASTPATH_SKIP_EMUL_INS) { kvm_skip_emulated_instruction(vcpu); return 1; - } else if (exit_reason < kvm_vmx_max_exit_handlers && kvm_vmx_exit_handlers[exit_reason]) { + } + + if (exit_reason >= kvm_vmx_max_exit_handlers) + goto unexpected_vmexit; #ifdef CONFIG_RETPOLINE - if (exit_reason == EXIT_REASON_MSR_WRITE) - return kvm_emulate_wrmsr(vcpu); - else if (exit_reason == EXIT_REASON_PREEMPTION_TIMER) - return handle_preemption_timer(vcpu); - else if (exit_reason == EXIT_REASON_INTERRUPT_WINDOW) - return handle_interrupt_window(vcpu); - else if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) - return handle_external_interrupt(vcpu); - else if (exit_reason == EXIT_REASON_HLT) - return kvm_emulate_halt(vcpu); - else if (exit_reason == EXIT_REASON_EPT_MISCONFIG) - return handle_ept_misconfig(vcpu); + if (exit_reason == EXIT_REASON_MSR_WRITE) + return kvm_emulate_wrmsr(vcpu); + else if (exit_reason == EXIT_REASON_PREEMPTION_TIMER) + return handle_preemption_timer(vcpu); + else if (exit_reason == EXIT_REASON_INTERRUPT_WINDOW) + return handle_interrupt_window(vcpu); + else if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) + return handle_external_interrupt(vcpu); + else if (exit_reason == EXIT_REASON_HLT) + return kvm_emulate_halt(vcpu); + else if (exit_reason == EXIT_REASON_EPT_MISCONFIG) + return handle_ept_misconfig(vcpu); #endif - return kvm_vmx_exit_handlers[exit_reason](vcpu); - } else { - vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", - exit_reason); - dump_vmcs(); - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = + + exit_reason = array_index_nospec(exit_reason, + kvm_vmx_max_exit_handlers); + if (!kvm_vmx_exit_handlers[exit_reason]) + goto unexpected_vmexit; + + return kvm_vmx_exit_handlers[exit_reason](vcpu); + +unexpected_vmexit: + vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", exit_reason); + dump_vmcs(); + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; - vcpu->run->internal.ndata = 1; - vcpu->run->internal.data[0] = exit_reason; - return 0; - } + vcpu->run->internal.ndata = 1; + vcpu->run->internal.data[0] = exit_reason; + return 0; } /* From ea740059ecb37807ba47b84b33d1447435a8d868 Mon Sep 17 00:00:00 2001 From: Marios Pomonis Date: Wed, 11 Dec 2019 12:47:52 -0800 Subject: [PATCH 151/204] KVM: x86: Protect DR-based index computations from Spectre-v1/L1TF attacks This fixes a Spectre-v1/L1TF vulnerability in __kvm_set_dr() and kvm_get_dr(). Both kvm_get_dr() and kvm_set_dr() (a wrapper of __kvm_set_dr()) are exported symbols so KVM should treat them conservatively from a security perspective.
Fixes: 020df0794f57 ("KVM: move DR register access handling into generic code") Signed-off-by: Nick Finco Signed-off-by: Marios Pomonis Reviewed-by: Andrew Honig Cc: stable@vger.kernel.org Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 913e55f6dca35..780224e767232 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1063,9 +1063,11 @@ static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu) static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) { + size_t size = ARRAY_SIZE(vcpu->arch.db); + switch (dr) { case 0 ... 3: - vcpu->arch.db[dr] = val; + vcpu->arch.db[array_index_nospec(dr, size)] = val; if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) vcpu->arch.eff_db[dr] = val; break; @@ -1102,9 +1104,11 @@ EXPORT_SYMBOL_GPL(kvm_set_dr); int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) { + size_t size = ARRAY_SIZE(vcpu->arch.db); + switch (dr) { case 0 ... 3: - *val = vcpu->arch.db[dr]; + *val = vcpu->arch.db[array_index_nospec(dr, size)]; break; case 4: /* fall through */ From 66061740f1a487f4ed54fde75e724709f805da53 Mon Sep 17 00:00:00 2001 From: Marios Pomonis Date: Wed, 11 Dec 2019 12:47:53 -0800 Subject: [PATCH 152/204] KVM: x86: Protect pmu_intel.c from Spectre-v1/L1TF attacks This fixes Spectre-v1/L1TF vulnerabilities in intel_find_fixed_event() and intel_rdpmc_ecx_to_pmc(). kvm_rdpmc() (ancestor of intel_find_fixed_event()) and reprogram_fixed_counter() (ancestor of intel_rdpmc_ecx_to_pmc()) are exported symbols so KVM should treat them conservatively from a security perspective. Fixes: 25462f7f5295 ("KVM: x86/vPMU: Define kvm_pmu_ops to support vPMU function dispatch") Signed-off-by: Nick Finco Signed-off-by: Marios Pomonis Reviewed-by: Andrew Honig Cc: stable@vger.kernel.org Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/pmu_intel.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 7023138b1cb00..34a3a17bb6d70 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -86,10 +86,14 @@ static unsigned intel_find_arch_event(struct kvm_pmu *pmu, static unsigned intel_find_fixed_event(int idx) { - if (idx >= ARRAY_SIZE(fixed_pmc_events)) + u32 event; + size_t size = ARRAY_SIZE(fixed_pmc_events); + + if (idx >= size) return PERF_COUNT_HW_MAX; - return intel_arch_events[fixed_pmc_events[idx]].event_type; + event = fixed_pmc_events[array_index_nospec(idx, size)]; + return intel_arch_events[event].event_type; } /* check if a PMC is enabled by comparing it with globl_ctrl bits. */ @@ -130,16 +134,20 @@ static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu, struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); bool fixed = idx & (1u << 30); struct kvm_pmc *counters; + unsigned int num_counters; idx &= ~(3u << 30); - if (!fixed && idx >= pmu->nr_arch_gp_counters) - return NULL; - if (fixed && idx >= pmu->nr_arch_fixed_counters) + if (fixed) { + counters = pmu->fixed_counters; + num_counters = pmu->nr_arch_fixed_counters; + } else { + counters = pmu->gp_counters; + num_counters = pmu->nr_arch_gp_counters; + } + if (idx >= num_counters) return NULL; - counters = fixed ? pmu->fixed_counters : pmu->gp_counters; *mask &= pmu->counter_bitmask[fixed ? 
KVM_PMC_FIXED : KVM_PMC_GP]; - - return &counters[idx]; + return &counters[array_index_nospec(idx, num_counters)]; } static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) From 767b839afa5d62ba9cf859f4e90fef3d4a1780b5 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 18 Jan 2020 10:41:55 +0800 Subject: [PATCH 153/204] KVM: x86: avoid clearing pending exception event twice The exception pending event is cleared by kvm_clear_exception_queue(). We shouldn't clear it again. Signed-off-by: Miaohe Lin Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 780224e767232..2b26400a3410b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9355,7 +9355,6 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.nmi_injected = false; kvm_clear_interrupt_queue(vcpu); kvm_clear_exception_queue(vcpu); - vcpu->arch.exception.pending = false; memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); kvm_update_dr0123(vcpu); From 3ce4dc17e0c1e7280d53abeb85ce851a88868c63 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 18 Jan 2020 10:50:37 +0800 Subject: [PATCH 154/204] KVM: apic: short-circuit kvm_apic_accept_pic_intr() when pic intr is accepted Short-circuit kvm_apic_accept_pic_intr() when a PIC interrupt is accepted; there is no need to proceed further. Also remove the unnecessary local variable r. Signed-off-by: Miaohe Lin Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 865edce27a6aa..286396c0aa7d6 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2375,14 +2375,13 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) { u32 lvt0 = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVT0); - int r = 0; if (!kvm_apic_hw_enabled(vcpu->arch.apic)) - r = 1; + return 1; if ((lvt0 & APIC_LVT_MASKED) == 0 && GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) - r = 1; - return r; + return 1; + return 0; } void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) From 3911b65ee1d3e3fc5e3786b1f309dcad0e33f7fd Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sat, 18 Jan 2020 22:29:46 +0100 Subject: [PATCH 155/204] Revert "KVM: x86: Add a WARN on TIF_NEED_FPU_LOAD in kvm_load_guest_fpu()" This reverts commit 95145c25a78cc0a9d3cbc75708abde432310c5a1. The next few patches will fix the issue so the warning is not needed anymore; revert it separately to simplify application to stable kernels. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2b26400a3410b..603a1f778dbea 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8563,13 +8563,6 @@ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { fpregs_lock(); - /* - * Reloading userspace's FPU is handled by kvm_arch_vcpu_load(), both - * for direct calls from userspace (via vcpu_load()) and if this task - * is preempted (via kvm_sched_in()) between vcpu_load() and now. - */ - WARN_ON_ONCE(test_thread_flag(TIF_NEED_FPU_LOAD)); - copy_fpregs_to_fpstate(vcpu->arch.user_fpu); /* PKRU is separately restored in kvm_x86_ops->run.
*/ __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state, From c9aef3b85f425d1f6635382ec210ee5a7ef55d7d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 17 Jan 2020 11:30:49 -0800 Subject: [PATCH 156/204] KVM: x86: Handle TIF_NEED_FPU_LOAD in kvm_{load,put}_guest_fpu() Handle TIF_NEED_FPU_LOAD similar to how fpu__copy() handles the flag when duplicating FPU state to a new task struct. TIF_NEED_FPU_LOAD can be set any time control is transferred out of KVM, be it voluntarily, e.g. if I/O is triggered during a KVM call to get_user_pages, or involuntarily, e.g. if softirq runs after an IRQ occurs. Therefore, KVM must account for TIF_NEED_FPU_LOAD whenever it is (potentially) accessing CPU FPU state. Fixes: 5f409e20b7945 ("x86/fpu: Defer FPU state load until return to userspace") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 603a1f778dbea..4b1c9d1c37860 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8558,12 +8558,26 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) return 0; } +static void kvm_save_current_fpu(struct fpu *fpu) +{ + /* + * If the target FPU state is not resident in the CPU registers, just + * memcpy() from current, else save CPU state directly to the target. + */ + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + memcpy(&fpu->state, ¤t->thread.fpu.state, + fpu_kernel_xstate_size); + else + copy_fpregs_to_fpstate(fpu); +} + /* Swap (qemu) user FPU context for the guest FPU context. */ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { fpregs_lock(); - copy_fpregs_to_fpstate(vcpu->arch.user_fpu); + kvm_save_current_fpu(vcpu->arch.user_fpu); + /* PKRU is separately restored in kvm_x86_ops->run. */ __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state, ~XFEATURE_MASK_PKRU); @@ -8579,7 +8593,8 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { fpregs_lock(); - copy_fpregs_to_fpstate(vcpu->arch.guest_fpu); + kvm_save_current_fpu(vcpu->arch.guest_fpu); + copy_kernel_to_fpregs(&vcpu->arch.user_fpu->state); fpregs_mark_activate(); From a7baead7e312f5a05381d68585fb6dc68e19e90f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 17 Jan 2020 11:30:50 -0800 Subject: [PATCH 157/204] KVM: x86: Ensure guest's FPU state is loaded when accessing for emulation Lock the FPU regs and reload the current thread's FPU state, which holds the guest's FPU state, to the CPU registers if necessary prior to accessing guest FPU state as part of emulation. kernel_fpu_begin() can be called from softirq context, therefore KVM must ensure softirqs are disabled (locking the FPU regs disables softirqs) when touching CPU FPU state. Note, for all intents and purposes this reverts commit 6ab0b9feb82a7 ("x86,kvm: remove KVM emulator get_fpu / put_fpu"), but at the time it was applied, removing get/put_fpu() was correct. The re-introduction of {get,put}_fpu() is necessitated by the deferring of FPU state load. 
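The invariant established by the helpers added below can be sketched as follows (a simplified reading of the patch, not additional kernel code):

	fpregs_lock();				/* also disables preemption and softirqs */
	fpregs_assert_state_consistent();
	if (test_thread_flag(TIF_NEED_FPU_LOAD))
		switch_fpu_return();		/* fault the guest's state back into the registers */
	/* ... safely access XMM/MMX/x87 registers here ... */
	fpregs_unlock();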
Fixes: 5f409e20b7945 ("x86/fpu: Defer FPU state load until return to userspace") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 20c0cbdff1be6..792ae95880179 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -22,6 +22,7 @@ #include "kvm_cache_regs.h" #include <asm/kvm_emulate.h> #include <linux/stringify.h> +#include <asm/fpu/api.h> #include <asm/debugreg.h> #include <asm/nospec-branch.h> @@ -1075,8 +1076,23 @@ static void fetch_register_operand(struct operand *op) } } +static void emulator_get_fpu(void) +{ + fpregs_lock(); + + fpregs_assert_state_consistent(); + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + switch_fpu_return(); +} + +static void emulator_put_fpu(void) +{ + fpregs_unlock(); +} + static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg) { + emulator_get_fpu(); switch (reg) { case 0: asm("movdqa %%xmm0, %0" : "=m"(*data)); break; case 1: asm("movdqa %%xmm1, %0" : "=m"(*data)); break; @@ -1098,11 +1114,13 @@ static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg) #endif default: BUG(); } + emulator_put_fpu(); } static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg) { + emulator_get_fpu(); switch (reg) { case 0: asm("movdqa %0, %%xmm0" : : "m"(*data)); break; case 1: asm("movdqa %0, %%xmm1" : : "m"(*data)); break; @@ -1124,10 +1142,12 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, #endif default: BUG(); } + emulator_put_fpu(); } static void read_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg) { + emulator_get_fpu(); switch (reg) { case 0: asm("movq %%mm0, %0" : "=m"(*data)); break; case 1: asm("movq %%mm1, %0" : "=m"(*data)); break; case 2: asm("movq %%mm2, %0" : "=m"(*data)); break; case 3: asm("movq %%mm3, %0" : "=m"(*data)); break; case 4: asm("movq %%mm4, %0" : "=m"(*data)); break; case 5: asm("movq %%mm5, %0" : "=m"(*data)); break; case 6: asm("movq %%mm6, %0" : "=m"(*data)); break; case 7: asm("movq %%mm7, %0" : "=m"(*data)); break; default: BUG(); } + emulator_put_fpu(); } static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg) { + emulator_get_fpu(); switch (reg) { case 0: asm("movq %0, %%mm0" : : "m"(*data)); break; case 1: asm("movq %0, %%mm1" : : "m"(*data)); break; case 2: asm("movq %0, %%mm2" : : "m"(*data)); break; case 3: asm("movq %0, %%mm3" : : "m"(*data)); break; case 4: asm("movq %0, %%mm4" : : "m"(*data)); break; case 5: asm("movq %0, %%mm5" : : "m"(*data)); break; case 6: asm("movq %0, %%mm6" : : "m"(*data)); break; case 7: asm("movq %0, %%mm7" : : "m"(*data)); break; default: BUG(); } + emulator_put_fpu(); } static int em_fninit(struct x86_emulate_ctxt *ctxt) @@ -1161,7 +1184,9 @@ static int em_fninit(struct x86_emulate_ctxt *ctxt) if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) return emulate_nm(ctxt); + emulator_get_fpu(); asm volatile("fninit"); + emulator_put_fpu(); return X86EMUL_CONTINUE; } @@ -1172,7 +1197,9 @@ static int em_fnstcw(struct x86_emulate_ctxt *ctxt) if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) return emulate_nm(ctxt); + emulator_get_fpu(); asm volatile("fnstcw %0": "+m"(fcw)); + emulator_put_fpu(); ctxt->dst.val = fcw; @@ -1186,7 +1213,9 @@ static int em_fnstsw(struct x86_emulate_ctxt *ctxt) if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) return emulate_nm(ctxt); + emulator_get_fpu(); asm volatile("fnstsw %0": "+m"(fsw)); + emulator_put_fpu(); ctxt->dst.val = fsw; @@ -4077,8 +4106,12 @@ static int em_fxsave(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc; + emulator_get_fpu(); + rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state)); + emulator_put_fpu(); + if (rc != X86EMUL_CONTINUE) return rc; @@ -4121,6 +4154,8 @@ static int
em_fxrstor(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc; + emulator_get_fpu(); + if (size < __fxstate_size(16)) { rc = fxregs_fixup(&fx_state, size); if (rc != X86EMUL_CONTINUE) @@ -4136,6 +4171,8 @@ static int em_fxrstor(struct x86_emulate_ctxt *ctxt) rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state)); out: + emulator_put_fpu(); + return rc; } @@ -5450,7 +5487,9 @@ static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt) { int rc; + emulator_get_fpu(); rc = asm_safe("fwait"); + emulator_put_fpu(); if (unlikely(rc != X86EMUL_CONTINUE)) return emulate_exception(ctxt, MF_VECTOR, 0, false); From 2620fe268e80d667a94553cd37a94ccaa2cb8c83 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 17 Jan 2020 11:30:51 -0800 Subject: [PATCH 158/204] KVM: x86: Revert "KVM: X86: Fix fpu state crash in kvm guest" Reload the current thread's FPU state, which contains the guest's FPU state, to the CPU registers if necessary during vcpu_enter_guest(). TIF_NEED_FPU_LOAD can be set any time control is transferred out of KVM, e.g. if I/O is triggered during a KVM call to get_user_pages() or if a softirq occurs while KVM is scheduled in. Moving the handling of TIF_NEED_FPU_LOAD from vcpu_enter_guest() to kvm_arch_vcpu_load(), effectively kvm_sched_in(), papered over a bug where kvm_put_guest_fpu() failed to account for TIF_NEED_FPU_LOAD. The easiest way to hit the kvm_put_guest_fpu() bug was to run with involuntary preemption enabled, thus handling TIF_NEED_FPU_LOAD during kvm_sched_in() made the bug go away. But, removing the handling in vcpu_enter_guest() exposed KVM to the rare case of a softirq triggering kernel_fpu_begin() between vcpu_load() and vcpu_enter_guest(). Now that kvm_{load,put}_guest_fpu() correctly handle TIF_NEED_FPU_LOAD, revert the commit to both restore the vcpu_enter_guest() behavior and eliminate the superfluous switch_fpu_return() in kvm_arch_vcpu_load(). Note, leaving the handling in kvm_arch_vcpu_load() isn't wrong per se, but it is unnecessary, and most critically, makes it extremely difficult to find bugs such as the kvm_put_guest_fpu() issue due to shrinking the window where a softirq can corrupt state. A sample trace triggered by warning if TIF_NEED_FPU_LOAD is set while vcpu state is loaded: gcmaes_crypt_by_sg.constprop.12+0x26e/0x660 ? 0xffffffffc024547d ? __qdisc_run+0x83/0x510 ? __dev_queue_xmit+0x45e/0x990 ? ip_finish_output2+0x1a8/0x570 ? fib4_rule_action+0x61/0x70 ? fib4_rule_action+0x70/0x70 ? fib_rules_lookup+0x13f/0x1c0 ? helper_rfc4106_decrypt+0x82/0xa0 ? crypto_aead_decrypt+0x40/0x70 ? crypto_aead_decrypt+0x40/0x70 ? crypto_aead_decrypt+0x40/0x70 ? esp_output_tail+0x8f4/0xa5a [esp4] ? skb_ext_add+0xd3/0x170 ? xfrm_input+0x7a6/0x12c0 ? xfrm4_rcv_encap+0xae/0xd0 ? xfrm4_transport_finish+0x200/0x200 ? udp_queue_rcv_one_skb+0x1ba/0x460 ? udp_unicast_rcv_skb.isra.63+0x72/0x90 ? __udp4_lib_rcv+0x51b/0xb00 ? ip_protocol_deliver_rcu+0xd2/0x1c0 ? ip_local_deliver_finish+0x44/0x50 ? ip_local_deliver+0xe0/0xf0 ? ip_protocol_deliver_rcu+0x1c0/0x1c0 ? ip_rcv+0xbc/0xd0 ? ip_rcv_finish_core.isra.19+0x380/0x380 ? __netif_receive_skb_one_core+0x7e/0x90 ? netif_receive_skb_internal+0x3d/0xb0 ? napi_gro_receive+0xed/0x150 ? 0xffffffffc0243c77 ? net_rx_action+0x149/0x3b0 ? __do_softirq+0xe4/0x2f8 ? handle_irq_event_percpu+0x6a/0x80 ? irq_exit+0xe6/0xf0 ? do_IRQ+0x7f/0xd0 ? common_interrupt+0xf/0xf ? irq_entries_start+0x20/0x660 ? vmx_get_interrupt_shadow+0x2f0/0x710 [kvm_intel] ? kvm_set_msr_common+0xfc7/0x2380 [kvm] ? recalibrate_cpu_khz+0x10/0x10 ?
ktime_get+0x3a/0xa0 ? kvm_arch_vcpu_ioctl_run+0x107/0x560 [kvm] ? kvm_init+0x6bf/0xd00 [kvm] ? __seccomp_filter+0x7a/0x680 ? do_vfs_ioctl+0xa4/0x630 ? security_file_ioctl+0x32/0x50 ? ksys_ioctl+0x60/0x90 ? __x64_sys_ioctl+0x16/0x20 ? do_syscall_64+0x5f/0x1a0 ? entry_SYSCALL_64_after_hwframe+0x44/0xa9 ---[ end trace 9564a1ccad733a90 ]--- This reverts commit e751732486eb3f159089a64d1901992b1357e7cc. Fixes: e751732486eb3 ("KVM: X86: Fix fpu state crash in kvm guest") Reported-by: Derek Yerger Reported-by: kernel@najdan.com Cc: Wanpeng Li Cc: Thomas Lambertz Cc: Rik van Riel Cc: Sebastian Andrzej Siewior Cc: Borislav Petkov Cc: Dave Hansen Cc: Thomas Gleixner Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4b1c9d1c37860..ec8f05defd549 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3528,10 +3528,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_x86_ops->vcpu_load(vcpu, cpu); - fpregs_assert_state_consistent(); - if (test_thread_flag(TIF_NEED_FPU_LOAD)) - switch_fpu_return(); - /* Apply any externally detected TSC adjustments (due to suspend) */ if (unlikely(vcpu->arch.tsc_offset_adjustment)) { adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment); @@ -8285,8 +8281,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) trace_kvm_entry(vcpu->vcpu_id); guest_enter_irqoff(); - /* The preempt notifier should have taken care of the FPU already. */ - WARN_ON_ONCE(test_thread_flag(TIF_NEED_FPU_LOAD)); + fpregs_assert_state_consistent(); + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + switch_fpu_return(); if (unlikely(vcpu->arch.switch_db_regs)) { set_debugreg(0, 7); From c0a21c3f9d9b21a4bd67a46d96eaeb372b0daf20 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 17 Jan 2020 11:30:52 -0800 Subject: [PATCH 159/204] KVM: x86: Remove unused ctxt param from emulator's FPU accessors Remove an unused struct x86_emulate_ctxt * param from low level helpers used to access guest FPU state. The unused param was left behind by commit 6ab0b9feb82a ("x86,kvm: remove KVM emulator get_fpu / put_fpu"). No functional change intended. 
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 792ae95880179..c7a0da45f60ad 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1090,7 +1090,7 @@ static void emulator_put_fpu(void) fpregs_unlock(); } -static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg) +static void read_sse_reg(sse128_t *data, int reg) { emulator_get_fpu(); switch (reg) { @@ -1117,8 +1117,7 @@ static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg) emulator_put_fpu(); } -static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, - int reg) +static void write_sse_reg(sse128_t *data, int reg) { emulator_get_fpu(); switch (reg) { @@ -1145,7 +1144,7 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, emulator_put_fpu(); } -static void read_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg) +static void read_mmx_reg(u64 *data, int reg) { emulator_get_fpu(); switch (reg) { @@ -1162,7 +1161,7 @@ static void read_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg) emulator_put_fpu(); } -static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg) +static void write_mmx_reg(u64 *data, int reg) { emulator_get_fpu(); switch (reg) { @@ -1234,7 +1233,7 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt, op->type = OP_XMM; op->bytes = 16; op->addr.xmm = reg; - read_sse_reg(ctxt, &op->vec_val, reg); + read_sse_reg(&op->vec_val, reg); return; } if (ctxt->d & Mmx) { @@ -1285,7 +1284,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, op->type = OP_XMM; op->bytes = 16; op->addr.xmm = ctxt->modrm_rm; - read_sse_reg(ctxt, &op->vec_val, ctxt->modrm_rm); + read_sse_reg(&op->vec_val, ctxt->modrm_rm); return rc; } if (ctxt->d & Mmx) { @@ -1862,10 +1861,10 @@ static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op) op->bytes * op->count); break; case OP_XMM: - write_sse_reg(ctxt, &op->vec_val, op->addr.xmm); + write_sse_reg(&op->vec_val, op->addr.xmm); break; case OP_MM: - write_mmx_reg(ctxt, &op->mm_val, op->addr.mm); + write_mmx_reg(&op->mm_val, op->addr.mm); break; case OP_NONE: /* no writeback */ @@ -5497,11 +5496,10 @@ static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt, - struct operand *op) +static void fetch_possible_mmx_operand(struct operand *op) { if (op->type == OP_MM) - read_mmx_reg(ctxt, &op->mm_val, op->addr.mm); + read_mmx_reg(&op->mm_val, op->addr.mm); } static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)) @@ -5580,10 +5578,10 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) * Now that we know the fpu is exception safe, we can fetch * operands from it. 
*/ - fetch_possible_mmx_operand(ctxt, &ctxt->src); - fetch_possible_mmx_operand(ctxt, &ctxt->src2); + fetch_possible_mmx_operand(&ctxt->src); + fetch_possible_mmx_operand(&ctxt->src2); if (!(ctxt->d & Mov)) - fetch_possible_mmx_operand(ctxt, &ctxt->dst); + fetch_possible_mmx_operand(&ctxt->dst); } if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && ctxt->intercept) { From d196842150e046a523f75785438137a9243c9627 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 13 Dec 2019 16:33:58 -0800 Subject: [PATCH 160/204] KVM: nVMX: WARN on failure to set IA32_PERF_GLOBAL_CTRL Writes to MSR_CORE_PERF_GLOBAL_CONTROL should never fail if the VM-exit and VM-entry controls are exposed to L1. Promote the checks to perform a full WARN if kvm_set_msr() fails and remove the now unused macro SET_MSR_OR_WARN(). Suggested-by: Sean Christopherson Cc: Paolo Bonzini Signed-off-by: Oliver Upton Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index ef2d53854d158..2f2d49992d037 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -28,16 +28,6 @@ module_param(nested_early_check, bool, S_IRUGO); failed; \ }) -#define SET_MSR_OR_WARN(vcpu, idx, data) \ -({ \ - bool failed = kvm_set_msr(vcpu, idx, data); \ - if (failed) \ - pr_warn_ratelimited( \ - "%s cannot write MSR (0x%x, 0x%llx)\n", \ - __func__, idx, data); \ - failed; \ -}) - /* * Hyper-V requires all of these, so mark them as supported even though * they are just treated the same as all-context. @@ -2550,8 +2540,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && - SET_MSR_OR_WARN(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, - vmcs12->guest_ia32_perf_global_ctrl)) + WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, + vmcs12->guest_ia32_perf_global_ctrl))) return -EINVAL; kvm_rsp_write(vcpu, vmcs12->guest_rsp); @@ -3996,8 +3986,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vcpu->arch.pat = vmcs12->host_ia32_pat; } if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) - SET_MSR_OR_WARN(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, - vmcs12->host_ia32_perf_global_ctrl); + WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, + vmcs12->host_ia32_perf_global_ctrl)); /* Set L1 segment info according to Intel SDM 27.5.2 Loading Host Segment and Descriptor-Table Registers */ From de761ea792c8344b7170624dbbcba4d730e23897 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 15 Jan 2020 10:36:05 -0800 Subject: [PATCH 161/204] KVM: x86: Perform non-canonical checks in 32-bit KVM Remove the CONFIG_X86_64 condition from the low level non-canonical helpers to effectively enable non-canonical checks on 32-bit KVM. Non-canonical checks are performed by hardware if the CPU *supports* 64-bit mode, whether or not the CPU is actually in 64-bit mode is irrelevant. For the most part, skipping non-canonical checks on 32-bit KVM is ok-ish because 32-bit KVM always (hopefully) drops bits 63:32 of whatever value it's checking before propagating it to hardware, and architecturally, the expected behavior for the guest is a bit of a grey area since the vCPU itself doesn't support 64-bit mode. I.e. a 32-bit KVM guest can observe the missed checks in several paths, e.g. 
INVVPID and VM-Enter, but it's debatable whether or not the missed checks constitute a bug because technically the vCPU doesn't support 64-bit mode. The primary motivation for enabling the non-canonical checks is defense in depth. As mentioned above, a guest can trigger a missed check via INVVPID or VM-Enter. INVVPID is straightforward as it takes a 64-bit virtual address as part of its 128-bit INVVPID descriptor and fails if the address is non-canonical, even if INVVPID is executed in 32-bit PM. Nested VM-Enter is a bit more convoluted as it requires the guest to write natural width VMCS fields via memory accesses and then VMPTRLD the VMCS, but it's still possible. In both cases, KVM is saved from a true bug only because its flows that propagate values to hardware (correctly) take "unsigned long" parameters and so drop bits 63:32 of the bad value. Explicitly performing the non-canonical checks makes it less likely that a bad value will be propagated to hardware, e.g. in the INVVPID case, if __invvpid() didn't implicitly drop bits 63:32 then KVM would BUG() on the resulting unexpected INVVPID failure due to hardware rejecting the non-canonical address. The only downside to enabling the non-canonical checks is that it adds a relatively small amount of overhead, but the affected flows are not hot paths, i.e. the overhead is negligible. Note, KVM technically could gate the non-canonical checks on 32-bit KVM with static_cpu_has(X86_FEATURE_LM), but on bare metal that's an even bigger waste of code for everyone except the 0.00000000000001% of the population running on Yonah, and nested 32-bit on 64-bit already fudges things with respect to 64-bit CPU behavior. Signed-off-by: Sean Christopherson [Also do so in nested_vmx_check_host_state as reported by Krish. 
- Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 2 -- arch/x86/kvm/x86.h | 8 -------- 2 files changed, 10 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 2f2d49992d037..53ea65070b5a4 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2813,7 +2813,6 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, CC(vmcs12->host_ss_selector == 0 && !ia32e)) return -EINVAL; -#ifdef CONFIG_X86_64 if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) || CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) || CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) || @@ -2821,7 +2820,6 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) || CC(is_noncanonical_address(vmcs12->host_rip, vcpu))) return -EINVAL; -#endif /* * If the load IA32_EFER VM-exit control is 1, bits reserved in the diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index dd6e34d0a8811..e007b61b932a9 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -161,21 +161,13 @@ static inline u64 get_canonical(u64 la, u8 vaddr_bits) static inline bool is_noncanonical_address(u64 la, struct kvm_vcpu *vcpu) { -#ifdef CONFIG_X86_64 return get_canonical(la, vcpu_virt_addr_bits(vcpu)) != la; -#else - return false; -#endif } static inline bool emul_is_noncanonical_address(u64 la, struct x86_emulate_ctxt *ctxt) { -#ifdef CONFIG_X86_64 return get_canonical(la, ctxt_virt_addr_bits(ctxt)) != la; -#else - return false; -#endif } static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu, From de9bf4d4cea369355440c5afbbbc7e62302de8fd Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 15 Jan 2020 18:10:12 +0100 Subject: [PATCH 162/204] x86/kvm/hyper-v: remove stale evmcs_already_enabled check from nested_enable_evmcs() In nested_enable_evmcs() evmcs_already_enabled check doesn't really do anything: controls are already sanitized and we return '0' regardless. Just drop the check. Signed-off-by: Vitaly Kuznetsov Reviewed-by: Liran Alon Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/evmcs.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c index 72359709cdc17..89c3e0caf39f8 100644 --- a/arch/x86/kvm/vmx/evmcs.c +++ b/arch/x86/kvm/vmx/evmcs.c @@ -350,17 +350,12 @@ int nested_enable_evmcs(struct kvm_vcpu *vcpu, uint16_t *vmcs_version) { struct vcpu_vmx *vmx = to_vmx(vcpu); - bool evmcs_already_enabled = vmx->nested.enlightened_vmcs_enabled; vmx->nested.enlightened_vmcs_enabled = true; if (vmcs_version) *vmcs_version = nested_get_evmcs_version(vcpu); - /* We don't support disabling the feature for simplicity. */ - if (evmcs_already_enabled) - return 0; - vmx->nested.msrs.pinbased_ctls_high &= ~EVMCS1_UNSUPPORTED_PINCTRL; vmx->nested.msrs.entry_ctls_high &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL; vmx->nested.msrs.exit_ctls_high &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL; From ef82eddc0e3179b4529a67ed102fe4f7efba2e65 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 9 Jan 2020 09:57:11 -0500 Subject: [PATCH 163/204] KVM: Remove kvm_read_guest_atomic() Remove kvm_read_guest_atomic() because it's not used anywhere. 
Signed-off-by: Peter Xu Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 2 -- virt/kvm/kvm_main.c | 11 ----------- 2 files changed, 13 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index a654cf6df0789..83bd60f0af013 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -722,8 +722,6 @@ void kvm_get_pfn(kvm_pfn_t pfn); int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, int len); -int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, - unsigned long len); int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned long len); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index d21cf86176f0b..929d7fccf7cd9 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2030,17 +2030,6 @@ static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn, return 0; } -int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, - unsigned long len) -{ - gfn_t gfn = gpa >> PAGE_SHIFT; - struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); - int offset = offset_in_page(gpa); - - return __kvm_read_guest_atomic(slot, gfn, data, offset, len); -} -EXPORT_SYMBOL_GPL(kvm_read_guest_atomic); - int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len) { From fcd97ad58f76efcd58808941e4dd2bc6c544b9de Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 9 Jan 2020 09:57:12 -0500 Subject: [PATCH 164/204] KVM: Add build-time error check on kvm_run size It's already going to reach 2400 Bytes (which is over half of page size on 4K page archs), so maybe it's good to have this build-time check in case it overflows when adding new fields. Signed-off-by: Peter Xu Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 929d7fccf7cd9..4f3ac8b753b68 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2710,6 +2710,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) goto vcpu_decrement; } + BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE); page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!page) { r = -ENOMEM; From 2a5755bb21ee2f6567ce3eb2af10e8948df047d6 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 9 Jan 2020 09:57:14 -0500 Subject: [PATCH 165/204] KVM: X86: Don't take srcu lock in init_rmode_identity_map() We've already got the slots_lock, so we should be safe. 
Signed-off-by: Peter Xu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 62fb639895c2c..1d486e8eb4ef0 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3483,7 +3483,7 @@ static int init_rmode_tss(struct kvm *kvm) static int init_rmode_identity_map(struct kvm *kvm) { struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); - int i, idx, r = 0; + int i, r = 0; kvm_pfn_t identity_map_pfn; u32 tmp; @@ -3491,7 +3491,7 @@ static int init_rmode_identity_map(struct kvm *kvm) mutex_lock(&kvm->slots_lock); if (likely(kvm_vmx->ept_identity_pagetable_done)) - goto out2; + goto out; if (!kvm_vmx->ept_identity_map_addr) kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; @@ -3500,9 +3500,8 @@ static int init_rmode_identity_map(struct kvm *kvm) r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, kvm_vmx->ept_identity_map_addr, PAGE_SIZE); if (r < 0) - goto out2; + goto out; - idx = srcu_read_lock(&kvm->srcu); r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); if (r < 0) goto out; @@ -3518,9 +3517,6 @@ static int init_rmode_identity_map(struct kvm *kvm) kvm_vmx->ept_identity_pagetable_done = true; out: - srcu_read_unlock(&kvm->srcu, idx); - -out2: mutex_unlock(&kvm->slots_lock); return r; } From 6a3c623ba8a842f895e80a7fa0feb94b7b4368f2 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 9 Jan 2020 09:57:16 -0500 Subject: [PATCH 166/204] KVM: X86: Drop x86_set_memory_region() The helper x86_set_memory_region() is only used in vmx_set_tss_addr() and kvm_arch_destroy_vm(). Push the lock up into both callers. With that, drop x86_set_memory_region(). This prepares to allow __x86_set_memory_region() to return a mapped HVA, because the HVA will need to be protected by the lock too even after __x86_set_memory_region() returns.
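The resulting call-site pattern, sketched from the diff below (error handling elided):

	mutex_lock(&kvm->slots_lock);
	r = __x86_set_memory_region(kvm, id, gpa, size);
	/* any HVA derived from the region must be used under the same lock */
	mutex_unlock(&kvm->slots_lock);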
Signed-off-by: Peter Xu Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/vmx/vmx.c | 7 +++++-- arch/x86/kvm/x86.c | 22 +++++++--------------- 3 files changed, 12 insertions(+), 18 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 49751cbd6e638..69e31dbdfdc20 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1627,7 +1627,6 @@ void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu); int kvm_is_in_guest(void); int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size); -int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size); bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu); bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 1d486e8eb4ef0..5087bd7062f0a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4491,8 +4491,11 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) if (enable_unrestricted_guest) return 0; - ret = x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr, - PAGE_SIZE * 3); + mutex_lock(&kvm->slots_lock); + ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr, + PAGE_SIZE * 3); + mutex_unlock(&kvm->slots_lock); + if (ret) return ret; to_kvm_vmx(kvm)->tss_addr = addr; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ec8f05defd549..48cd4e191b9c5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9732,18 +9732,6 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) } EXPORT_SYMBOL_GPL(__x86_set_memory_region); -int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) -{ - int r; - - mutex_lock(&kvm->slots_lock); - r = __x86_set_memory_region(kvm, id, gpa, size); - mutex_unlock(&kvm->slots_lock); - - return r; -} -EXPORT_SYMBOL_GPL(x86_set_memory_region); - void kvm_arch_pre_destroy_vm(struct kvm *kvm) { kvm_mmu_pre_destroy_vm(kvm); @@ -9757,9 +9745,13 @@ void kvm_arch_destroy_vm(struct kvm *kvm) * unless the the memory map has changed due to process exit * or fd copying. */ - x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, 0, 0); - x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 0, 0); - x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0); + mutex_lock(&kvm->slots_lock); + __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, + 0, 0); + __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, + 0, 0); + __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0); + mutex_unlock(&kvm->slots_lock); } if (kvm_x86_ops->vm_destroy) kvm_x86_ops->vm_destroy(kvm); From 7495e22bb165e7030bae4d9c6e84addb5ea17b29 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 9 Jan 2020 09:57:19 -0500 Subject: [PATCH 167/204] KVM: Move running VCPU from ARM to common code For ring-based dirty log tracking, it will be more efficient to account writes during schedule-out or schedule-in to the currently running VCPU. We would like to do it even if the write doesn't use the current VCPU's address space, as is the case for cached writes (see commit 4e335d9e7ddb, "Revert "KVM: Support vCPU-based gfn->hva cache"", 2017-05-02). Therefore, add a mechanism to track the currently-loaded kvm_vcpu struct. There is already something similar in KVM/ARM; one important difference is that kvm_arch_vcpu_{load,put} have two callers in virt/kvm/kvm_main.c: we have to update both the architecture-independent vcpu_{load,put} and the preempt notifiers. 
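A hedged sketch of the kind of consumer this enables (mark_page_dirty() is an existing helper; attributing the write to the running vCPU is the future dirty-ring use case described above):

	struct kvm_vcpu *vcpu = kvm_get_running_vcpu();

	if (vcpu)
		/* account the write to the vCPU loaded on this physical CPU */
		mark_page_dirty(vcpu->kvm, gfn);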
Another change made in the process is to allow using kvm_get_running_vcpu() in preemptible code. This is allowed because preempt notifiers ensure that the value does not change even after the VCPU thread is migrated. Signed-off-by: Paolo Bonzini Reviewed-by: Paolo Bonzini Signed-off-by: Peter Xu Signed-off-by: Paolo Bonzini --- arch/arm/include/asm/kvm_host.h | 2 -- arch/arm64/include/asm/kvm_host.h | 2 -- include/linux/kvm_host.h | 3 +++ virt/kvm/arm/arch_timer.c | 2 +- virt/kvm/arm/arm.c | 29 ----------------------------- virt/kvm/arm/perf.c | 6 +++--- virt/kvm/arm/vgic/vgic-mmio.c | 15 +++------------ virt/kvm/kvm_main.c | 25 ++++++++++++++++++++++++- 8 files changed, 34 insertions(+), 50 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index e26cad6d11b33..4215948923044 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -284,8 +284,6 @@ int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices); int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); -struct kvm_vcpu *kvm_arm_get_running_vcpu(void); -struct kvm_vcpu __percpu **kvm_get_running_vcpus(void); void kvm_arm_halt_guest(struct kvm *kvm); void kvm_arm_resume_guest(struct kvm *kvm); diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 8ab62944e5148..eb992eaa41659 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -446,8 +446,6 @@ int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); -struct kvm_vcpu *kvm_arm_get_running_vcpu(void); -struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void); void kvm_arm_halt_guest(struct kvm *kvm); void kvm_arm_resume_guest(struct kvm *kvm); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 83bd60f0af013..48e139c293c23 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1335,6 +1335,9 @@ static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val) } #endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */ +struct kvm_vcpu *kvm_get_running_vcpu(void); +struct kvm_vcpu __percpu **kvm_get_running_vcpus(void); + #ifdef CONFIG_HAVE_KVM_IRQ_BYPASS bool kvm_arch_has_irq_bypass(void); int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *, diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index f182b23803457..63dd6f27997c0 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -1022,7 +1022,7 @@ static bool timer_irqs_are_valid(struct kvm_vcpu *vcpu) bool kvm_arch_timer_get_input_level(int vintid) { - struct kvm_vcpu *vcpu = kvm_arm_get_running_vcpu(); + struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); struct arch_timer_context *timer; if (vintid == vcpu_vtimer(vcpu)->irq.irq) diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 1cfc108eca1ef..3ff510599af63 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -51,9 +51,6 @@ __asm__(".arch_extension virt"); DEFINE_PER_CPU(kvm_host_data_t, kvm_host_data); static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); -/* Per-CPU variable containing the currently running vcpu. 
*/ -static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_arm_running_vcpu); - /* The VMID used in the VTTBR */ static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1); static u32 kvm_next_vmid; @@ -62,31 +59,8 @@ static DEFINE_SPINLOCK(kvm_vmid_lock); static bool vgic_present; static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled); - -static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu) -{ - __this_cpu_write(kvm_arm_running_vcpu, vcpu); -} - DEFINE_STATIC_KEY_FALSE(userspace_irqchip_in_use); -/** - * kvm_arm_get_running_vcpu - get the vcpu running on the current CPU. - * Must be called from non-preemptible context - */ -struct kvm_vcpu *kvm_arm_get_running_vcpu(void) -{ - return __this_cpu_read(kvm_arm_running_vcpu); -} - -/** - * kvm_arm_get_running_vcpus - get the per-CPU array of currently running vcpus. - */ -struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void) -{ - return &kvm_arm_running_vcpu; -} - int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) { return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; @@ -380,7 +354,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vcpu->cpu = cpu; vcpu->arch.host_cpu_context = &cpu_data->host_ctxt; - kvm_arm_set_running_vcpu(vcpu); kvm_vgic_load(vcpu); kvm_timer_vcpu_load(vcpu); kvm_vcpu_load_sysregs(vcpu); @@ -406,8 +379,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) kvm_vcpu_pmu_restore_host(vcpu); vcpu->cpu = -1; - - kvm_arm_set_running_vcpu(NULL); } static void vcpu_power_off(struct kvm_vcpu *vcpu) diff --git a/virt/kvm/arm/perf.c b/virt/kvm/arm/perf.c index 918cdc3839ea5..d45b8b9a44151 100644 --- a/virt/kvm/arm/perf.c +++ b/virt/kvm/arm/perf.c @@ -13,14 +13,14 @@ static int kvm_is_in_guest(void) { - return kvm_arm_get_running_vcpu() != NULL; + return kvm_get_running_vcpu() != NULL; } static int kvm_is_user_mode(void) { struct kvm_vcpu *vcpu; - vcpu = kvm_arm_get_running_vcpu(); + vcpu = kvm_get_running_vcpu(); if (vcpu) return !vcpu_mode_priv(vcpu); @@ -32,7 +32,7 @@ static unsigned long kvm_get_guest_ip(void) { struct kvm_vcpu *vcpu; - vcpu = kvm_arm_get_running_vcpu(); + vcpu = kvm_get_running_vcpu(); if (vcpu) return *vcpu_pc(vcpu); diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c index 0d090482720d2..d656ebd5f9d46 100644 --- a/virt/kvm/arm/vgic/vgic-mmio.c +++ b/virt/kvm/arm/vgic/vgic-mmio.c @@ -190,15 +190,6 @@ unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu, * value later will give us the same value as we update the per-CPU variable * in the preempt notifier handlers. 
*/ -static struct kvm_vcpu *vgic_get_mmio_requester_vcpu(void) -{ - struct kvm_vcpu *vcpu; - - preempt_disable(); - vcpu = kvm_arm_get_running_vcpu(); - preempt_enable(); - return vcpu; -} /* Must be called with irq->irq_lock held */ static void vgic_hw_irq_spending(struct kvm_vcpu *vcpu, struct vgic_irq *irq, @@ -221,7 +212,7 @@ void vgic_mmio_write_spending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { - bool is_uaccess = !vgic_get_mmio_requester_vcpu(); + bool is_uaccess = !kvm_get_running_vcpu(); u32 intid = VGIC_ADDR_TO_INTID(addr, 1); int i; unsigned long flags; @@ -274,7 +265,7 @@ void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { - bool is_uaccess = !vgic_get_mmio_requester_vcpu(); + bool is_uaccess = !kvm_get_running_vcpu(); u32 intid = VGIC_ADDR_TO_INTID(addr, 1); int i; unsigned long flags; @@ -335,7 +326,7 @@ static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq, bool active) { unsigned long flags; - struct kvm_vcpu *requester_vcpu = vgic_get_mmio_requester_vcpu(); + struct kvm_vcpu *requester_vcpu = kvm_get_running_vcpu(); raw_spin_lock_irqsave(&irq->irq_lock, flags); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4f3ac8b753b68..7837fd524296d 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -107,6 +107,7 @@ static atomic_t hardware_enable_failed; static struct kmem_cache *kvm_vcpu_cache; static __read_mostly struct preempt_ops kvm_preempt_ops; +static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu); struct dentry *kvm_debugfs_dir; EXPORT_SYMBOL_GPL(kvm_debugfs_dir); @@ -196,6 +197,8 @@ bool kvm_is_reserved_pfn(kvm_pfn_t pfn) void vcpu_load(struct kvm_vcpu *vcpu) { int cpu = get_cpu(); + + __this_cpu_write(kvm_running_vcpu, vcpu); preempt_notifier_register(&vcpu->preempt_notifier); kvm_arch_vcpu_load(vcpu, cpu); put_cpu(); @@ -207,6 +210,7 @@ void vcpu_put(struct kvm_vcpu *vcpu) preempt_disable(); kvm_arch_vcpu_put(vcpu); preempt_notifier_unregister(&vcpu->preempt_notifier); + __this_cpu_write(kvm_running_vcpu, NULL); preempt_enable(); } EXPORT_SYMBOL_GPL(vcpu_put); @@ -4288,8 +4292,8 @@ static void kvm_sched_in(struct preempt_notifier *pn, int cpu) WRITE_ONCE(vcpu->preempted, false); WRITE_ONCE(vcpu->ready, false); + __this_cpu_write(kvm_running_vcpu, vcpu); kvm_arch_sched_in(vcpu, cpu); - kvm_arch_vcpu_load(vcpu, cpu); } @@ -4303,6 +4307,25 @@ static void kvm_sched_out(struct preempt_notifier *pn, WRITE_ONCE(vcpu->ready, true); } kvm_arch_vcpu_put(vcpu); + __this_cpu_write(kvm_running_vcpu, NULL); +} + +/** + * kvm_get_running_vcpu - get the vcpu running on the current CPU. + * Thanks to preempt notifiers, this can also be called from + * preemptible context. + */ +struct kvm_vcpu *kvm_get_running_vcpu(void) +{ + return __this_cpu_read(kvm_running_vcpu); +} + +/** + * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus. + */ +struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void) +{ + return &kvm_running_vcpu; } static void check_processor_compat(void *rtn) From e174bb94831e792a50877754c78c1f340543bb64 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 16 Jan 2020 11:32:39 +0800 Subject: [PATCH 168/204] KVM: remove unused guest_enter After commit 61bd0f66ff92 ("KVM: PPC: Book3S HV: Fix guest time accounting with VIRT_CPU_ACCOUNTING_GEN"), no one uses this function anymore, so remove it.
Signed-off-by: Alex Shi Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Frederic Weisbecker Cc: linux-kernel@vger.kernel.org Signed-off-by: Paolo Bonzini --- include/linux/context_tracking.h | 9 --------- 1 file changed, 9 deletions(-) diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index 64ec82851aa38..8150f5ac176ca 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -154,15 +154,6 @@ static inline void guest_exit_irqoff(void) } #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ -static inline void guest_enter(void) -{ - unsigned long flags; - - local_irq_save(flags); - guest_enter_irqoff(); - local_irq_restore(flags); -} - static inline void guest_exit(void) { unsigned long flags; From b91991bf6b707482953c094dbd9615f6382ba2cb Mon Sep 17 00:00:00 2001 From: Krish Sadhukhan Date: Wed, 15 Jan 2020 19:54:32 -0500 Subject: [PATCH 169/204] KVM: nVMX: Check GUEST_DR7 on vmentry of nested guests According to section "Checks on Guest Control Registers, Debug Registers, and MSRs" in Intel SDM vol 3C, the following checks are performed on vmentry of nested guests: If the "load debug controls" VM-entry control is 1, bits 63:32 in the DR7 field must be 0. In KVM, GUEST_DR7 is set prior to the vmcs02 VM-entry by kvm_set_dr() and the latter synthesizes a #GP if any bit in the high dword in the former is set. Hence this field needs to be checked in software. Signed-off-by: Krish Sadhukhan Reviewed-by: Karl Heubaum Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 4 ++++ arch/x86/kvm/x86.c | 2 +- arch/x86/kvm/x86.h | 6 ++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 53ea65070b5a4..95b3f4306ac2a 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2887,6 +2887,10 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) return -EINVAL; + if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) && + CC(!kvm_dr7_valid(vmcs12->guest_dr7))) return -EINVAL; + if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && CC(!kvm_pat_valid(vmcs12->guest_ia32_pat))) return -EINVAL; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 48cd4e191b9c5..baf89d4bc653f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1082,7 +1082,7 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) case 5: /* fall through */ default: /* 7 */ - if (val & 0xffffffff00000000ULL) + if (!kvm_dr7_valid(val)) return -1; /* #GP */ vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; kvm_update_dr7(vcpu); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index e007b61b932a9..2d2ff855773b2 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -357,6 +357,12 @@ static inline bool kvm_pat_valid(u64 data) return (data | ((data & 0x0202020202020202ull) << 1)) == data; } +static inline bool kvm_dr7_valid(unsigned long data) +{ + /* Bits [63:32] are reserved */ + return !(data >> 32); +} + void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu); void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu); u64 kvm_spec_ctrl_valid_bits(struct kvm_vcpu *vcpu); From 17ac43a894ed3dd237d6def00c5ec2c7d975567e Mon Sep 17 00:00:00 2001 From: Haiwei Li Date: Thu, 16 Jan 2020 16:50:21 +0800 Subject: [PATCH 170/204] Adding 'else' to reduce checking. The two conditions are mutually exclusive, so add 'else' to avoid checking both.
Signed-off-by: Haiwei Li Reviewed-by: Vitaly Kuznetsov Reviewed-by: Miaohe Lin Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 286396c0aa7d6..cce1e6b204c8d 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1571,9 +1571,9 @@ static void kvm_apic_inject_pending_timer_irqs(struct kvm_lapic *apic) struct kvm_timer *ktimer = &apic->lapic_timer; kvm_apic_local_deliver(apic, APIC_LVTT); - if (apic_lvtt_tscdeadline(apic)) + if (apic_lvtt_tscdeadline(apic)) { ktimer->tscdeadline = 0; - if (apic_lvtt_oneshot(apic)) { + } else if (apic_lvtt_oneshot(apic)) { ktimer->tscdeadline = 0; ktimer->target_expiration = 0; } From cef6db76f3165ac01ea49e023dea17002ee91618 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 21 Jan 2020 10:15:18 -0500 Subject: [PATCH 171/204] KVM: VMX: remove duplicated segment cache clear vmx_set_segment() clears the segment cache unconditionally, so we should not clear it again by calling vmx_segment_cache_clear(). Signed-off-by: Miaohe Lin Reviewed-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 5087bd7062f0a..802ba97ac7f23 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2696,8 +2696,6 @@ static void enter_pmode(struct kvm_vcpu *vcpu) vmx->rmode.vm86_active = 0; - vmx_segment_cache_clear(vmx); - vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); flags = vmcs_readl(GUEST_RFLAGS); From 4d6d07aee8343eac68ebde9389ba829c8c17dfc7 Mon Sep 17 00:00:00 2001 From: Peng Hao Date: Wed, 27 Nov 2019 08:30:25 +0800 Subject: [PATCH 172/204] kvm/x86: exporting kvm_vector_hashing_enabled() is unnecessary kvm_vector_hashing_enabled() is only called within the kvm.ko module, so there is no need to export it. Signed-off-by: Peng Hao Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index baf89d4bc653f..7e118883d8f1d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10381,7 +10381,6 @@ bool kvm_vector_hashing_enabled(void) { return vector_hashing; } -EXPORT_SYMBOL_GPL(kvm_vector_hashing_enabled); bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) { From fcfbc617547fc6d9552cb6c1c563b6a90ee98085 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 9 Jan 2020 15:56:18 -0800 Subject: [PATCH 173/204] KVM: Check for a bad hva before dropping into the ghc slow path When reading/writing using the guest/host cache, check for a bad hva before checking for a NULL memslot, which triggers the slow path for handling cross-page accesses. Because the memslot is nullified on error by __kvm_gfn_to_hva_cache_init(), if the bad hva is encountered after crossing into a new page, then the kvm_{read,write}_guest() slow path could potentially write/access the first chunk prior to detecting the bad hva. Arguably, performing a partial access is semantically correct from an architectural perspective, but that behavior is certainly not intended. In the original implementation, memslot was not explicitly nullified and therefore the partial access behavior varied based on whether the memslot itself was null, or if the hva was simply bad.
The current behavior was introduced as a seemingly unintentional side effect in commit f1b9dd5eb86c ("kvm: Disallow wraparound in kvm_gfn_to_hva_cache_init"), which justified the change with "since some callers don't check the return code from this function, it seems prudent to clear ghc->memslot in the event of an error". Regardless of intent, the partial access is dependent on _not_ checking the result of the cache initialization, which is arguably a bug in its own right, at best simply weird. Fixes: 8f964525a121 ("KVM: Allow cross page reads and writes from cached translations.") Cc: Jim Mattson Cc: Andrew Honig Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 7837fd524296d..e8be79db2e0a7 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2180,12 +2180,12 @@ int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, if (slots->generation != ghc->generation) __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len); - if (unlikely(!ghc->memslot)) - return kvm_write_guest(kvm, gpa, data, len); - if (kvm_is_error_hva(ghc->hva)) return -EFAULT; + if (unlikely(!ghc->memslot)) + return kvm_write_guest(kvm, gpa, data, len); + r = __copy_to_user((void __user *)ghc->hva + offset, data, len); if (r) return -EFAULT; @@ -2213,12 +2213,12 @@ int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, if (slots->generation != ghc->generation) __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len); - if (unlikely(!ghc->memslot)) - return kvm_read_guest(kvm, ghc->gpa, data, len); - if (kvm_is_error_hva(ghc->hva)) return -EFAULT; + if (unlikely(!ghc->memslot)) + return kvm_read_guest(kvm, ghc->gpa, data, len); + r = __copy_from_user(data, (void __user *)ghc->hva, len); if (r) return -EFAULT; From 6ad1e29fe0aba843dfffc714fced0ef6a2e19502 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 9 Jan 2020 14:58:55 -0500 Subject: [PATCH 174/204] KVM: Clean up __kvm_gfn_to_hva_cache_init() and its callers Barret reported a (technically benign) bug where nr_pages_avail can be accessed without being initialized if gfn_to_hva_many() fails. virt/kvm/kvm_main.c:2193:13: warning: 'nr_pages_avail' may be used uninitialized in this function [-Wmaybe-uninitialized] Rather than simply squashing the warning by initializing nr_pages_avail, fix the underlying issues by reworking __kvm_gfn_to_hva_cache_init() to return immediately instead of continuing on. Now that all callers check the result and/or bail immediately on a bad hva, there's no need to explicitly nullify the memslot on error. Reported-by: Barret Rhoden Fixes: f1b9dd5eb86c ("kvm: Disallow wraparound in kvm_gfn_to_hva_cache_init") Cc: Jim Mattson Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e8be79db2e0a7..7fa56ccdbe500 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2130,33 +2130,36 @@ static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots, gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT; gfn_t nr_pages_needed = end_gfn - start_gfn + 1; gfn_t nr_pages_avail; - int r = start_gfn <= end_gfn ? 0 : -EINVAL; - ghc->gpa = gpa; + /* Update ghc->generation before performing any error checks.
*/ ghc->generation = slots->generation; - ghc->len = len; - ghc->hva = KVM_HVA_ERR_BAD; + + if (start_gfn > end_gfn) { + ghc->hva = KVM_HVA_ERR_BAD; + return -EINVAL; + } /* * If the requested region crosses two memslots, we still * verify that the entire region is valid here. */ - while (!r && start_gfn <= end_gfn) { + for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) { ghc->memslot = __gfn_to_memslot(slots, start_gfn); ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, &nr_pages_avail); if (kvm_is_error_hva(ghc->hva)) - r = -EFAULT; - start_gfn += nr_pages_avail; + return -EFAULT; } /* Use the slow path for cross page reads and writes. */ - if (!r && nr_pages_needed == 1) + if (nr_pages_needed == 1) ghc->hva += offset; else ghc->memslot = NULL; - return r; + ghc->gpa = gpa; + ghc->len = len; + return 0; } int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, From dc9ce71e66b84497c375c529d818d8e8d0d793a9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 9 Jan 2020 15:56:20 -0800 Subject: [PATCH 175/204] KVM: Return immediately if __kvm_gfn_to_hva_cache_init() fails Check the result of __kvm_gfn_to_hva_cache_init() and return immediately instead of relying on the kvm_is_error_hva() check to detect errors so that it's abundantly clear KVM intends to immediately bail on an error. Note, the hva check is still mandatory to handle errors on subsequent calls with the same generation. Similarly, always return -EFAULT on error so that multiple (bad) calls for a given generation will get the same result, e.g. on an illegal gfn wrap, propagating the return from __kvm_gfn_to_hva_cache_init() would cause the initial call to return -EINVAL and subsequent calls to return -EFAULT. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 7fa56ccdbe500..ffec9f427b558 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2180,8 +2180,10 @@ int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, BUG_ON(len + offset > ghc->len); - if (slots->generation != ghc->generation) - __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len); + if (slots->generation != ghc->generation) { + if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len)) + return -EFAULT; + } if (kvm_is_error_hva(ghc->hva)) return -EFAULT; @@ -2213,8 +2215,10 @@ int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, BUG_ON(len > ghc->len); - if (slots->generation != ghc->generation) - __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len); + if (slots->generation != ghc->generation) { + if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len)) + return -EFAULT; + } if (kvm_is_error_hva(ghc->hva)) return -EFAULT; From 22b1d57b032cea4d612746473ed28cb20665d876 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Jan 2020 12:24:35 -0800 Subject: [PATCH 176/204] KVM: x86/mmu: Enforce max_level on HugeTLB mappings Limit KVM's mapping level for HugeTLB based on its calculated max_level. The max_level check prior to invoking host_mapping_level() only filters out the case where KVM cannot create a 2mb mapping; it doesn't handle the scenario where KVM can create a 2mb but not 1gb mapping, and the host is using a 1gb HugeTLB mapping.
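A hedged sketch of the failure mode the min() fixes, using illustrative level numbers (1 = 4kb, 2 = 2mb, 3 = 1gb); the helper name is hypothetical:

static int clamped_mapping_level(int host_level, int max_level)
{
        /* If the host backs the gfn with a 1gb HugeTLB page (3) but KVM
         * may create at most a 2mb mapping (max_level = 2), returning
         * host_level alone would claim a mapping KVM cannot create. */
        return host_level < max_level ? host_level : max_level;
}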
Fixes: 2f57b7051fe8 ("KVM: x86/mmu: Persist gfn_lpage_is_disallowed() to max_level") Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index b9052c7ba43d2..db597f57cdc26 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1330,7 +1330,7 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, int *max_levelp) { - int max_level = *max_levelp; + int host_level, max_level = *max_levelp; struct kvm_memory_slot *slot; if (unlikely(max_level == PT_PAGE_TABLE_LEVEL)) @@ -1362,7 +1362,8 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, * So, do not propagate host_mapping_level() to max_level as KVM can * still promote the guest mapping to a huge page in the THP case. */ - return host_mapping_level(vcpu->kvm, large_gfn); + host_level = host_mapping_level(vcpu->kvm, large_gfn); + return min(host_level, max_level); } /* From 005ba37cb89bcc0cf63c2029a41f8db165aeb615 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Jan 2020 12:24:36 -0800 Subject: [PATCH 177/204] mm: thp: KVM: Explicitly check for THP when populating secondary MMU Add a helper, is_transparent_hugepage(), to explicitly check whether a compound page is a THP and use it when populating KVM's secondary MMU. The explicit check fixes a bug where a remapped compound page, e.g. for an XDP Rx socket, is mapped into a KVM guest and is mistaken for a THP, which results in KVM incorrectly creating a huge page in its secondary MMU. Fixes: 936a5fe6e6148 ("thp: kvm mmu transparent hugepage support") Reported-by: syzbot+c9d1fb51ac9d0d10c39d@syzkaller.appspotmail.com Cc: Andrea Arcangeli Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 4 ++-- include/linux/huge_mm.h | 6 ++++++ include/linux/kvm_host.h | 1 + mm/huge_memory.c | 11 +++++++++++ virt/kvm/arm/mmu.c | 8 +------- virt/kvm/kvm_main.c | 10 ++++++++++ 6 files changed, 31 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index db597f57cdc26..7eb21a22cc135 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3344,7 +3344,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, */ if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL && - PageTransCompoundMap(pfn_to_page(pfn))) { + kvm_is_transparent_hugepage(pfn)) { unsigned long mask; /* @@ -5961,7 +5961,7 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, */ if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn) && - PageTransCompoundMap(pfn_to_page(pfn))) { + kvm_is_transparent_hugepage(pfn)) { pte_list_remove(rmap_head, sptep); if (kvm_available_flush_tlb_with_range()) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 0b84e13e88e2d..5aca3d1bdb32c 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -160,6 +160,7 @@ extern unsigned long thp_get_unmapped_area(struct file *filp, extern void prep_transhuge_page(struct page *page); extern void free_transhuge_page(struct page *page); +bool is_transparent_hugepage(struct page *page); bool can_split_huge_page(struct page *page, int *pextra_pins); int split_huge_page_to_list(struct page *page, struct list_head *list); @@ -308,6 +309,11 @@ static 
inline bool transhuge_vma_suitable(struct vm_area_struct *vma, static inline void prep_transhuge_page(struct page *page) {} +static inline bool is_transparent_hugepage(struct page *page) +{ + return false; +} + #define transparent_hugepage_flags 0UL #define thp_get_unmapped_area NULL diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 48e139c293c23..46fdb75336787 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -976,6 +976,7 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); bool kvm_is_reserved_pfn(kvm_pfn_t pfn); bool kvm_is_zone_device_pfn(kvm_pfn_t pfn); +bool kvm_is_transparent_hugepage(kvm_pfn_t pfn); struct kvm_irq_ack_notifier { struct hlist_node link; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 41a0fbddc96bb..9b3ee79d0edf2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -527,6 +527,17 @@ void prep_transhuge_page(struct page *page) set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR); } +bool is_transparent_hugepage(struct page *page) +{ + if (!PageCompound(page)) + return 0; + + page = compound_head(page); + return is_huge_zero_page(page) || + page[1].compound_dtor == TRANSHUGE_PAGE_DTOR; +} +EXPORT_SYMBOL_GPL(is_transparent_hugepage); + static unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len, loff_t off, unsigned long flags, unsigned long size) { diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index 0b32a904a1bb3..dc8254bf30ea0 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -1377,14 +1377,8 @@ static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap) { kvm_pfn_t pfn = *pfnp; gfn_t gfn = *ipap >> PAGE_SHIFT; - struct page *page = pfn_to_page(pfn); - /* - * PageTransCompoundMap() returns true for THP and - * hugetlbfs. Make sure the adjustment is done only for THP - * pages. - */ - if (!PageHuge(page) && PageTransCompoundMap(page)) { + if (kvm_is_transparent_hugepage(pfn)) { unsigned long mask; /* * The address we faulted on is backed by a transparent huge diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ffec9f427b558..64e9e9d65ed4e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -191,6 +191,16 @@ bool kvm_is_reserved_pfn(kvm_pfn_t pfn) return true; } +bool kvm_is_transparent_hugepage(kvm_pfn_t pfn) +{ + struct page *page = pfn_to_page(pfn); + + if (!PageTransCompoundMap(page)) + return false; + + return is_transparent_hugepage(compound_head(page)); +} + /* * Switches to specified vcpu, until a matching vcpu_put() */ From f9b84e19221efc5f493156ee0329df3142085f28 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Jan 2020 12:24:37 -0800 Subject: [PATCH 178/204] KVM: Use vcpu-specific gva->hva translation when querying host page size Use kvm_vcpu_gfn_to_hva() when retrieving the host page size so that the correct set of memslots is used when handling x86 page faults in SMM. 
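A hedged sketch of why the vcpu-specific lookup matters: x86 gives SMM its own memslot address space, and the per-vcpu helpers pick the memslot set for the vcpu's current address space while the kvm-wide helpers always use address space 0. The structure and names below are simplified, not the kernel's exact code:

struct kvm_memslots; /* opaque for this sketch */

struct sketch_kvm {
        struct kvm_memslots *slots[2]; /* [0] = normal, [1] = SMM */
};

static struct kvm_memslots *vcpu_memslots_sketch(struct sketch_kvm *kvm,
                                                 int as_id)
{
        /* A kvm-wide lookup would always pick slots[0], yielding the
         * wrong hva (and thus page size) for faults taken in SMM. */
        return kvm->slots[as_id];
}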
Fixes: 54bf36aac520 ("KVM: x86: use vcpu-specific functions to read/write/translate GFNs") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/book3s_xive_native.c | 2 +- arch/x86/kvm/mmu/mmu.c | 6 +++--- include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index d83adb1e14902..6ef0151ff70a9 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -631,7 +631,7 @@ static int kvmppc_xive_native_set_queue_config(struct kvmppc_xive *xive, srcu_idx = srcu_read_lock(&kvm->srcu); gfn = gpa_to_gfn(kvm_eq.qaddr); - page_size = kvm_host_page_size(kvm, gfn); + page_size = kvm_host_page_size(vcpu, gfn); if (1ull << kvm_eq.qshift > page_size) { srcu_read_unlock(&kvm->srcu, srcu_idx); pr_warn("Incompatible host page size %lx!\n", page_size); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 7eb21a22cc135..e4458c9aec8c8 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1286,12 +1286,12 @@ static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn, return __mmu_gfn_lpage_is_disallowed(gfn, level, slot); } -static int host_mapping_level(struct kvm *kvm, gfn_t gfn) +static int host_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn) { unsigned long page_size; int i, ret = 0; - page_size = kvm_host_page_size(kvm, gfn); + page_size = kvm_host_page_size(vcpu, gfn); for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { if (page_size >= KVM_HPAGE_SIZE(i)) @@ -1362,7 +1362,7 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, * So, do not propagate host_mapping_level() to max_level as KVM can * still promote the guest mapping to a huge page in the THP case. 
*/ - host_level = host_mapping_level(vcpu->kvm, large_gfn); + host_level = host_mapping_level(vcpu, large_gfn); return min(host_level, max_level); } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 46fdb75336787..6d5331b0d9378 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -762,7 +762,7 @@ int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len); struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn); -unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn); +unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn); void mark_page_dirty(struct kvm *kvm, gfn_t gfn); struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 64e9e9d65ed4e..f6f8ffc2e865c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1402,14 +1402,14 @@ bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) } EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); -unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn) +unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn) { struct vm_area_struct *vma; unsigned long addr, size; size = PAGE_SIZE; - addr = gfn_to_hva(kvm, gfn); + addr = kvm_vcpu_gfn_to_hva(vcpu, gfn); if (kvm_is_error_hva(addr)) return PAGE_SIZE; From 42cde48b2d39772dba47e680781a32a6c4b7dc33 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Jan 2020 12:24:38 -0800 Subject: [PATCH 179/204] KVM: Play nice with read-only memslots when querying host page size Avoid the "writable" check in __gfn_to_hva_many(), which will always fail on read-only memslots due to gfn_to_hva() assuming writes. Functionally, this allows x86 to create large mappings for read-only memslots that are backed by HugeTLB mappings. Note, the changelog for commit 05da45583de9 ("KVM: MMU: large page support") states "If the largepage contains write-protected pages, a large pte is not used.", but "write-protected" refers to pages that are temporarily read-only, e.g. read-only memslots didn't even exist at the time. Fixes: 4d8b81abc47b ("KVM: introduce readonly memslot") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson [Redone using kvm_vcpu_gfn_to_memslot_prot. - Paolo] Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index f6f8ffc2e865c..eb3709d551394 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1409,7 +1409,7 @@ unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn) size = PAGE_SIZE; - addr = kvm_vcpu_gfn_to_hva(vcpu, gfn); + addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL); if (kvm_is_error_hva(addr)) return PAGE_SIZE; From 13c72c060f1ba6f4eddd7b1c4f52a8aded43d6d9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Jan 2020 12:24:39 -0800 Subject: [PATCH 180/204] x86/mm: Introduce lookup_address_in_mm() Add a helper, lookup_address_in_mm(), to traverse the page tables of a given mm struct. KVM will use the helper to retrieve the host mapping level, e.g. 4k vs. 2mb vs. 1gb, of a compound (or DAX-backed) page without having to resort to implementation specific metadata. E.g. KVM currently uses different logic for HugeTLB vs. THP, and would add a third variant for DAX-backed files. 
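A hedged usage sketch of the new helper in kernel context; the wrapper name is hypothetical, and the level constants come from the x86 enum pg_level:

static bool hva_mapped_2m_sketch(struct mm_struct *mm, unsigned long hva)
{
        unsigned int level;
        pte_t *pte = lookup_address_in_mm(mm, hva, &level);

        /* The out-parameter reports the size of the mapping found. */
        return pte && pte_present(*pte) && level == PG_LEVEL_2M;
}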
Cc: Dan Williams Signed-off-by: Sean Christopherson Reviewed-by: Thomas Gleixner Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/pgtable_types.h | 4 ++++ arch/x86/mm/pageattr.c | 11 +++++++++++ 2 files changed, 15 insertions(+) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index b5e49e6bac635..b68f72adb53ed 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -561,6 +561,10 @@ static inline void update_page_count(int level, unsigned long pages) { } extern pte_t *lookup_address(unsigned long address, unsigned int *level); extern pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, unsigned int *level); + +struct mm_struct; +extern pte_t *lookup_address_in_mm(struct mm_struct *mm, unsigned long address, + unsigned int *level); extern pmd_t *lookup_pmd_address(unsigned long address); extern phys_addr_t slow_virt_to_phys(void *__address); extern int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 1b99ad05b1177..2c70a8b20b04d 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -618,6 +618,17 @@ pte_t *lookup_address(unsigned long address, unsigned int *level) } EXPORT_SYMBOL_GPL(lookup_address); +/* + * Lookup the page table entry for a virtual address in a given mm. Return a + * pointer to the entry and the level of the mapping. + */ +pte_t *lookup_address_in_mm(struct mm_struct *mm, unsigned long address, + unsigned int *level) +{ + return lookup_address_in_pgd(pgd_offset(mm, address), address, level); +} +EXPORT_SYMBOL_GPL(lookup_address_in_mm); + static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address, unsigned int *level) { From 17eff01904f5f2fa12f4a56666637ce69ce5c645 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Jan 2020 12:24:40 -0800 Subject: [PATCH 181/204] KVM: x86/mmu: Refactor THP adjust to prep for changing query Refactor transparent_hugepage_adjust() in preparation for walking the host page tables to identify hugepage mappings, initially for THP pages, and eventually for HugeTLB and DAX-backed pages as well. The latter cases support 1gb pages, i.e. the adjustment logic needs access to the max allowed level. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 44 +++++++++++++++++----------------- arch/x86/kvm/mmu/paging_tmpl.h | 3 +-- 2 files changed, 23 insertions(+), 24 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index e4458c9aec8c8..64c28a39d8ef7 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3329,33 +3329,34 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) __direct_pte_prefetch(vcpu, sp, sptep); } -static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, - gfn_t gfn, kvm_pfn_t *pfnp, +static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, + int max_level, kvm_pfn_t *pfnp, int *levelp) { kvm_pfn_t pfn = *pfnp; int level = *levelp; + kvm_pfn_t mask; + + if (max_level == PT_PAGE_TABLE_LEVEL || level > PT_PAGE_TABLE_LEVEL) + return; + + if (is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn) || + kvm_is_zone_device_pfn(pfn)) + return; + + if (!kvm_is_transparent_hugepage(pfn)) + return; + + level = PT_DIRECTORY_LEVEL; /* - * Check if it's a transparent hugepage. If this would be an - * hugetlbfs page, level wouldn't be set to - * PT_PAGE_TABLE_LEVEL and there would be no adjustment done - * here.
+ * mmu_notifier_retry() was successful and mmu_lock is held, so + * the pmd can't be split from under us. */ - if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) && - !kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL && - kvm_is_transparent_hugepage(pfn)) { - unsigned long mask; - - /* - * mmu_notifier_retry() was successful and mmu_lock is held, so - * the pmd can't be split from under us. - */ - *levelp = level = PT_DIRECTORY_LEVEL; - mask = KVM_PAGES_PER_HPAGE(level) - 1; - VM_BUG_ON((gfn & mask) != (pfn & mask)); - *pfnp = pfn & ~mask; - } + *levelp = level; + mask = KVM_PAGES_PER_HPAGE(level) - 1; + VM_BUG_ON((gfn & mask) != (pfn & mask)); + *pfnp = pfn & ~mask; } static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it, @@ -3395,8 +3396,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) return RET_PF_RETRY; - if (likely(max_level > PT_PAGE_TABLE_LEVEL)) - transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); + transparent_hugepage_adjust(vcpu, gfn, max_level, &pfn, &level); trace_kvm_mmu_spte_requested(gpa, level, pfn); for_each_shadow_entry(vcpu, gpa, it) { diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index eaa00c4daeb11..1ad87f0e19d0e 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -688,8 +688,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, gfn = gw->gfn | ((addr & PT_LVL_OFFSET_MASK(gw->level)) >> PAGE_SHIFT); base_gfn = gfn; - if (max_level > PT_PAGE_TABLE_LEVEL) - transparent_hugepage_adjust(vcpu, gw->gfn, &pfn, &hlevel); + transparent_hugepage_adjust(vcpu, gw->gfn, max_level, &pfn, &hlevel); trace_kvm_mmu_spte_requested(addr, gw->level, pfn); From db5432165e9b51d2a36572be38d078e79f8df0d8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Jan 2020 12:24:41 -0800 Subject: [PATCH 182/204] KVM: x86/mmu: Walk host page tables to find THP mappings Explicitly walk the host page tables to identify THP mappings instead of relying solely on the metadata in struct page. This sets the stage for using a common method of identifying huge mappings regardless of the underlying implementation (HugeTLB vs THP vs DAX), and hopefully avoids the pitfalls of relying on metadata to identify THP mappings, e.g. see commit 169226f7e0d2 ("mm: thp: handle page cache THP correctly in PageTransCompoundMap") and the need for KVM to explicitly check for a THP compound page. KVM will also naturally work with 1gb THP pages, if they are ever supported. Walking the tables for THP mappings is likely marginally slower than querying metadata, but a future patch will reuse the walk to identify HugeTLB mappings, at which point eliminating the existing VMA lookup for HugeTLB will make this a net positive.
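A hedged before/after sketch of the detection change, reusing the helper added by this patch; illustrative, not a literal hunk:

static bool pfn_mapped_huge_sketch(struct kvm_vcpu *vcpu, gfn_t gfn,
                                   kvm_pfn_t pfn)
{
        /* Metadata approach being replaced: a remapped compound page
         * (e.g. an XDP Rx buffer) can pass PageTransCompoundMap()
         * without actually being mapped huge at this hva:
         *
         *      return PageTransCompoundMap(pfn_to_page(pfn));
         */

        /* Host-walk approach: ask the page tables how it is mapped. */
        return host_pfn_mapping_level(vcpu, gfn, pfn) > PT_PAGE_TABLE_LEVEL;
}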
Cc: Andrea Arcangeli Cc: Barret Rhoden Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 64c28a39d8ef7..3acaadb7acb88 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3329,6 +3329,34 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) __direct_pte_prefetch(vcpu, sp, sptep); } +static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn, + kvm_pfn_t pfn) +{ + struct kvm_memory_slot *slot; + unsigned long hva; + pte_t *pte; + int level; + + BUILD_BUG_ON(PT_PAGE_TABLE_LEVEL != (int)PG_LEVEL_4K || + PT_DIRECTORY_LEVEL != (int)PG_LEVEL_2M || + PT_PDPE_LEVEL != (int)PG_LEVEL_1G); + + if (!PageCompound(pfn_to_page(pfn))) + return PT_PAGE_TABLE_LEVEL; + + slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, true); + if (!slot) + return PT_PAGE_TABLE_LEVEL; + + hva = __gfn_to_hva_memslot(slot, gfn); + + pte = lookup_address_in_mm(vcpu->kvm->mm, hva, &level); + if (unlikely(!pte)) + return PT_PAGE_TABLE_LEVEL; + + return level; +} + static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, int max_level, kvm_pfn_t *pfnp, int *levelp) @@ -3344,10 +3372,11 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_is_zone_device_pfn(pfn)) return; - if (!kvm_is_transparent_hugepage(pfn)) + level = host_pfn_mapping_level(vcpu, gfn, pfn); + if (level == PT_PAGE_TABLE_LEVEL) return; - level = PT_DIRECTORY_LEVEL; + level = min(level, max_level); /* * mmu_notifier_retry() was successful and mmu_lock is held, so From f9fa2509e5ca8229b4baca295865b542803bf25d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Jan 2020 12:24:42 -0800 Subject: [PATCH 183/204] KVM: x86/mmu: Drop level optimization from fast_page_fault() Remove fast_page_fault()'s optimization to stop the shadow walk if the iterator level drops below the intended map level. The intended map level is only accurate for HugeTLB mappings (THP mappings are detected after fast_page_fault()), i.e. it's not required for correctness, and a future patch will also move HugeTLB mapping detection to after fast_page_fault(). Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 3acaadb7acb88..17645c2d23e12 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3586,7 +3586,7 @@ static bool is_access_allowed(u32 fault_err_code, u64 spte) * - true: let the vcpu to access on the same address again. * - false: let the real page fault path to fix it.
*/ -static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int level, +static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 error_code) { struct kvm_shadow_walk_iterator iterator; @@ -3604,8 +3604,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int level, u64 new_spte; for_each_shadow_entry_lockless(vcpu, cr2_or_gpa, iterator, spte) - if (!is_shadow_present_pte(spte) || - iterator.level < level) + if (!is_shadow_present_pte(spte)) break; sp = page_header(__pa(iterator.sptep)); @@ -4218,7 +4217,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (level > PT_PAGE_TABLE_LEVEL) gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); - if (fast_page_fault(vcpu, gpa, level, error_code)) + if (fast_page_fault(vcpu, gpa, error_code)) return RET_PF_RETRY; mmu_seq = vcpu->kvm->mmu_notifier_seq; From 83f06fa7a6fd9d5758e5f8438e2137f25f6f2e6b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Jan 2020 12:24:43 -0800 Subject: [PATCH 184/204] KVM: x86/mmu: Rely on host page tables to find HugeTLB mappings Remove KVM's HugeTLB specific logic and instead rely on walking the host page tables (already done for THP) to identify HugeTLB mappings. Eliminating the HugeTLB-only logic avoids taking mmap_sem and calling find_vma() for all hugepage compatible page faults, and simplifies KVM's page fault code by consolidating all hugepage adjustments into a common helper. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 84 ++++++++++------------------------ arch/x86/kvm/mmu/paging_tmpl.h | 15 +++--- 2 files changed, 29 insertions(+), 70 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 17645c2d23e12..6be0239dcfbff 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1286,23 +1286,6 @@ static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn, return __mmu_gfn_lpage_is_disallowed(gfn, level, slot); } -static int host_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn) -{ - unsigned long page_size; - int i, ret = 0; - - page_size = kvm_host_page_size(vcpu, gfn); - - for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { - if (page_size >= KVM_HPAGE_SIZE(i)) - ret = i; - else - break; - } - - return ret; -} - static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot, bool no_dirty_log) { @@ -1327,43 +1310,25 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, return slot; } -static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, - int *max_levelp) +static int max_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn, + int max_level) { - int host_level, max_level = *max_levelp; struct kvm_memory_slot *slot; if (unlikely(max_level == PT_PAGE_TABLE_LEVEL)) return PT_PAGE_TABLE_LEVEL; - slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn); - if (!memslot_valid_for_gpte(slot, true)) { - *max_levelp = PT_PAGE_TABLE_LEVEL; + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + if (!memslot_valid_for_gpte(slot, true)) return PT_PAGE_TABLE_LEVEL; - } max_level = min(max_level, kvm_x86_ops->get_lpage_level()); for ( ; max_level > PT_PAGE_TABLE_LEVEL; max_level--) { - if (!__mmu_gfn_lpage_is_disallowed(large_gfn, max_level, slot)) + if (!__mmu_gfn_lpage_is_disallowed(gfn, max_level, slot)) break; } - *max_levelp = max_level; - - if (max_level == PT_PAGE_TABLE_LEVEL) - return PT_PAGE_TABLE_LEVEL; - - /* - * Note, host_mapping_level() does *not* handle transparent huge pages. 
- * As suggested by "mapping", it reflects the page size established by - * the associated vma, if there is one, i.e. host_mapping_level() will - * return a huge page level if and only if a vma exists and the backing - * implementation for the vma uses huge pages, e.g. hugetlbfs and dax. - * So, do not propagate host_mapping_level() to max_level as KVM can - * still promote the guest mapping to a huge page in the THP case. - */ - host_level = host_mapping_level(vcpu, large_gfn); - return min(host_level, max_level); + return max_level; } /* @@ -3137,7 +3102,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, /* * Other vcpu creates new sp in the window between - * mapping_level() and acquiring mmu-lock. We can + * max_mapping_level() and acquiring mmu-lock. We can * allow guest to retry the access, the mapping can * be fixed if guest refault. */ @@ -3357,24 +3322,23 @@ static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn, return level; } -static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, - int max_level, kvm_pfn_t *pfnp, - int *levelp) +static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, + int max_level, kvm_pfn_t *pfnp) { kvm_pfn_t pfn = *pfnp; - int level = *levelp; kvm_pfn_t mask; + int level; - if (max_level == PT_PAGE_TABLE_LEVEL || level > PT_PAGE_TABLE_LEVEL) - return; + if (max_level == PT_PAGE_TABLE_LEVEL) + return PT_PAGE_TABLE_LEVEL; if (is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn) || kvm_is_zone_device_pfn(pfn)) - return; + return PT_PAGE_TABLE_LEVEL; level = host_pfn_mapping_level(vcpu, gfn, pfn); if (level == PT_PAGE_TABLE_LEVEL) - return; + return level; level = min(level, max_level); @@ -3382,10 +3346,11 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, * mmu_notifier_retry() was successful and mmu_lock is held, so * the pmd can't be split from under us. 
*/ - *levelp = level; mask = KVM_PAGES_PER_HPAGE(level) - 1; VM_BUG_ON((gfn & mask) != (pfn & mask)); *pfnp = pfn & ~mask; + + return level; } static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it, @@ -3412,20 +3377,19 @@ static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it, } static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, - int map_writable, int level, int max_level, - kvm_pfn_t pfn, bool prefault, - bool account_disallowed_nx_lpage) + int map_writable, int max_level, kvm_pfn_t pfn, + bool prefault, bool account_disallowed_nx_lpage) { struct kvm_shadow_walk_iterator it; struct kvm_mmu_page *sp; - int ret; + int level, ret; gfn_t gfn = gpa >> PAGE_SHIFT; gfn_t base_gfn = gfn; if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) return RET_PF_RETRY; - transparent_hugepage_adjust(vcpu, gfn, max_level, &pfn, &level); + level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn); trace_kvm_mmu_spte_requested(gpa, level, pfn); for_each_shadow_entry(vcpu, gpa, it) { @@ -4201,7 +4165,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, gfn_t gfn = gpa >> PAGE_SHIFT; unsigned long mmu_seq; kvm_pfn_t pfn; - int level, r; + int r; if (page_fault_handle_page_track(vcpu, error_code, gfn)) return RET_PF_EMULATE; @@ -4213,9 +4177,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (lpage_disallowed) max_level = PT_PAGE_TABLE_LEVEL; - level = mapping_level(vcpu, gfn, &max_level); - if (level > PT_PAGE_TABLE_LEVEL) - gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); + max_level = max_mapping_level(vcpu, gfn, max_level); if (fast_page_fault(vcpu, gpa, error_code)) return RET_PF_RETRY; @@ -4235,7 +4197,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, goto out_unlock; if (make_mmu_pages_available(vcpu) < 0) goto out_unlock; - r = __direct_map(vcpu, gpa, write, map_writable, level, max_level, pfn, + r = __direct_map(vcpu, gpa, write, map_writable, max_level, pfn, prefault, is_tdp && lpage_disallowed); out_unlock: diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 1ad87f0e19d0e..472c32cdf2ff5 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -628,14 +628,14 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, */ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, struct guest_walker *gw, - int write_fault, int hlevel, int max_level, + int write_fault, int max_level, kvm_pfn_t pfn, bool map_writable, bool prefault, bool lpage_disallowed) { struct kvm_mmu_page *sp = NULL; struct kvm_shadow_walk_iterator it; unsigned direct_access, access = gw->pt_access; - int top_level, ret; + int top_level, hlevel, ret; gfn_t gfn, base_gfn; direct_access = gw->pte_access; @@ -688,7 +688,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, gfn = gw->gfn | ((addr & PT_LVL_OFFSET_MASK(gw->level)) >> PAGE_SHIFT); base_gfn = gfn; - transparent_hugepage_adjust(vcpu, gw->gfn, max_level, &pfn, &hlevel); + hlevel = kvm_mmu_hugepage_adjust(vcpu, gw->gfn, max_level, &pfn); trace_kvm_mmu_spte_requested(addr, gw->level, pfn); @@ -790,7 +790,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, struct guest_walker walker; int r; kvm_pfn_t pfn; - int level; unsigned long mmu_seq; bool map_writable, is_self_change_mapping; bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && @@ -840,9 +839,7 @@ static int FNAME(page_fault)(struct kvm_vcpu 
*vcpu, gpa_t addr, u32 error_code, else max_level = walker.level; - level = mapping_level(vcpu, walker.gfn, &max_level); - if (level > PT_PAGE_TABLE_LEVEL) - walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); + max_level = max_mapping_level(vcpu, walker.gfn, max_level); mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); @@ -882,8 +879,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); if (make_mmu_pages_available(vcpu) < 0) goto out_unlock; - r = FNAME(fetch)(vcpu, addr, &walker, write_fault, level, max_level, - pfn, map_writable, prefault, lpage_disallowed); + r = FNAME(fetch)(vcpu, addr, &walker, write_fault, max_level, pfn, + map_writable, prefault, lpage_disallowed); kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); out_unlock: From 09c4453ee8e63a710774ae10da7909289aa6a58e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Jan 2020 12:24:44 -0800 Subject: [PATCH 185/204] KVM: x86/mmu: Remove obsolete gfn restoration in FNAME(fetch) Remove logic to retrieve the original gfn now that HugeTLB mappings are identified in FNAME(fetch), i.e. FNAME(page_fault) no longer adjusts the level or gfn. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/paging_tmpl.h | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 472c32cdf2ff5..885da924827c1 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -636,7 +636,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, struct kvm_shadow_walk_iterator it; unsigned direct_access, access = gw->pt_access; int top_level, hlevel, ret; - gfn_t gfn, base_gfn; + gfn_t base_gfn = gw->gfn; direct_access = gw->pte_access; @@ -681,13 +681,6 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, link_shadow_page(vcpu, it.sptep, sp); } - /* - * FNAME(page_fault) might have clobbered the bottom bits of - * gw->gfn, restore them from the virtual address. - */ - gfn = gw->gfn | ((addr & PT_LVL_OFFSET_MASK(gw->level)) >> PAGE_SHIFT); - base_gfn = gfn; - hlevel = kvm_mmu_hugepage_adjust(vcpu, gw->gfn, max_level, &pfn); trace_kvm_mmu_spte_requested(addr, gw->level, pfn); @@ -699,9 +692,9 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, * We cannot overwrite existing page tables with an NX * large page, as the leaf could be executable. */ - disallowed_hugepage_adjust(it, gfn, &pfn, &hlevel); + disallowed_hugepage_adjust(it, gw->gfn, &pfn, &hlevel); - base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); + base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); if (it.level == hlevel) break; From d32ec81bab670e599e645e1d1d5231d62de7d0d6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Jan 2020 12:24:45 -0800 Subject: [PATCH 186/204] KVM: x86/mmu: Zap any compound page when collapsing sptes Zap any compound page, e.g. THP or HugeTLB pages, when zapping sptes that can potentially be converted to huge sptes after disabling dirty logging on the associated memslot. Note, this approach could result in false positives, e.g. if a random compound page is mapped into the guest, but mapping non-huge compound pages into the guest is far from the norm, and toggling dirty logging is not a frequent operation.
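A hedged sketch of the deliberately coarse zap-side test (helper name hypothetical); a false positive merely forces a refault, while a false negative would leave a 4k spte where a huge mapping is now possible:

static bool spte_candidate_for_huge_remap_sketch(kvm_pfn_t pfn)
{
        /* Any compound page, THP or HugeTLB, might now be mappable
         * huge; precision is not worth the cost on this slow path. */
        return !kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn) &&
               PageCompound(pfn_to_page(pfn));
}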
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 6be0239dcfbff..9090842ccd10b 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5951,7 +5951,7 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, */ if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn) && - kvm_is_transparent_hugepage(pfn)) { + PageCompound(pfn_to_page(pfn))) { pte_list_remove(rmap_head, sptep); if (kvm_available_flush_tlb_with_range()) From 293e306e7faac4eafaefb9518a1cd6eaecad88e9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Jan 2020 12:24:46 -0800 Subject: [PATCH 187/204] KVM: x86/mmu: Fold max_mapping_level() into kvm_mmu_hugepage_adjust() Fold max_mapping_level() into kvm_mmu_hugepage_adjust() now that HugeTLB mappings are handled in kvm_mmu_hugepage_adjust(), i.e. there isn't a need to pre-calculate the max mapping level. Co-locating all hugepage checks eliminates a memslot lookup, at the cost of performing the __mmu_gfn_lpage_is_disallowed() checks while holding mmu_lock. The latency of lpage_is_disallowed() is likely negligible relative to the rest of the code run while holding mmu_lock, and can be offset to some extent by eliminating the mmu_gfn_lpage_is_disallowed() check in set_spte() in a future patch. Eliminating the check in set_spte() is made possible by performing the initial lpage_is_disallowed() checks while holding mmu_lock. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 65 ++++++++++++++++------------------ arch/x86/kvm/mmu/paging_tmpl.h | 2 -- 2 files changed, 30 insertions(+), 37 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 9090842ccd10b..812c69f7f552c 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1310,27 +1310,6 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, return slot; } -static int max_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn, - int max_level) -{ - struct kvm_memory_slot *slot; - - if (unlikely(max_level == PT_PAGE_TABLE_LEVEL)) - return PT_PAGE_TABLE_LEVEL; - - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - if (!memslot_valid_for_gpte(slot, true)) - return PT_PAGE_TABLE_LEVEL; - - max_level = min(max_level, kvm_x86_ops->get_lpage_level()); - for ( ; max_level > PT_PAGE_TABLE_LEVEL; max_level--) { - if (!__mmu_gfn_lpage_is_disallowed(gfn, max_level, slot)) - break; - } - - return max_level; -} - /* * About rmap_head encoding: * @@ -3101,10 +3080,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (pte_access & ACC_WRITE_MASK) { /* - * Other vcpu creates new sp in the window between - * max_mapping_level() and acquiring mmu-lock. We can - * allow guest to retry the access, the mapping can - * be fixed if guest refault. + * Legacy code to handle an obsolete scenario where a different + * vcpu creates new sp in the window between this vcpu's query + * of lpage_is_disallowed() and acquiring mmu_lock. No longer + * necessary now that lpage_is_disallowed() is called after + * acquiring mmu_lock. 
*/ if (level > PT_PAGE_TABLE_LEVEL && mmu_gfn_lpage_is_disallowed(vcpu, gfn, level)) @@ -3295,9 +3275,8 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) } static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn, - kvm_pfn_t pfn) + kvm_pfn_t pfn, struct kvm_memory_slot *slot) { - struct kvm_memory_slot *slot; unsigned long hva; pte_t *pte; int level; @@ -3309,10 +3288,14 @@ static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn, if (!PageCompound(pfn_to_page(pfn))) return PT_PAGE_TABLE_LEVEL; - slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, true); - if (!slot) - return PT_PAGE_TABLE_LEVEL; - + /* + * Note, using the already-retrieved memslot and __gfn_to_hva_memslot() + * is not solely for performance, it's also necessary to avoid the + * "writable" check in __gfn_to_hva_many(), which will always fail on + * read-only memslots due to gfn_to_hva() assuming writes. Earlier + * page fault steps have already verified the guest isn't writing a + * read-only memslot. + */ hva = __gfn_to_hva_memslot(slot, gfn); pte = lookup_address_in_mm(vcpu->kvm->mm, hva, &level); @@ -3325,18 +3308,32 @@ static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn, static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, int max_level, kvm_pfn_t *pfnp) { + struct kvm_memory_slot *slot; kvm_pfn_t pfn = *pfnp; kvm_pfn_t mask; int level; - if (max_level == PT_PAGE_TABLE_LEVEL) + if (unlikely(max_level == PT_PAGE_TABLE_LEVEL)) return PT_PAGE_TABLE_LEVEL; if (is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn) || kvm_is_zone_device_pfn(pfn)) return PT_PAGE_TABLE_LEVEL; - level = host_pfn_mapping_level(vcpu, gfn, pfn); + slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, true); + if (!slot) + return PT_PAGE_TABLE_LEVEL; + + max_level = min(max_level, kvm_x86_ops->get_lpage_level()); + for ( ; max_level > PT_PAGE_TABLE_LEVEL; max_level--) { + if (!__mmu_gfn_lpage_is_disallowed(gfn, max_level, slot)) + break; + } + + if (max_level == PT_PAGE_TABLE_LEVEL) + return PT_PAGE_TABLE_LEVEL; + + level = host_pfn_mapping_level(vcpu, gfn, pfn, slot); if (level == PT_PAGE_TABLE_LEVEL) return level; @@ -4177,8 +4174,6 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (lpage_disallowed) max_level = PT_PAGE_TABLE_LEVEL; - max_level = max_mapping_level(vcpu, gfn, max_level); - if (fast_page_fault(vcpu, gpa, error_code)) return RET_PF_RETRY; diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 885da924827c1..4e1ef04736634 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -832,8 +832,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, else max_level = walker.level; - max_level = max_mapping_level(vcpu, walker.gfn, max_level); - mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); From 2c0629f4b95cf5adf5b6f78f7d318df894b5f9a1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Jan 2020 12:24:47 -0800 Subject: [PATCH 188/204] KVM: x86/mmu: Remove lpage_is_disallowed() check from set_spte() Remove the late "lpage is disallowed" check from set_spte() now that the initial check is performed after acquiring mmu_lock. Fold the guts of the remaining helper, __mmu_gfn_lpage_is_disallowed(), into kvm_mmu_hugepage_adjust() to eliminate the unnecessary slot !NULL check. 
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 39 +++------------------------------------ 1 file changed, 3 insertions(+), 36 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 812c69f7f552c..a9e6683c802b5 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1264,28 +1264,6 @@ static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) list_del(&sp->lpage_disallowed_link); } -static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level, - struct kvm_memory_slot *slot) -{ - struct kvm_lpage_info *linfo; - - if (slot) { - linfo = lpage_info_slot(gfn, slot, level); - return !!linfo->disallow_lpage; - } - - return true; -} - -static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn, - int level) -{ - struct kvm_memory_slot *slot; - - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - return __mmu_gfn_lpage_is_disallowed(gfn, level, slot); -} - static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot, bool no_dirty_log) { @@ -3078,18 +3056,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= (u64)pfn << PAGE_SHIFT; if (pte_access & ACC_WRITE_MASK) { - - /* - * Legacy code to handle an obsolete scenario where a different - * vcpu creates new sp in the window between this vcpu's query - * of lpage_is_disallowed() and acquiring mmu_lock. No longer - * necessary now that lpage_is_disallowed() is called after - * acquiring mmu_lock. - */ - if (level > PT_PAGE_TABLE_LEVEL && - mmu_gfn_lpage_is_disallowed(vcpu, gfn, level)) - goto done; - spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; /* @@ -3121,7 +3087,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, set_pte: if (mmu_spte_update(sptep, spte)) ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH; -done: return ret; } @@ -3309,6 +3274,7 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, int max_level, kvm_pfn_t *pfnp) { struct kvm_memory_slot *slot; + struct kvm_lpage_info *linfo; kvm_pfn_t pfn = *pfnp; kvm_pfn_t mask; int level; @@ -3326,7 +3292,8 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, max_level = min(max_level, kvm_x86_ops->get_lpage_level()); for ( ; max_level > PT_PAGE_TABLE_LEVEL; max_level--) { - if (!__mmu_gfn_lpage_is_disallowed(gfn, max_level, slot)) + linfo = lpage_info_slot(gfn, slot, max_level); + if (!linfo->disallow_lpage) break; } From e851265a816f96a86c5a0316d2fc4d45be76d1d4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Jan 2020 12:24:48 -0800 Subject: [PATCH 189/204] KVM: x86/mmu: Use huge pages for DAX-backed files Walk the host page tables to identify hugepage mappings for ZONE_DEVICE pfns, i.e. DAX pages. Explicitly query kvm_is_zone_device_pfn() when deciding whether or not to bother walking the host page tables, as DAX pages do not set up the head/tail infrastructure, i.e. will return false for PageCompound() even when using huge pages. Zap ZONE_DEVICE sptes when disabling dirty logging, e.g. if live migration fails, to allow KVM to rebuild large pages for DAX-based mappings. Presumably DAX favors large pages, and worst case scenario is a minor performance hit as KVM will need to re-fault all DAX-based pages. 
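A hedged sketch of the adjusted gate (helper name hypothetical): ZONE_DEVICE pfns lack compound head/tail metadata, so they must be allowed through to the host page table walk instead of being filtered out early:

static bool might_be_huge_backed_sketch(kvm_pfn_t pfn)
{
        /* DAX pages report PageCompound() == false even when mapped
         * huge, so zone-device pfns cannot be short-circuited here. */
        return PageCompound(pfn_to_page(pfn)) || kvm_is_zone_device_pfn(pfn);
}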
Suggested-by: Barret Rhoden Cc: David Hildenbrand Cc: Dan Williams Cc: Jason Zeng Cc: Dave Jiang Cc: Liran Alon Cc: linux-nvdimm Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index a9e6683c802b5..febd65a9721ad 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3250,7 +3250,7 @@ static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn, PT_DIRECTORY_LEVEL != (int)PG_LEVEL_2M || PT_PDPE_LEVEL != (int)PG_LEVEL_1G); - if (!PageCompound(pfn_to_page(pfn))) + if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn)) return PT_PAGE_TABLE_LEVEL; /* @@ -3282,8 +3282,7 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, if (unlikely(max_level == PT_PAGE_TABLE_LEVEL)) return PT_PAGE_TABLE_LEVEL; - if (is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn) || - kvm_is_zone_device_pfn(pfn)) + if (is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn)) return PT_PAGE_TABLE_LEVEL; slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, true); @@ -5912,8 +5911,8 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, * mapping if the indirect sp has level = 1. */ if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && - !kvm_is_zone_device_pfn(pfn) && - PageCompound(pfn_to_page(pfn))) { + (kvm_is_zone_device_pfn(pfn) || + PageCompound(pfn_to_page(pfn)))) { pte_list_remove(rmap_head, sptep); if (kvm_available_flush_tlb_with_range()) From 91b0d268a59dd9c18221ea750b80f9a317b29ed2 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 21 Jan 2020 16:16:32 +0100 Subject: [PATCH 190/204] KVM: x86: inline memslot_valid_for_gpte The function now has a single caller, so there is no point in keeping it separate. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index febd65a9721ad..84eeb61d06aa3 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1264,17 +1264,6 @@ static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) list_del(&sp->lpage_disallowed_link); } -static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot, - bool no_dirty_log) -{ - if (!slot || slot->flags & KVM_MEMSLOT_INVALID) - return false; - if (no_dirty_log && slot->dirty_bitmap) - return false; - - return true; -} - static struct kvm_memory_slot * gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) @@ -1282,8 +1271,10 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_memory_slot *slot; slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - if (!memslot_valid_for_gpte(slot, no_dirty_log)) - slot = NULL; + if (!slot || slot->flags & KVM_MEMSLOT_INVALID) + return NULL; + if (no_dirty_log && slot->dirty_bitmap) + return NULL; return slot; } From 52db369823b28616377b8ceb6b6b3879735b9e75 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 22 Jan 2020 11:21:44 +0800 Subject: [PATCH 191/204] KVM: X86: Add 'else' to unify fastop and execute call path Unifying the fastop and execute call paths with an 'else' also eliminates some duplicated code.
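For reference, a sketch of the shape of the dispatch once this patch and the following typedef cleanup are both applied; see the two diffs below for the literal hunks:

typedef void (*fastop_t)(struct fastop *);

        if (ctxt->execute) {
                if (ctxt->d & Fastop)
                        rc = fastop(ctxt, (fastop_t)ctxt->execute);
                else
                        rc = ctxt->execute(ctxt);
                if (rc != X86EMUL_CONTINUE)
                        goto done;
                goto writeback;
        }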
Signed-off-by: Miaohe Lin Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c7a0da45f60ad..0accce94f6608 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -5683,11 +5683,9 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) if (ctxt->d & Fastop) { void (*fop)(struct fastop *) = (void *)ctxt->execute; rc = fastop(ctxt, fop); - if (rc != X86EMUL_CONTINUE) - goto done; - goto writeback; + } else { + rc = ctxt->execute(ctxt); } - rc = ctxt->execute(ctxt); if (rc != X86EMUL_CONTINUE) goto done; goto writeback; From 3009afc6e39e78708d8fb444ae50544b3bcd3a3f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 21 Jan 2020 20:43:39 -0800 Subject: [PATCH 192/204] KVM: x86: Use a typedef for fastop functions Add a typedef for the fastop function prototype to make the code more readable. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Miaohe Lin Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 0accce94f6608..ddbc61984227c 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -311,7 +311,9 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt) #define ON64(x) #endif -static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); +typedef void (*fastop_t)(struct fastop *); + +static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop); #define __FOP_FUNC(name) \ ".align " __stringify(FASTOP_SIZE) " \n\t" \ @@ -5502,7 +5504,7 @@ static void fetch_possible_mmx_operand(struct operand *op) read_mmx_reg(&op->mm_val, op->addr.mm); } -static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)) +static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop) { ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF; @@ -5680,12 +5682,10 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) ctxt->eflags &= ~X86_EFLAGS_RF; if (ctxt->execute) { - if (ctxt->d & Fastop) { - void (*fop)(struct fastop *) = (void *)ctxt->execute; - rc = fastop(ctxt, fop); - } else { + if (ctxt->d & Fastop) + rc = fastop(ctxt, (fastop_t)ctxt->execute); + else rc = ctxt->execute(ctxt); - } if (rc != X86EMUL_CONTINUE) goto done; goto writeback; From 3837407c1aa1101ed5e214c7d6041e7a23335c6e Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Fri, 24 Jan 2020 15:25:32 +0100 Subject: [PATCH 193/204] KVM: arm64: pmu: Don't increment SW_INCR if PMCR.E is unset The specification says PMSWINC increments PMEVCNTR_EL1 by 1 if PMEVCNTR_EL0 is enabled and configured to count SW_INCR. For PMEVCNTR_EL0 to be enabled, both the corresponding PMCNTENSET bit and the PMCR.E bit must be set.
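A hedged sketch of the effective-enable condition the fix enforces (macro names as in the patch, helper name hypothetical):

static bool pmc_effectively_enabled_sketch(u64 pmcr, u64 pmcntenset, int i)
{
        /* The global enable bit gates every counter; the per-counter
         * PMCNTENSET bit alone is not sufficient. */
        return (pmcr & ARMV8_PMU_PMCR_E) && (pmcntenset & BIT(i));
}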
Fixes: 7a0adc7064b8 ("arm64: KVM: Add access handler for PMSWINC register") Signed-off-by: Eric Auger Signed-off-by: Marc Zyngier Reviewed-by: Andrew Murray Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20200124142535.29386-2-eric.auger@redhat.com --- virt/kvm/arm/pmu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c index 8731dfeced8b7..c3f8b059881e1 100644 --- a/virt/kvm/arm/pmu.c +++ b/virt/kvm/arm/pmu.c @@ -486,6 +486,9 @@ void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val) if (val == 0) return; + if (!(__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E)) + return; + enable = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); for (i = 0; i < ARMV8_PMU_CYCLE_IDX; i++) { if (!(val & BIT(i))) From 76c9fc56ddfdfeb0c9ff984d0d63b083e608fc92 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Fri, 24 Jan 2020 15:25:33 +0100 Subject: [PATCH 194/204] KVM: arm64: pmu: Don't mark a counter as chained if the odd one is disabled At the moment we update the chain bitmap on type setting. This does not take into account the enable state of the odd register. Let's make sure a counter is never considered as chained if the high counter is disabled. We recompute the chain state on enable/disable and type changes. Also let create_perf_event() use the chain bitmap and not use kvm_pmu_idx_has_chain_evtype(). Suggested-by: Marc Zyngier Signed-off-by: Eric Auger Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200124142535.29386-3-eric.auger@redhat.com --- virt/kvm/arm/pmu.c | 62 ++++++++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 29 deletions(-) diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c index c3f8b059881e1..9f605e0b8dd74 100644 --- a/virt/kvm/arm/pmu.c +++ b/virt/kvm/arm/pmu.c @@ -15,6 +15,8 @@ #include static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx); +static void kvm_pmu_update_pmc_chained(struct kvm_vcpu *vcpu, u64 select_idx); +static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc); #define PERF_ATTR_CFG1_KVM_PMU_CHAINED 0x1 @@ -75,6 +77,13 @@ static struct kvm_pmc *kvm_pmu_get_canonical_pmc(struct kvm_pmc *pmc) return pmc; } +static struct kvm_pmc *kvm_pmu_get_alternate_pmc(struct kvm_pmc *pmc) +{ + if (kvm_pmu_idx_is_high_counter(pmc->idx)) + return pmc - 1; + else + return pmc + 1; +} /** * kvm_pmu_idx_has_chain_evtype - determine if the event type is chain @@ -294,15 +303,9 @@ void kvm_pmu_enable_counter_mask(struct kvm_vcpu *vcpu, u64 val) pmc = &pmu->pmc[i]; - /* - * For high counters of chained events we must recreate the - * perf event with the long (64bit) attribute set. - */ - if (kvm_pmu_pmc_is_chained(pmc) && - kvm_pmu_idx_is_high_counter(i)) { - kvm_pmu_create_perf_event(vcpu, i); - continue; - } + /* A change in the enable state may affect the chain state */ + kvm_pmu_update_pmc_chained(vcpu, i); + kvm_pmu_create_perf_event(vcpu, i); /* At this point, pmc must be the canonical */ if (pmc->perf_event) { @@ -335,15 +338,9 @@ void kvm_pmu_disable_counter_mask(struct kvm_vcpu *vcpu, u64 val) pmc = &pmu->pmc[i]; - /* - * For high counters of chained events we must recreate the - * perf event with the long (64bit) attribute unset. 
- */ - if (kvm_pmu_pmc_is_chained(pmc) && - kvm_pmu_idx_is_high_counter(i)) { - kvm_pmu_create_perf_event(vcpu, i); - continue; - } + /* A change in the enable state may affect the chain state */ + kvm_pmu_update_pmc_chained(vcpu, i); + kvm_pmu_create_perf_event(vcpu, i); /* At this point, pmc must be the canonical */ if (pmc->perf_event)
@@ -585,15 +582,14 @@ static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx) counter = kvm_pmu_get_pair_counter_value(vcpu, pmc); - if (kvm_pmu_idx_has_chain_evtype(vcpu, pmc->idx)) { + if (kvm_pmu_pmc_is_chained(pmc)) { /** * The initial sample period (overflow count) of an event. For * chained counters we only support overflow interrupts on the * high counter. */ attr.sample_period = (-counter) & GENMASK(63, 0); - if (kvm_pmu_counter_is_enabled(vcpu, pmc->idx + 1)) - attr.config1 |= PERF_ATTR_CFG1_KVM_PMU_CHAINED; + attr.config1 |= PERF_ATTR_CFG1_KVM_PMU_CHAINED; event = perf_event_create_kernel_counter(&attr, -1, current, kvm_pmu_perf_overflow,
@@ -624,25 +620,33 @@ static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx) * @select_idx: The number of selected counter * * Update the chained bitmap based on the event type written in the - * typer register. + * typer register and the enable state of the odd register. */ static void kvm_pmu_update_pmc_chained(struct kvm_vcpu *vcpu, u64 select_idx) { struct kvm_pmu *pmu = &vcpu->arch.pmu; - struct kvm_pmc *pmc = &pmu->pmc[select_idx]; + struct kvm_pmc *pmc = &pmu->pmc[select_idx], *canonical_pmc; + bool new_state, old_state; + + old_state = kvm_pmu_pmc_is_chained(pmc); + new_state = kvm_pmu_idx_has_chain_evtype(vcpu, pmc->idx) && + kvm_pmu_counter_is_enabled(vcpu, pmc->idx | 0x1); - if (kvm_pmu_idx_has_chain_evtype(vcpu, pmc->idx)) { + if (old_state == new_state) + return; + + canonical_pmc = kvm_pmu_get_canonical_pmc(pmc); + kvm_pmu_stop_counter(vcpu, canonical_pmc); + if (new_state) { /* * During promotion from !chained to chained we must ensure * the adjacent counter is stopped and its event destroyed */ - if (!kvm_pmu_pmc_is_chained(pmc)) - kvm_pmu_stop_counter(vcpu, pmc); - + kvm_pmu_stop_counter(vcpu, kvm_pmu_get_alternate_pmc(pmc)); set_bit(pmc->idx >> 1, vcpu->arch.pmu.chained); - } else { - clear_bit(pmc->idx >> 1, vcpu->arch.pmu.chained); + return; } + clear_bit(pmc->idx >> 1, vcpu->arch.pmu.chained); } /**

From aa76829171e98bd75a0cc00b6248eca269ac7f4f Mon Sep 17 00:00:00 2001
From: Eric Auger
Date: Fri, 24 Jan 2020 15:25:34 +0100
Subject: [PATCH 195/204] KVM: arm64: pmu: Fix chained SW_INCR counters

At the moment a SW_INCR counter always overflows on the 32-bit
boundary, independently of whether the n+1th counter is programmed as
CHAIN. Check whether the SW_INCR counter is a 64-bit counter and, if
so, implement the 64-bit logic.
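For illustration (a stand-alone model, not the kernel implementation),
the intended 64-bit behaviour of a chained pair looks roughly like this:

  #include <stdint.h>
  #include <stdio.h>

  /* Model: two 32-bit halves form one 64-bit counter when chained;
   * the overflow flag is raised on the high half in that case. */
  struct pair {
          uint32_t lo, hi;
          int chained;
          uint32_t ovsset;  /* models PMOVSSET_EL0 */
  };

  static void sw_incr(struct pair *p, int lo_idx)
  {
          if (++p->lo)                        /* no carry out of the low half */
                  return;
          if (!p->chained) {
                  p->ovsset |= 1u << lo_idx;  /* 32-bit overflow */
                  return;
          }
          if (!++p->hi)                       /* carry propagated; 64-bit wrap? */
                  p->ovsset |= 1u << (lo_idx + 1);
  }

  int main(void)
  {
          struct pair p = { .lo = UINT32_MAX, .chained = 1 };

          sw_incr(&p, 0);
          /* lo=0 hi=1 ovsset=0: the carry went into the high half */
          printf("lo=%u hi=%u ovsset=%#x\n", p.lo, p.hi, p.ovsset);
          return 0;
  }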
Fixes: 80f393a23be6 ("KVM: arm/arm64: Support chained PMU counters")
Signed-off-by: Eric Auger
Signed-off-by: Marc Zyngier
Link: https://lore.kernel.org/r/20200124142535.29386-4-eric.auger@redhat.com
---
 virt/kvm/arm/pmu.c | 43 ++++++++++++++++++++++++++++++-------------
 1 file changed, 30 insertions(+), 13 deletions(-)

diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c
index 9f605e0b8dd74..560db62821374 100644
--- a/virt/kvm/arm/pmu.c
+++ b/virt/kvm/arm/pmu.c
@@ -477,28 +477,45 @@ static void kvm_pmu_perf_overflow(struct perf_event *perf_event, */ void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val) { + struct kvm_pmu *pmu = &vcpu->arch.pmu; int i; - u64 type, enable, reg; - - if (val == 0) - return; if (!(__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E)) return; - enable = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); + /* Weed out disabled counters */ + val &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); + for (i = 0; i < ARMV8_PMU_CYCLE_IDX; i++) { + u64 type, reg; + if (!(val & BIT(i))) continue; - type = __vcpu_sys_reg(vcpu, PMEVTYPER0_EL0 + i) - & ARMV8_PMU_EVTYPE_EVENT; - if ((type == ARMV8_PMUV3_PERFCTR_SW_INCR) - && (enable & BIT(i))) { - reg = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) + 1; + + /* PMSWINC only applies to ... SW_INC! */ + type = __vcpu_sys_reg(vcpu, PMEVTYPER0_EL0 + i); + type &= ARMV8_PMU_EVTYPE_EVENT; + if (type != ARMV8_PMUV3_PERFCTR_SW_INCR) + continue; + + /* increment this even SW_INC counter */ + reg = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) + 1; + reg = lower_32_bits(reg); + __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) = reg; + + if (reg) /* no overflow on the low part */ + continue; + + if (kvm_pmu_pmc_is_chained(&pmu->pmc[i])) { + /* increment the high counter */ + reg = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i + 1) + 1; reg = lower_32_bits(reg); - __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) = reg; - if (!reg) - __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i); + __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i + 1) = reg; + if (!reg) /* mark overflow on the high counter */ + __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i + 1); + } else { + /* mark overflow on low counter */ + __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i); } } }

From c01d6a18023b94fdd0cb7cf11bbfe769bf71653f Mon Sep 17 00:00:00 2001
From: Eric Auger
Date: Fri, 24 Jan 2020 15:25:35 +0100
Subject: [PATCH 196/204] KVM: arm64: pmu: Only handle supported event counters

Let the code never use unsupported event counters. Change
kvm_pmu_handle_pmcr() to only reset supported counters and
kvm_pmu_vcpu_reset() to only stop supported counters.
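As a plain-C illustration of the resulting pattern (not kernel code),
walking a validity mask instead of every architecturally possible index:

  #include <stdio.h>

  int main(void)
  {
          /* e.g. 5 event counters (bits 0-4) plus the cycle counter (bit 31) */
          unsigned long mask = ((1UL << 5) - 1) | (1UL << 31);
          int i;

          /* open-coded equivalent of the kernel's for_each_set_bit(i, &mask, 32) */
          for (i = 0; i < 32; i++) {
                  if (!(mask & (1UL << i)))
                          continue;
                  printf("reset counter %d\n", i);
          }
          return 0;
  }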
Other actions are filtered on the supported counters in kvm/sysregs.c.

Signed-off-by: Eric Auger
Signed-off-by: Marc Zyngier
Link: https://lore.kernel.org/r/20200124142535.29386-5-eric.auger@redhat.com
---
 virt/kvm/arm/pmu.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c
index 560db62821374..f0d0312c0a552 100644
--- a/virt/kvm/arm/pmu.c
+++ b/virt/kvm/arm/pmu.c
@@ -247,10 +247,11 @@ void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu) */ void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu) { - int i; + unsigned long mask = kvm_pmu_valid_counter_mask(vcpu); struct kvm_pmu *pmu = &vcpu->arch.pmu; + int i; - for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) + for_each_set_bit(i, &mask, 32) kvm_pmu_stop_counter(vcpu, &pmu->pmc[i]); bitmap_zero(vcpu->arch.pmu.chained, ARMV8_PMU_MAX_COUNTER_PAIRS);
@@ -527,10 +528,9 @@ void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val) */ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) { - u64 mask; + unsigned long mask = kvm_pmu_valid_counter_mask(vcpu); int i; - mask = kvm_pmu_valid_counter_mask(vcpu); if (val & ARMV8_PMU_PMCR_E) { kvm_pmu_enable_counter_mask(vcpu, __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & mask);
@@ -542,7 +542,7 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) kvm_pmu_set_counter_value(vcpu, ARMV8_PMU_CYCLE_IDX, 0); if (val & ARMV8_PMU_PMCR_P) { - for (i = 0; i < ARMV8_PMU_CYCLE_IDX; i++) + for_each_set_bit(i, &mask, 32) kvm_pmu_set_counter_value(vcpu, i, 0); } }

From 4a267aa707953a9a73d1f5dc7f894dd9024a92be Mon Sep 17 00:00:00 2001
From: Alexandru Elisei
Date: Mon, 27 Jan 2020 10:36:52 +0000
Subject: [PATCH 197/204] KVM: arm64: Treat emulated TVAL TimerValue as a signed 32-bit integer

According to the ARM ARM, registers CNT{P,V}_TVAL_EL0 have bits [63:32]
RES0 [1]. When reading the register, the value is truncated to the
least significant 32 bits [2], and on writes, TimerValue is treated as
a signed 32-bit integer [1, 2].

When the guest behaves correctly and writes 32-bit values, treating
TVAL as an unsigned 64-bit register works as expected. However, things
start to break down when the guest writes larger values, because
(u64)0x1_ffff_ffff = 8589934591, but (s32)0x1_ffff_ffff = -1, and the
former will cause the timer interrupt to be asserted in the future, but
the latter will cause it to be asserted now.

Let's treat TVAL as a signed 32-bit register on writes, to match the
behaviour described in the architecture, and the behaviour
experimentally exhibited by the virtual timer on a non-vhe host.
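The arithmetic is easy to verify in user space (illustrative snippet,
not part of the patch):

  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
          uint64_t tval = 0x1ffffffffULL;  /* a guest writes 0x1_ffff_ffff */

          /* Interpreted as unsigned 64-bit: fires far in the future. */
          printf("%llu\n", (unsigned long long)tval);  /* 8589934591 */

          /* Truncated to 32 bits and sign-extended, as architected:
           * -1 on two's-complement targets, i.e. the timer fires now. */
          printf("%lld\n", (long long)(int32_t)tval);
          return 0;
  }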
[1] Arm DDI 0487E.a, section D13.8.18
[2] Arm DDI 0487E.a, section D11.2.4

Signed-off-by: Alexandru Elisei
[maz: replaced the read-side mask with lower_32_bits]
Signed-off-by: Marc Zyngier
Fixes: 8fa761624871 ("KVM: arm/arm64: arch_timer: Fix CNTP_TVAL calculation")
Link: https://lore.kernel.org/r/20200127103652.2326-1-alexandru.elisei@arm.com
---
 virt/kvm/arm/arch_timer.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index f182b23803457..c6c2a9dde00cd 100644
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -805,6 +805,7 @@ static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu, switch (treg) { case TIMER_REG_TVAL: val = timer->cnt_cval - kvm_phys_timer_read() + timer->cntvoff; + val &= lower_32_bits(val); break; case TIMER_REG_CTL:
@@ -850,7 +851,7 @@ static void kvm_arm_timer_write(struct kvm_vcpu *vcpu, { switch (treg) { case TIMER_REG_TVAL: - timer->cnt_cval = kvm_phys_timer_read() - timer->cntvoff + val; + timer->cnt_cval = kvm_phys_timer_read() - timer->cntvoff + (s32)val; break; case TIMER_REG_CTL:

From e032e3b55b6f487e48c163c5dca74086f147a169 Mon Sep 17 00:00:00 2001
From: Bharata B Rao
Date: Wed, 22 Jan 2020 10:25:42 +0530
Subject: [PATCH 198/204] KVM: PPC: Book3S HV: Release lock on page-out failure path

When migrate_vma_setup() fails in kvmppc_svm_page_out(), release
kvm->arch.uvmem_lock before returning.

Fixes: ca9f4942670 ("KVM: PPC: Book3S HV: Support for running secure guests")
Signed-off-by: Bharata B Rao
Reviewed-by: Kamalesh Babulal
Signed-off-by: Paul Mackerras
---
 arch/powerpc/kvm/book3s_hv_uvmem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index 4d1f25a3959a5..79b1202b1c628 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -571,7 +571,7 @@ kvmppc_svm_page_out(struct vm_area_struct *vma, unsigned long start, ret = migrate_vma_setup(&mig); if (ret) - return ret; + goto out; spage = migrate_pfn_to_page(*mig.src); if (!spage || !(*mig.src & MIGRATE_PFN_MIGRATE))

From fd24a8624eb29d3b6b7df68096ce0321b19b03c6 Mon Sep 17 00:00:00 2001
From: David Michael
Date: Sun, 26 Jan 2020 17:31:58 -0500
Subject: [PATCH 199/204] KVM: PPC: Book3S PR: Fix -Werror=return-type build failure

Fixes: 3a167beac07c ("kvm: powerpc: Add kvmppc_ops callback")
Signed-off-by: David Michael
Signed-off-by: Paul Mackerras
---
 arch/powerpc/kvm/book3s_pr.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index ce4fcf76e53e9..eb86a2f26986f 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -2030,6 +2030,7 @@ static int kvm_vm_ioctl_get_smmu_info_pr(struct kvm *kvm, { /* We should not get called */ BUG(); + return 0; } #endif /* CONFIG_PPC64 */

From 8c6de56a42e0c657955e12b882a81ef07d1d073e Mon Sep 17 00:00:00 2001
From: Boris Ostrovsky
Date: Wed, 30 Oct 2019 19:01:31 +0000
Subject: [PATCH 200/204] x86/kvm: Be careful not to clear KVM_VCPU_FLUSH_TLB bit

kvm_steal_time_set_preempted() may accidentally clear the
KVM_VCPU_FLUSH_TLB bit if it is called more than once while the VCPU is
preempted.

This is part of CVE-2019-3016.
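A stand-alone sketch of the lost update (illustrative only; the flag
values match the KVM steal-time ABI, everything else is invented):

  #include <stdint.h>
  #include <stdio.h>

  #define KVM_VCPU_PREEMPTED (1 << 0)
  #define KVM_VCPU_FLUSH_TLB (1 << 1)

  int main(void)
  {
          uint8_t guest = 0;  /* st->preempted as the guest sees it */
          uint8_t host = 0;   /* host-side copy: st.steal.preempted */

          host = KVM_VCPU_PREEMPTED;
          guest = host;                  /* first preemption: flag copied out */

          guest |= KVM_VCPU_FLUSH_TLB;   /* guest asks for a TLB flush */

          guest = host;                  /* second call blindly copies again */

          printf("flush request lost: %s\n",
                 guest & KVM_VCPU_FLUSH_TLB ? "no" : "yes");  /* yes */
          return 0;
  }

The fix below bails out early when steal.preempted is already set, so
the second, clobbering copy never happens.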
(This bug was also independently discovered by Jim Mattson.)

Signed-off-by: Boris Ostrovsky
Reviewed-by: Joao Martins
Cc: stable@vger.kernel.org
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/x86.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cf917139de6ba..8c9369151e9f3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3504,6 +3504,9 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; + if (vcpu->arch.st.steal.preempted) + return; + vcpu->arch.st.steal.preempted = KVM_VCPU_PREEMPTED; kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime,

From 1eff70a9abd46f175defafd29bc17ad456f398a7 Mon Sep 17 00:00:00 2001
From: Boris Ostrovsky
Date: Tue, 12 Nov 2019 16:35:06 +0000
Subject: [PATCH 201/204] x86/kvm: Introduce kvm_(un)map_gfn()

kvm_vcpu_(un)map operates on gfns from any current address space. In
certain cases we want to make sure we are not mapping SMRAM, and for
that we can use the kvm_(un)map_gfn() helpers that this patch
introduces.

This is part of CVE-2019-3016.

Signed-off-by: Boris Ostrovsky
Reviewed-by: Joao Martins
Cc: stable@vger.kernel.org
Signed-off-by: Paolo Bonzini
---
 include/linux/kvm_host.h | 2 ++
 virt/kvm/kvm_main.c | 29 ++++++++++++++++++++++++-----
 2 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 538c25e778c07..0cb78f55b92c9 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -775,8 +775,10 @@ struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn); kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn); int kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map); +int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map); struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn); void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty); +int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty); unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn); unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable); int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset,

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 00268290dcbd8..9ef58a233a7c3 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1821,12 +1821,13 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) } EXPORT_SYMBOL_GPL(gfn_to_page); -static int __kvm_map_gfn(struct kvm_memory_slot *slot, gfn_t gfn, +static int __kvm_map_gfn(struct kvm_memslots *slots, gfn_t gfn, struct kvm_host_map *map) { kvm_pfn_t pfn; void *hva = NULL; struct page *page = KVM_UNMAPPED_PAGE; + struct kvm_memory_slot *slot = __gfn_to_memslot(slots, gfn); if (!map) return -EINVAL;
@@ -1855,14 +1856,20 @@ static int __kvm_map_gfn(struct kvm_memory_slot *slot, gfn_t gfn, return 0; } +int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map) +{ + return __kvm_map_gfn(kvm_memslots(vcpu->kvm), gfn, map); +} +EXPORT_SYMBOL_GPL(kvm_map_gfn); + int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map) { - return __kvm_map_gfn(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, map); + return __kvm_map_gfn(kvm_vcpu_memslots(vcpu), gfn, map); } EXPORT_SYMBOL_GPL(kvm_vcpu_map); -void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, - bool dirty) +static void __kvm_unmap_gfn(struct kvm_memory_slot *memslot, + struct kvm_host_map *map, bool dirty) { if (!map) return;
@@ -1878,7 +1885,7 @@ void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, #endif if (dirty) { - kvm_vcpu_mark_page_dirty(vcpu, map->gfn); + mark_page_dirty_in_slot(memslot, map->gfn); kvm_release_pfn_dirty(map->pfn); } else { kvm_release_pfn_clean(map->pfn); }
@@ -1887,6 +1894,18 @@ void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, map->hva = NULL; map->page = NULL; } + +int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty) +{ + __kvm_unmap_gfn(gfn_to_memslot(vcpu->kvm, map->gfn), map, dirty); + return 0; +} +EXPORT_SYMBOL_GPL(kvm_unmap_gfn); + +void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty) +{ + __kvm_unmap_gfn(kvm_vcpu_gfn_to_memslot(vcpu, map->gfn), map, dirty); +} EXPORT_SYMBOL_GPL(kvm_vcpu_unmap); struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn)

From 917248144db5d7320655dbb41d3af0b8a0f3d589 Mon Sep 17 00:00:00 2001
From: Boris Ostrovsky
Date: Thu, 5 Dec 2019 01:30:51 +0000
Subject: [PATCH 202/204] x86/kvm: Cache gfn to pfn translation

__kvm_map_gfn()'s call to gfn_to_pfn_memslot() is:
* relatively expensive
* in certain cases (such as when done from atomic context) cannot be called

Stashing the gfn-to-pfn mapping should help with both cases.

This is part of CVE-2019-3016.

Signed-off-by: Boris Ostrovsky
Reviewed-by: Joao Martins
Cc: stable@vger.kernel.org
Signed-off-by: Paolo Bonzini
---
 arch/x86/include/asm/kvm_host.h | 1 +
 arch/x86/kvm/x86.c | 10 ++++
 include/linux/kvm_host.h | 7 ++-
 include/linux/kvm_types.h | 9 ++-
 virt/kvm/kvm_main.c | 98 ++++++++++++++++++++++++++-------
 5 files changed, 103 insertions(+), 22 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b79cd6aa40756..f48a306e1d665 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -689,6 +689,7 @@ struct kvm_vcpu_arch { u64 last_steal; struct gfn_to_hva_cache stime; struct kvm_steal_time steal; + struct gfn_to_pfn_cache cache; } st; u64 tsc_offset;

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8c9369151e9f3..0795bc876abcc 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9088,6 +9088,9 @@ static void fx_init(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) { void *wbinvd_dirty_mask = vcpu->arch.wbinvd_dirty_mask; + struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache; + + kvm_release_pfn(cache->pfn, cache->dirty, cache); kvmclock_reset(vcpu);
@@ -9761,11 +9764,18 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) { + struct kvm_vcpu *vcpu; + int i; + /* * memslots->generation has been incremented. * mmio generation may have reached its maximum value.
*/ kvm_mmu_invalidate_mmio_sptes(kvm, gen); + + /* Force re-initialization of steal_time cache */ + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_vcpu_kick(vcpu); } int kvm_arch_prepare_memory_region(struct kvm *kvm,

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 0cb78f55b92c9..71cb9cc105f0a 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -723,6 +723,7 @@ void kvm_set_pfn_dirty(kvm_pfn_t pfn); void kvm_set_pfn_accessed(kvm_pfn_t pfn); void kvm_get_pfn(kvm_pfn_t pfn); +void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache); int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, int len); int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
@@ -775,10 +776,12 @@ struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn); kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn); int kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map); -int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map); +int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map, + struct gfn_to_pfn_cache *cache, bool atomic); struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn); void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty); -int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty); +int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map, + struct gfn_to_pfn_cache *cache, bool dirty, bool atomic); unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn); unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable); int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset,

diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index 1c88e69db3d9d..68e84cf42a3f9 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -18,7 +18,7 @@ struct kvm_memslots; enum kvm_mr_change; -#include <asm/types.h> +#include <linux/types.h> /* * Address types:
@@ -51,4 +51,11 @@ struct gfn_to_hva_cache { struct kvm_memory_slot *memslot; }; +struct gfn_to_pfn_cache { + u64 generation; + gfn_t gfn; + kvm_pfn_t pfn; + bool dirty; +}; + #endif /* __KVM_TYPES_H__ */

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 9ef58a233a7c3..67eb302a7240a 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1821,27 +1821,72 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) } EXPORT_SYMBOL_GPL(gfn_to_page); +void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache) +{ + if (pfn == 0) + return; + + if (cache) + cache->pfn = cache->gfn = 0; + + if (dirty) + kvm_release_pfn_dirty(pfn); + else + kvm_release_pfn_clean(pfn); +} + +static void kvm_cache_gfn_to_pfn(struct kvm_memory_slot *slot, gfn_t gfn, + struct gfn_to_pfn_cache *cache, u64 gen) +{ + kvm_release_pfn(cache->pfn, cache->dirty, cache); + + cache->pfn = gfn_to_pfn_memslot(slot, gfn); + cache->gfn = gfn; + cache->dirty = false; + cache->generation = gen; +} + static int __kvm_map_gfn(struct kvm_memslots *slots, gfn_t gfn, - struct kvm_host_map *map) + struct kvm_host_map *map, + struct gfn_to_pfn_cache *cache, + bool atomic) { kvm_pfn_t pfn; void *hva = NULL; struct page *page = KVM_UNMAPPED_PAGE; struct kvm_memory_slot *slot = __gfn_to_memslot(slots, gfn); + u64 gen = slots->generation; if (!map) return -EINVAL; - pfn = gfn_to_pfn_memslot(slot, gfn); + if (cache) { + if (!cache->pfn || cache->gfn != gfn || + cache->generation != gen) { + if (atomic) + return -EAGAIN; + kvm_cache_gfn_to_pfn(slot, gfn, cache, gen); + } + pfn = cache->pfn; + } else { + if (atomic) + return -EAGAIN; + pfn = gfn_to_pfn_memslot(slot, gfn); + } if (is_error_noslot_pfn(pfn)) return -EINVAL; if (pfn_valid(pfn)) { page = pfn_to_page(pfn); - hva = kmap(page); + if (atomic) + hva = kmap_atomic(page); + else + hva = kmap(page); #ifdef CONFIG_HAS_IOMEM - } else { + } else if (!atomic) { hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB); + } else { + return -EINVAL; #endif }
@@ -1856,20 +1901,25 @@ static int __kvm_map_gfn(struct kvm_memslots *slots, gfn_t gfn, return 0; } -int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map) +int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map, + struct gfn_to_pfn_cache *cache, bool atomic) { - return __kvm_map_gfn(kvm_memslots(vcpu->kvm), gfn, map); + return __kvm_map_gfn(kvm_memslots(vcpu->kvm), gfn, map, + cache, atomic); } EXPORT_SYMBOL_GPL(kvm_map_gfn); int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map) { - return __kvm_map_gfn(kvm_vcpu_memslots(vcpu), gfn, map); + return __kvm_map_gfn(kvm_vcpu_memslots(vcpu), gfn, map, + NULL, false); } EXPORT_SYMBOL_GPL(kvm_vcpu_map); static void __kvm_unmap_gfn(struct kvm_memory_slot *memslot, - struct kvm_host_map *map, bool dirty) + struct kvm_host_map *map, + struct gfn_to_pfn_cache *cache, + bool dirty, bool atomic) { if (!map) return;
@@ -1877,34 +1927,44 @@ static void __kvm_unmap_gfn(struct kvm_memory_slot *memslot, if (!map->hva) return; - if (map->page != KVM_UNMAPPED_PAGE) - kunmap(map->page); + if (map->page != KVM_UNMAPPED_PAGE) { + if (atomic) + kunmap_atomic(map->hva); + else + kunmap(map->page); + } #ifdef CONFIG_HAS_IOMEM - else + else if (!atomic) memunmap(map->hva); + else + WARN_ONCE(1, "Unexpected unmapping in atomic context"); #endif - if (dirty) { + if (dirty) mark_page_dirty_in_slot(memslot, map->gfn); - kvm_release_pfn_dirty(map->pfn); - } else { - kvm_release_pfn_clean(map->pfn); - } + + if (cache) + cache->dirty |= dirty; + else + kvm_release_pfn(map->pfn, dirty, NULL); map->hva = NULL; map->page = NULL; } -int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty) +int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map, + struct gfn_to_pfn_cache *cache, bool dirty, bool atomic) { - __kvm_unmap_gfn(gfn_to_memslot(vcpu->kvm, map->gfn), map, dirty); + __kvm_unmap_gfn(gfn_to_memslot(vcpu->kvm, map->gfn), map, + cache, dirty, atomic); return 0; } EXPORT_SYMBOL_GPL(kvm_unmap_gfn); void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty) { - __kvm_unmap_gfn(kvm_vcpu_gfn_to_memslot(vcpu, map->gfn), map, dirty); + __kvm_unmap_gfn(kvm_vcpu_gfn_to_memslot(vcpu, map->gfn), map, NULL, + dirty, false); } EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);

From b043138246a41064527cf019a3d51d9f015e9796 Mon Sep 17 00:00:00 2001
From: Boris Ostrovsky
Date: Thu, 5 Dec 2019 03:45:32 +0000
Subject: [PATCH 203/204] x86/KVM: Make sure KVM_VCPU_FLUSH_TLB flag is not missed

There is a potential race in record_steal_time() between setting
host-local vcpu->arch.st.steal.preempted to zero (i.e. clearing
KVM_VCPU_PREEMPTED) and propagating this value to the guest with
kvm_write_guest_cached(). Between those two events the guest may still
see KVM_VCPU_PREEMPTED in its copy of kvm_steal_time, set
KVM_VCPU_FLUSH_TLB and assume that the hypervisor will do the right
thing. Which it won't.
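A compact user-space analogue of the fixed read-and-clear step
(illustrative only; C11 atomics stand in for the kernel's xchg()):

  #include <stdatomic.h>
  #include <stdio.h>

  #define KVM_VCPU_PREEMPTED (1 << 0)
  #define KVM_VCPU_FLUSH_TLB (1 << 1)

  int main(void)
  {
          /* st->preempted, accessed in place in (mapped) guest memory */
          _Atomic unsigned char preempted = KVM_VCPU_PREEMPTED | KVM_VCPU_FLUSH_TLB;

          /* Read and clear in one indivisible step: a flag the guest sets
           * concurrently can no longer fall between a read and a write. */
          unsigned char old = atomic_exchange(&preempted, 0);

          if (old & KVM_VCPU_FLUSH_TLB)
                  printf("flush the TLB on the guest's behalf\n");
          return 0;
  }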
Instead of copying, we should map kvm_steal_time and that will
guarantee atomicity of accesses to @preempted.

This is part of CVE-2019-3016.

Signed-off-by: Boris Ostrovsky
Reviewed-by: Joao Martins
Cc: stable@vger.kernel.org
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/x86.c | 51 +++++++++++++++++++++++++++-------------------
 1 file changed, 30 insertions(+), 21 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0795bc876abcc..f1845df7e7c32 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2581,45 +2581,47 @@ static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) static void record_steal_time(struct kvm_vcpu *vcpu) { + struct kvm_host_map map; + struct kvm_steal_time *st; + if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; - if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, - &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) + /* -EAGAIN is returned in atomic context so we can just return. */ + if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, + &map, &vcpu->arch.st.cache, false)) return; + st = map.hva + + offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS); + /* * Doing a TLB flush here, on the guest's behalf, can avoid * expensive IPIs. */ trace_kvm_pv_tlb_flush(vcpu->vcpu_id, - vcpu->arch.st.steal.preempted & KVM_VCPU_FLUSH_TLB); - if (xchg(&vcpu->arch.st.steal.preempted, 0) & KVM_VCPU_FLUSH_TLB) + st->preempted & KVM_VCPU_FLUSH_TLB); + if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB) kvm_vcpu_flush_tlb(vcpu, false); - if (vcpu->arch.st.steal.version & 1) - vcpu->arch.st.steal.version += 1; /* first time write, random junk */ + vcpu->arch.st.steal.preempted = 0; - vcpu->arch.st.steal.version += 1; + if (st->version & 1) + st->version += 1; /* first time write, random junk */ - kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, - &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); + st->version += 1; smp_wmb(); - vcpu->arch.st.steal.steal += current->sched_info.run_delay - + st->steal += current->sched_info.run_delay - vcpu->arch.st.last_steal; vcpu->arch.st.last_steal = current->sched_info.run_delay; - kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, - &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); - smp_wmb(); - vcpu->arch.st.steal.version += 1; + st->version += 1; - kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, - &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); + kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false); } int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
@@ -3501,18 +3503,25 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) { + struct kvm_host_map map; + struct kvm_steal_time *st; + if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; if (vcpu->arch.st.steal.preempted) return; - vcpu->arch.st.steal.preempted = KVM_VCPU_PREEMPTED; + if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map, + &vcpu->arch.st.cache, true)) + return; + + st = map.hva + + offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS); + + st->preempted = vcpu->arch.st.steal.preempted = KVM_VCPU_PREEMPTED; - kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime, - &vcpu->arch.st.steal.preempted, - offsetof(struct kvm_steal_time, preempted), - sizeof(vcpu->arch.st.steal.preempted)); + kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)

From a6bd811f1209fe1c64c9f6fd578101d6436c6b6e Mon Sep 17 00:00:00 2001
From: Boris Ostrovsky
Date: Fri, 6 Dec 2019 15:36:12 +0000
Subject: [PATCH 204/204] x86/KVM: Clean up host's steal time structure

Now that we are mapping kvm_steal_time from the guest directly we don't
need to keep a copy of it in kvm_vcpu_arch.st. The same is true for the
stime field.

This is part of CVE-2019-3016.

Signed-off-by: Boris Ostrovsky
Reviewed-by: Joao Martins
Cc: stable@vger.kernel.org
Signed-off-by: Paolo Bonzini
---
 arch/x86/include/asm/kvm_host.h | 3 +--
 arch/x86/kvm/x86.c | 11 +++--------
 2 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f48a306e1d665..4925bdbfb5160 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -685,10 +685,9 @@ struct kvm_vcpu_arch { bool pvclock_set_guest_stopped_request; struct { + u8 preempted; u64 msr_val; u64 last_steal; - struct gfn_to_hva_cache stime; - struct kvm_steal_time steal; struct gfn_to_pfn_cache cache; } st;

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f1845df7e7c32..a0381ec905cee 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2604,7 +2604,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu) if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB) kvm_vcpu_flush_tlb(vcpu, false); - vcpu->arch.st.steal.preempted = 0; + vcpu->arch.st.preempted = 0; if (st->version & 1) st->version += 1; /* first time write, random junk */
@@ -2788,11 +2788,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (data & KVM_STEAL_RESERVED_MASK) return 1; - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime, - data & KVM_STEAL_VALID_BITS, - sizeof(struct kvm_steal_time))) - return 1; - vcpu->arch.st.msr_val = data; if (!(data & KVM_MSR_ENABLED))
@@ -3509,7 +3504,7 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; - if (vcpu->arch.st.steal.preempted) + if (vcpu->arch.st.preempted) return; if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map,
@@ -3519,7 +3514,7 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) st = map.hva + offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS); - st->preempted = vcpu->arch.st.steal.preempted = KVM_VCPU_PREEMPTED; + st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED; kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true); }