From 1a9167a214f560a23c5050ce6dfebae489528f0d Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Wed, 19 Jun 2019 13:01:27 -0300 Subject: [PATCH 01/13] KVM: PPC: Report single stepping capability When calling the KVM_SET_GUEST_DEBUG ioctl, userspace might request the next instruction to be single stepped via the KVM_GUESTDBG_SINGLESTEP control bit of the kvm_guest_debug structure. This patch adds the KVM_CAP_PPC_GUEST_DEBUG_SSTEP capability in order to inform userspace about the state of single stepping support. We currently don't have support for guest single stepping implemented in Book3S HV so the capability is only present for Book3S PR and BookE. Signed-off-by: Fabiano Rosas Signed-off-by: Paul Mackerras --- Documentation/virt/kvm/api.txt | 3 +++ arch/powerpc/kvm/powerpc.c | 2 ++ include/uapi/linux/kvm.h | 1 + 3 files changed, 6 insertions(+) diff --git a/Documentation/virt/kvm/api.txt b/Documentation/virt/kvm/api.txt index 4833904d32a5a..f94d06a12d208 100644 --- a/Documentation/virt/kvm/api.txt +++ b/Documentation/virt/kvm/api.txt @@ -2982,6 +2982,9 @@ can be determined by querying the KVM_CAP_GUEST_DEBUG_HW_BPS and KVM_CAP_GUEST_DEBUG_HW_WPS capabilities which return a positive number indicating the number of supported registers. +For ppc, the KVM_CAP_PPC_GUEST_DEBUG_SSTEP capability indicates whether +the single-step debug event (KVM_GUESTDBG_SINGLESTEP) is supported. + When debug events exit the main run loop with the reason KVM_EXIT_DEBUG with the kvm_debug_exit_arch part of the kvm_run structure containing architecture specific debug information. diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 3a77bb6434521..9e085e931d749 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -522,6 +522,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_IMMEDIATE_EXIT: r = 1; break; + case KVM_CAP_PPC_GUEST_DEBUG_SSTEP: + /* fall through */ case KVM_CAP_PPC_PAIRED_SINGLES: case KVM_CAP_PPC_OSI: case KVM_CAP_PPC_GET_PVINFO: diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 52641d8ca9e83..ce8cfcc51aecc 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1000,6 +1000,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_PMU_EVENT_FILTER 173 #define KVM_CAP_ARM_IRQ_LINE_LAYOUT_2 174 #define KVM_CAP_HYPERV_DIRECT_TLBFLUSH 175 +#define KVM_CAP_PPC_GUEST_DEBUG_SSTEP 176 #ifdef KVM_CAP_IRQ_ROUTING From 258ed7d02843052d127df2264c8b342276ced18a Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Mon, 23 Sep 2019 18:30:23 -0300 Subject: [PATCH 02/13] KVM: PPC: Reduce calls to get current->mm by storing the value locally Reduces the number of calls to get_current() in order to get the value of current->mm by doing it once and storing the value, since it is not supposed to change inside the same process). Signed-off-by: Leonardo Bras Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 9a75f0e1933b5..f2b9aea43216e 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -508,6 +508,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, struct vm_area_struct *vma; unsigned long rcbits; long mmio_update; + struct mm_struct *mm; if (kvm_is_radix(kvm)) return kvmppc_book3s_radix_page_fault(run, vcpu, ea, dsisr); @@ -584,6 +585,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, is_ci = false; pfn = 0; page = NULL; + mm = current->mm; pte_size = PAGE_SIZE; writing = (dsisr & DSISR_ISSTORE) != 0; /* If writing != 0, then the HPTE must allow writing, if we get here */ @@ -592,8 +594,8 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, npages = get_user_pages_fast(hva, 1, writing ? FOLL_WRITE : 0, pages); if (npages < 1) { /* Check if it's an I/O mapping */ - down_read(¤t->mm->mmap_sem); - vma = find_vma(current->mm, hva); + down_read(&mm->mmap_sem); + vma = find_vma(mm, hva); if (vma && vma->vm_start <= hva && hva + psize <= vma->vm_end && (vma->vm_flags & VM_PFNMAP)) { pfn = vma->vm_pgoff + @@ -602,7 +604,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, is_ci = pte_ci(__pte((pgprot_val(vma->vm_page_prot)))); write_ok = vma->vm_flags & VM_WRITE; } - up_read(¤t->mm->mmap_sem); + up_read(&mm->mmap_sem); if (!pfn) goto out_put; } else { @@ -621,8 +623,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, * hugepage split and collapse. */ local_irq_save(flags); - ptep = find_current_mm_pte(current->mm->pgd, - hva, NULL, NULL); + ptep = find_current_mm_pte(mm->pgd, hva, NULL, NULL); if (ptep) { pte = kvmppc_read_update_linux_pte(ptep, 1); if (__pte_write(pte)) From f41c4989c8de1fa70aafe950abaf80c56a8b8712 Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Mon, 23 Sep 2019 18:24:08 -0300 Subject: [PATCH 03/13] KVM: PPC: E500: Replace current->mm by kvm->mm Given that in kvm_create_vm() there is: kvm->mm = current->mm; And that on every kvm_*_ioctl we have: if (kvm->mm != current->mm) return -EIO; I see no reason to keep using current->mm instead of kvm->mm. By doing so, we would reduce the use of 'global' variables on code, relying more in the contents of kvm struct. Signed-off-by: Leonardo Bras Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/e500_mmu_host.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index 321db0fdb9db8..425d138066457 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -355,9 +355,9 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, if (tlbsel == 1) { struct vm_area_struct *vma; - down_read(¤t->mm->mmap_sem); + down_read(&kvm->mm->mmap_sem); - vma = find_vma(current->mm, hva); + vma = find_vma(kvm->mm, hva); if (vma && hva >= vma->vm_start && (vma->vm_flags & VM_PFNMAP)) { /* @@ -441,7 +441,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1); } - up_read(¤t->mm->mmap_sem); + up_read(&kvm->mm->mmap_sem); } if (likely(!pfnmap)) { From e7d71c943040c23f2fd042033d319f56e84f845b Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Fri, 27 Sep 2019 13:53:38 +0200 Subject: [PATCH 04/13] KVM: PPC: Book3S HV: XIVE: Set kvm->arch.xive when VPs are allocated MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If we cannot allocate the XIVE VPs in OPAL, the creation of a XIVE or XICS-on-XIVE device is aborted as expected, but we leave kvm->arch.xive set forever since the release method isn't called in this case. Any subsequent tentative to create a XIVE or XICS-on-XIVE for this VM will thus always fail (DoS). This is a problem for QEMU since it destroys and re-creates these devices when the VM is reset: the VM would be restricted to using the much slower emulated XIVE or XICS forever. As an alternative to adding rollback, do not assign kvm->arch.xive before making sure the XIVE VPs are allocated in OPAL. Cc: stable@vger.kernel.org # v5.2 Fixes: 5422e95103cf ("KVM: PPC: Book3S HV: XIVE: Replace the 'destroy' method by a 'release' method") Signed-off-by: Greg Kurz Reviewed-by: Cédric Le Goater Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_xive.c | 11 +++++------ arch/powerpc/kvm/book3s_xive_native.c | 2 +- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index a3f9c665bb5ba..baa740815b3cb 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -2005,6 +2005,10 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type) pr_devel("Creating xive for partition\n"); + /* Already there ? */ + if (kvm->arch.xive) + return -EEXIST; + xive = kvmppc_xive_get_device(kvm, type); if (!xive) return -ENOMEM; @@ -2014,12 +2018,6 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type) xive->kvm = kvm; mutex_init(&xive->lock); - /* Already there ? */ - if (kvm->arch.xive) - ret = -EEXIST; - else - kvm->arch.xive = xive; - /* We use the default queue size set by the host */ xive->q_order = xive_native_default_eq_shift(); if (xive->q_order < PAGE_SHIFT) @@ -2039,6 +2037,7 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type) if (ret) return ret; + kvm->arch.xive = xive; return 0; } diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index 78b906ffa0d2b..ebb4215baf453 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -1081,7 +1081,6 @@ static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type) dev->private = xive; xive->dev = dev; xive->kvm = kvm; - kvm->arch.xive = xive; mutex_init(&xive->mapping_lock); mutex_init(&xive->lock); @@ -1102,6 +1101,7 @@ static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type) if (ret) return ret; + kvm->arch.xive = xive; return 0; } From 8a4e7597ba1e41030189b73cd7261f4383588d1d Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Fri, 27 Sep 2019 13:53:49 +0200 Subject: [PATCH 05/13] KVM: PPC: Book3S HV: XIVE: Show VP id in debugfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Print out the VP id of each connected vCPU, this allow to see: - the VP block base in which OPAL encodes information that may be useful when debugging - the packed vCPU id which may differ from the raw vCPU id if the latter is >= KVM_MAX_VCPUS (2048) Signed-off-by: Greg Kurz Reviewed-by: Cédric Le Goater Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_xive.c | 4 ++-- arch/powerpc/kvm/book3s_xive_native.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index baa740815b3cb..0b7859e40f664 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -2107,9 +2107,9 @@ static int xive_debug_show(struct seq_file *m, void *private) if (!xc) continue; - seq_printf(m, "cpu server %#x CPPR:%#x HWCPPR:%#x" + seq_printf(m, "cpu server %#x VP:%#x CPPR:%#x HWCPPR:%#x" " MFRR:%#x PEND:%#x h_xirr: R=%lld V=%lld\n", - xc->server_num, xc->cppr, xc->hw_cppr, + xc->server_num, xc->vp_id, xc->cppr, xc->hw_cppr, xc->mfrr, xc->pending, xc->stat_rm_h_xirr, xc->stat_vm_h_xirr); diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index ebb4215baf453..43a86858390a8 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -1204,8 +1204,8 @@ static int xive_native_debug_show(struct seq_file *m, void *private) if (!xc) continue; - seq_printf(m, "cpu server %#x NSR=%02x CPPR=%02x IBP=%02x PIPR=%02x w01=%016llx w2=%08x\n", - xc->server_num, + seq_printf(m, "cpu server %#x VP=%#x NSR=%02x CPPR=%02x IBP=%02x PIPR=%02x w01=%016llx w2=%08x\n", + xc->server_num, xc->vp_id, vcpu->arch.xive_saved_state.nsr, vcpu->arch.xive_saved_state.cppr, vcpu->arch.xive_saved_state.ipb, From 8db29ea2391cc6f5b73cc9c04b2dee4409b9fc05 Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Fri, 27 Sep 2019 13:53:55 +0200 Subject: [PATCH 06/13] KVM: PPC: Book3S HV: XIVE: Compute the VP id in a common helper Reduce code duplication by consolidating the checking of vCPU ids and VP ids to a common helper used by both legacy and native XIVE KVM devices. And explain the magic with a comment. Signed-off-by: Greg Kurz Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_xive.c | 42 +++++++++++++++++++++------ arch/powerpc/kvm/book3s_xive.h | 1 + arch/powerpc/kvm/book3s_xive_native.c | 11 ++----- 3 files changed, 36 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index 0b7859e40f664..d84da9f6ee880 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -1211,6 +1211,37 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) vcpu->arch.xive_vcpu = NULL; } +static bool kvmppc_xive_vcpu_id_valid(struct kvmppc_xive *xive, u32 cpu) +{ + /* We have a block of KVM_MAX_VCPUS VPs. We just need to check + * raw vCPU ids are below the expected limit for this guest's + * core stride ; kvmppc_pack_vcpu_id() will pack them down to an + * index that can be safely used to compute a VP id that belongs + * to the VP block. + */ + return cpu < KVM_MAX_VCPUS * xive->kvm->arch.emul_smt_mode; +} + +int kvmppc_xive_compute_vp_id(struct kvmppc_xive *xive, u32 cpu, u32 *vp) +{ + u32 vp_id; + + if (!kvmppc_xive_vcpu_id_valid(xive, cpu)) { + pr_devel("Out of bounds !\n"); + return -EINVAL; + } + + vp_id = kvmppc_xive_vp(xive, cpu); + if (kvmppc_xive_vp_in_use(xive->kvm, vp_id)) { + pr_devel("Duplicate !\n"); + return -EEXIST; + } + + *vp = vp_id; + + return 0; +} + int kvmppc_xive_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu, u32 cpu) { @@ -1229,20 +1260,13 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, return -EPERM; if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT) return -EBUSY; - if (cpu >= (KVM_MAX_VCPUS * vcpu->kvm->arch.emul_smt_mode)) { - pr_devel("Out of bounds !\n"); - return -EINVAL; - } /* We need to synchronize with queue provisioning */ mutex_lock(&xive->lock); - vp_id = kvmppc_xive_vp(xive, cpu); - if (kvmppc_xive_vp_in_use(xive->kvm, vp_id)) { - pr_devel("Duplicate !\n"); - r = -EEXIST; + r = kvmppc_xive_compute_vp_id(xive, cpu, &vp_id); + if (r) goto bail; - } xc = kzalloc(sizeof(*xc), GFP_KERNEL); if (!xc) { diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h index fe3ed50e08186..90cf6ec35a68f 100644 --- a/arch/powerpc/kvm/book3s_xive.h +++ b/arch/powerpc/kvm/book3s_xive.h @@ -296,6 +296,7 @@ int kvmppc_xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio, struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type); void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu, struct kvmppc_xive_vcpu *xc, int irq); +int kvmppc_xive_compute_vp_id(struct kvmppc_xive *xive, u32 cpu, u32 *vp); #endif /* CONFIG_KVM_XICS */ #endif /* _KVM_PPC_BOOK3S_XICS_H */ diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index 43a86858390a8..5bb480b2aafd2 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -118,19 +118,12 @@ int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev, return -EPERM; if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT) return -EBUSY; - if (server_num >= (KVM_MAX_VCPUS * vcpu->kvm->arch.emul_smt_mode)) { - pr_devel("Out of bounds !\n"); - return -EINVAL; - } mutex_lock(&xive->lock); - vp_id = kvmppc_xive_vp(xive, server_num); - if (kvmppc_xive_vp_in_use(xive->kvm, vp_id)) { - pr_devel("Duplicate !\n"); - rc = -EEXIST; + rc = kvmppc_xive_compute_vp_id(xive, server_num, &vp_id); + if (rc) goto bail; - } xc = kzalloc(sizeof(*xc), GFP_KERNEL); if (!xc) { From 062cfab7069fcb55d77ad5552f29e24178728fa2 Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Fri, 27 Sep 2019 13:54:01 +0200 Subject: [PATCH 07/13] KVM: PPC: Book3S HV: XIVE: Make VP block size configurable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The XIVE VP is an internal structure which allow the XIVE interrupt controller to maintain the interrupt context state of vCPUs non dispatched on HW threads. When a guest is started, the XIVE KVM device allocates a block of XIVE VPs in OPAL, enough to accommodate the highest possible vCPU id KVM_MAX_VCPU_ID (16384) packed down to KVM_MAX_VCPUS (2048). With a guest's core stride of 8 and a threading mode of 1 (QEMU's default), a VM must run at least 256 vCPUs to actually need such a range of VPs. A POWER9 system has a limited XIVE VP space : 512k and KVM is currently wasting this HW resource with large VP allocations, especially since a typical VM likely runs with a lot less vCPUs. Make the size of the VP block configurable. Add an nr_servers field to the XIVE structure and a function to set it for this purpose. Split VP allocation out of the device create function. Since the VP block isn't used before the first vCPU connects to the XIVE KVM device, allocation is now performed by kvmppc_xive_connect_vcpu(). This gives the opportunity to set nr_servers in between: kvmppc_xive_create() / kvmppc_xive_native_create() . . kvmppc_xive_set_nr_servers() . . kvmppc_xive_connect_vcpu() / kvmppc_xive_native_connect_vcpu() The connect_vcpu() functions check that the vCPU id is below nr_servers and if it is the first vCPU they allocate the VP block. This is protected against a concurrent update of nr_servers by kvmppc_xive_set_nr_servers() with the xive->lock mutex. Also, the block is allocated once for the device lifetime: nr_servers should stay constant otherwise connect_vcpu() could generate a boggus VP id and likely crash OPAL. It is thus forbidden to update nr_servers once the block is allocated. If the VP allocation fail, return ENOSPC which seems more appropriate to report the depletion of system wide HW resource than ENOMEM or ENXIO. A VM using a stride of 8 and 1 thread per core with 32 vCPUs would hence only need 256 VPs instead of 2048. If the stride is set to match the number of threads per core, this goes further down to 32. This will be exposed to userspace by a subsequent patch. Signed-off-by: Greg Kurz Reviewed-by: Cédric Le Goater Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_xive.c | 65 ++++++++++++++++++++++----- arch/powerpc/kvm/book3s_xive.h | 4 ++ arch/powerpc/kvm/book3s_xive_native.c | 18 +++----- 3 files changed, 62 insertions(+), 25 deletions(-) diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index d84da9f6ee880..6c35b3d959868 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -1213,13 +1213,13 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) static bool kvmppc_xive_vcpu_id_valid(struct kvmppc_xive *xive, u32 cpu) { - /* We have a block of KVM_MAX_VCPUS VPs. We just need to check + /* We have a block of xive->nr_servers VPs. We just need to check * raw vCPU ids are below the expected limit for this guest's * core stride ; kvmppc_pack_vcpu_id() will pack them down to an * index that can be safely used to compute a VP id that belongs * to the VP block. */ - return cpu < KVM_MAX_VCPUS * xive->kvm->arch.emul_smt_mode; + return cpu < xive->nr_servers * xive->kvm->arch.emul_smt_mode; } int kvmppc_xive_compute_vp_id(struct kvmppc_xive *xive, u32 cpu, u32 *vp) @@ -1231,6 +1231,14 @@ int kvmppc_xive_compute_vp_id(struct kvmppc_xive *xive, u32 cpu, u32 *vp) return -EINVAL; } + if (xive->vp_base == XIVE_INVALID_VP) { + xive->vp_base = xive_native_alloc_vp_block(xive->nr_servers); + pr_devel("VP_Base=%x nr_servers=%d\n", xive->vp_base, xive->nr_servers); + + if (xive->vp_base == XIVE_INVALID_VP) + return -ENOSPC; + } + vp_id = kvmppc_xive_vp(xive, cpu); if (kvmppc_xive_vp_in_use(xive->kvm, vp_id)) { pr_devel("Duplicate !\n"); @@ -1858,6 +1866,43 @@ int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, return 0; } +int kvmppc_xive_set_nr_servers(struct kvmppc_xive *xive, u64 addr) +{ + u32 __user *ubufp = (u32 __user *) addr; + u32 nr_servers; + int rc = 0; + + if (get_user(nr_servers, ubufp)) + return -EFAULT; + + pr_devel("%s nr_servers=%u\n", __func__, nr_servers); + + if (!nr_servers || nr_servers > KVM_MAX_VCPU_ID) + return -EINVAL; + + mutex_lock(&xive->lock); + if (xive->vp_base != XIVE_INVALID_VP) + /* The VP block is allocated once and freed when the device + * is released. Better not allow to change its size since its + * used by connect_vcpu to validate vCPU ids are valid (eg, + * setting it back to a higher value could allow connect_vcpu + * to come up with a VP id that goes beyond the VP block, which + * is likely to cause a crash in OPAL). + */ + rc = -EBUSY; + else if (nr_servers > KVM_MAX_VCPUS) + /* We don't need more servers. Higher vCPU ids get packed + * down below KVM_MAX_VCPUS by kvmppc_pack_vcpu_id(). + */ + xive->nr_servers = KVM_MAX_VCPUS; + else + xive->nr_servers = nr_servers; + + mutex_unlock(&xive->lock); + + return rc; +} + static int xive_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { struct kvmppc_xive *xive = dev->private; @@ -2025,7 +2070,6 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type) { struct kvmppc_xive *xive; struct kvm *kvm = dev->kvm; - int ret = 0; pr_devel("Creating xive for partition\n"); @@ -2049,18 +2093,15 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type) else xive->q_page_order = xive->q_order - PAGE_SHIFT; - /* Allocate a bunch of VPs */ - xive->vp_base = xive_native_alloc_vp_block(KVM_MAX_VCPUS); - pr_devel("VP_Base=%x\n", xive->vp_base); - - if (xive->vp_base == XIVE_INVALID_VP) - ret = -ENOMEM; + /* VP allocation is delayed to the first call to connect_vcpu */ + xive->vp_base = XIVE_INVALID_VP; + /* KVM_MAX_VCPUS limits the number of VMs to roughly 64 per sockets + * on a POWER9 system. + */ + xive->nr_servers = KVM_MAX_VCPUS; xive->single_escalation = xive_native_has_single_escalation(); - if (ret) - return ret; - kvm->arch.xive = xive; return 0; } diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h index 90cf6ec35a68f..382e3a56e7896 100644 --- a/arch/powerpc/kvm/book3s_xive.h +++ b/arch/powerpc/kvm/book3s_xive.h @@ -135,6 +135,9 @@ struct kvmppc_xive { /* Flags */ u8 single_escalation; + /* Number of entries in the VP block */ + u32 nr_servers; + struct kvmppc_xive_ops *ops; struct address_space *mapping; struct mutex mapping_lock; @@ -297,6 +300,7 @@ struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type); void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu, struct kvmppc_xive_vcpu *xc, int irq); int kvmppc_xive_compute_vp_id(struct kvmppc_xive *xive, u32 cpu, u32 *vp); +int kvmppc_xive_set_nr_servers(struct kvmppc_xive *xive, u64 addr); #endif /* CONFIG_KVM_XICS */ #endif /* _KVM_PPC_BOOK3S_XICS_H */ diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index 5bb480b2aafd2..8ab333eabeef5 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -1060,7 +1060,6 @@ static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type) { struct kvmppc_xive *xive; struct kvm *kvm = dev->kvm; - int ret = 0; pr_devel("Creating xive native device\n"); @@ -1077,23 +1076,16 @@ static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type) mutex_init(&xive->mapping_lock); mutex_init(&xive->lock); - /* - * Allocate a bunch of VPs. KVM_MAX_VCPUS is a large value for - * a default. Getting the max number of CPUs the VM was - * configured with would improve our usage of the XIVE VP space. + /* VP allocation is delayed to the first call to connect_vcpu */ + xive->vp_base = XIVE_INVALID_VP; + /* KVM_MAX_VCPUS limits the number of VMs to roughly 64 per sockets + * on a POWER9 system. */ - xive->vp_base = xive_native_alloc_vp_block(KVM_MAX_VCPUS); - pr_devel("VP_Base=%x\n", xive->vp_base); - - if (xive->vp_base == XIVE_INVALID_VP) - ret = -ENXIO; + xive->nr_servers = KVM_MAX_VCPUS; xive->single_escalation = xive_native_has_single_escalation(); xive->ops = &kvmppc_xive_native_ops; - if (ret) - return ret; - kvm->arch.xive = xive; return 0; } From efe5ddcae496b7c7307805d31815df23ba69bf7c Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Fri, 27 Sep 2019 13:54:07 +0200 Subject: [PATCH 08/13] KVM: PPC: Book3S HV: XIVE: Allow userspace to set the # of VPs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new attribute to both XIVE and XICS-on-XIVE KVM devices so that userspace can tell how many interrupt servers it needs. If a VM needs less than the current default of KVM_MAX_VCPUS (2048), we can allocate less VPs in OPAL. Combined with a core stride (VSMT) that matches the number of guest threads per core, this may substantially increases the number of VMs that can run concurrently with an in-kernel XIVE device. Since the legacy XIVE KVM device is exposed to userspace through the XICS KVM API, a new attribute group is added to it for this purpose. While here, fix the syntax of the existing KVM_DEV_XICS_GRP_SOURCES in the XICS documentation. Signed-off-by: Greg Kurz Reviewed-by: Cédric Le Goater Signed-off-by: Paul Mackerras --- Documentation/virt/kvm/devices/xics.txt | 14 ++++++++++++-- Documentation/virt/kvm/devices/xive.txt | 8 ++++++++ arch/powerpc/include/uapi/asm/kvm.h | 3 +++ arch/powerpc/kvm/book3s_xive.c | 10 ++++++++++ arch/powerpc/kvm/book3s_xive_native.c | 3 +++ 5 files changed, 36 insertions(+), 2 deletions(-) diff --git a/Documentation/virt/kvm/devices/xics.txt b/Documentation/virt/kvm/devices/xics.txt index 42864935ac5d1..423332dda7bc8 100644 --- a/Documentation/virt/kvm/devices/xics.txt +++ b/Documentation/virt/kvm/devices/xics.txt @@ -3,9 +3,19 @@ XICS interrupt controller Device type supported: KVM_DEV_TYPE_XICS Groups: - KVM_DEV_XICS_SOURCES + 1. KVM_DEV_XICS_GRP_SOURCES Attributes: One per interrupt source, indexed by the source number. + 2. KVM_DEV_XICS_GRP_CTRL + Attributes: + 2.1 KVM_DEV_XICS_NR_SERVERS (write only) + The kvm_device_attr.addr points to a __u32 value which is the number of + interrupt server numbers (ie, highest possible vcpu id plus one). + Errors: + -EINVAL: Value greater than KVM_MAX_VCPU_ID. + -EFAULT: Invalid user pointer for attr->addr. + -EBUSY: A vcpu is already connected to the device. + This device emulates the XICS (eXternal Interrupt Controller Specification) defined in PAPR. The XICS has a set of interrupt sources, each identified by a 20-bit source number, and a set of @@ -38,7 +48,7 @@ least-significant end of the word: Each source has 64 bits of state that can be read and written using the KVM_GET_DEVICE_ATTR and KVM_SET_DEVICE_ATTR ioctls, specifying the -KVM_DEV_XICS_SOURCES attribute group, with the attribute number being +KVM_DEV_XICS_GRP_SOURCES attribute group, with the attribute number being the interrupt source number. The 64 bit state word has the following bitfields, starting from the least-significant end of the word: diff --git a/Documentation/virt/kvm/devices/xive.txt b/Documentation/virt/kvm/devices/xive.txt index 9a24a45252539..f5d1d6b5af615 100644 --- a/Documentation/virt/kvm/devices/xive.txt +++ b/Documentation/virt/kvm/devices/xive.txt @@ -78,6 +78,14 @@ the legacy interrupt mode, referred as XICS (POWER7/8). migrating the VM. Errors: none + 1.3 KVM_DEV_XIVE_NR_SERVERS (write only) + The kvm_device_attr.addr points to a __u32 value which is the number of + interrupt server numbers (ie, highest possible vcpu id plus one). + Errors: + -EINVAL: Value greater than KVM_MAX_VCPU_ID. + -EFAULT: Invalid user pointer for attr->addr. + -EBUSY: A vCPU is already connected to the device. + 2. KVM_DEV_XIVE_GRP_SOURCE (write only) Initializes a new source in the XIVE device and mask it. Attributes: diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index b0f72dea8b11a..264e266a85bf6 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -667,6 +667,8 @@ struct kvm_ppc_cpu_char { /* PPC64 eXternal Interrupt Controller Specification */ #define KVM_DEV_XICS_GRP_SOURCES 1 /* 64-bit source attributes */ +#define KVM_DEV_XICS_GRP_CTRL 2 +#define KVM_DEV_XICS_NR_SERVERS 1 /* Layout of 64-bit source attribute values */ #define KVM_XICS_DESTINATION_SHIFT 0 @@ -683,6 +685,7 @@ struct kvm_ppc_cpu_char { #define KVM_DEV_XIVE_GRP_CTRL 1 #define KVM_DEV_XIVE_RESET 1 #define KVM_DEV_XIVE_EQ_SYNC 2 +#define KVM_DEV_XIVE_NR_SERVERS 3 #define KVM_DEV_XIVE_GRP_SOURCE 2 /* 64-bit source identifier */ #define KVM_DEV_XIVE_GRP_SOURCE_CONFIG 3 /* 64-bit source identifier */ #define KVM_DEV_XIVE_GRP_EQ_CONFIG 4 /* 64-bit EQ identifier */ diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index 6c35b3d959868..66858b7d3c6b4 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -1911,6 +1911,11 @@ static int xive_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) switch (attr->group) { case KVM_DEV_XICS_GRP_SOURCES: return xive_set_source(xive, attr->attr, attr->addr); + case KVM_DEV_XICS_GRP_CTRL: + switch (attr->attr) { + case KVM_DEV_XICS_NR_SERVERS: + return kvmppc_xive_set_nr_servers(xive, attr->addr); + } } return -ENXIO; } @@ -1936,6 +1941,11 @@ static int xive_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) attr->attr < KVMPPC_XICS_NR_IRQS) return 0; break; + case KVM_DEV_XICS_GRP_CTRL: + switch (attr->attr) { + case KVM_DEV_XICS_NR_SERVERS: + return 0; + } } return -ENXIO; } diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index 8ab333eabeef5..34bd123fa0241 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -921,6 +921,8 @@ static int kvmppc_xive_native_set_attr(struct kvm_device *dev, return kvmppc_xive_reset(xive); case KVM_DEV_XIVE_EQ_SYNC: return kvmppc_xive_native_eq_sync(xive); + case KVM_DEV_XIVE_NR_SERVERS: + return kvmppc_xive_set_nr_servers(xive, attr->addr); } break; case KVM_DEV_XIVE_GRP_SOURCE: @@ -960,6 +962,7 @@ static int kvmppc_xive_native_has_attr(struct kvm_device *dev, switch (attr->attr) { case KVM_DEV_XIVE_RESET: case KVM_DEV_XIVE_EQ_SYNC: + case KVM_DEV_XIVE_NR_SERVERS: return 0; } break; From 9ee6471eb9d43114ba4f0de3e0f483bf6fb2a906 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 2 Oct 2019 16:00:21 +1000 Subject: [PATCH 09/13] KVM: PPC: Book3S: Define and use SRR1_MSR_BITS Acked-by: Paul Mackerras Signed-off-by: Nicholas Piggin Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/reg.h | 12 ++++++++++++ arch/powerpc/kvm/book3s.c | 2 +- arch/powerpc/kvm/book3s_hv_nested.c | 2 +- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index b3cbb1136bce0..75c7e95a321b9 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -748,6 +748,18 @@ #define SPRN_USPRG7 0x107 /* SPRG7 userspace read */ #define SPRN_SRR0 0x01A /* Save/Restore Register 0 */ #define SPRN_SRR1 0x01B /* Save/Restore Register 1 */ + +#ifdef CONFIG_PPC_BOOK3S +/* + * Bits loaded from MSR upon interrupt. + * PPC (64-bit) bits 33-36,42-47 are interrupt dependent, the others are + * loaded from MSR. The exception is that SRESET and MCE do not always load + * bit 62 (RI) from MSR. Don't use PPC_BITMASK for this because 32-bit uses + * it. + */ +#define SRR1_MSR_BITS (~0x783f0000UL) +#endif + #define SRR1_ISI_NOPT 0x40000000 /* ISI: Not found in hash */ #define SRR1_ISI_N_OR_G 0x10000000 /* ISI: Access is no-exec or G */ #define SRR1_ISI_PROT 0x08000000 /* ISI: Other protection fault */ diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index ec2547cc5ecbe..a2336c452905f 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -136,7 +136,7 @@ void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags) { kvmppc_unfixup_split_real(vcpu); kvmppc_set_srr0(vcpu, kvmppc_get_pc(vcpu)); - kvmppc_set_srr1(vcpu, (kvmppc_get_msr(vcpu) & ~0x783f0000ul) | flags); + kvmppc_set_srr1(vcpu, (kvmppc_get_msr(vcpu) & SRR1_MSR_BITS) | flags); kvmppc_set_pc(vcpu, kvmppc_interrupt_offset(vcpu) + vec); vcpu->arch.mmu.reset_msr(vcpu); } diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c index cdf30c6eaf542..dc97e5be76f61 100644 --- a/arch/powerpc/kvm/book3s_hv_nested.c +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -1186,7 +1186,7 @@ static int kvmhv_translate_addr_nested(struct kvm_vcpu *vcpu, forward_to_l1: vcpu->arch.fault_dsisr = flags; if (vcpu->arch.trap == BOOK3S_INTERRUPT_H_INST_STORAGE) { - vcpu->arch.shregs.msr &= ~0x783f0000ul; + vcpu->arch.shregs.msr &= SRR1_MSR_BITS; vcpu->arch.shregs.msr |= flags; } return RESUME_HOST; From 87a45e07a5abfec4d6b0e8356718f8919d0a3c20 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 2 Oct 2019 16:00:22 +1000 Subject: [PATCH 10/13] KVM: PPC: Book3S: Replace reset_msr mmu op with inject_interrupt arch op reset_msr sets the MSR for interrupt injection, but it's cleaner and more flexible to provide a single op to set both MSR and PC for the interrupt. Signed-off-by: Nicholas Piggin Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_host.h | 1 - arch/powerpc/include/asm/kvm_ppc.h | 1 + arch/powerpc/kvm/book3s.c | 27 +------------------ arch/powerpc/kvm/book3s_32_mmu.c | 6 ----- arch/powerpc/kvm/book3s_64_mmu.c | 15 ----------- arch/powerpc/kvm/book3s_64_mmu_hv.c | 13 ---------- arch/powerpc/kvm/book3s_hv.c | 22 ++++++++++++++++ arch/powerpc/kvm/book3s_pr.c | 40 ++++++++++++++++++++++++++++- 8 files changed, 63 insertions(+), 62 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 6fe6ad64cba57..4273e799203d2 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -401,7 +401,6 @@ struct kvmppc_mmu { u32 (*mfsrin)(struct kvm_vcpu *vcpu, u32 srnum); int (*xlate)(struct kvm_vcpu *vcpu, gva_t eaddr, struct kvmppc_pte *pte, bool data, bool iswrite); - void (*reset_msr)(struct kvm_vcpu *vcpu); void (*tlbie)(struct kvm_vcpu *vcpu, ulong addr, bool large); int (*esid_to_vsid)(struct kvm_vcpu *vcpu, ulong esid, u64 *vsid); u64 (*ea_to_vp)(struct kvm_vcpu *vcpu, gva_t eaddr, bool data); diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index ee62776e5433c..d63f649fe713d 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -271,6 +271,7 @@ struct kvmppc_ops { union kvmppc_one_reg *val); void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); void (*vcpu_put)(struct kvm_vcpu *vcpu); + void (*inject_interrupt)(struct kvm_vcpu *vcpu, int vec, u64 srr1_flags); void (*set_msr)(struct kvm_vcpu *vcpu, u64 msr); int (*vcpu_run)(struct kvm_run *run, struct kvm_vcpu *vcpu); struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned int id); diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index a2336c452905f..58a59ee998e29 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -74,27 +74,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { NULL } }; -void kvmppc_unfixup_split_real(struct kvm_vcpu *vcpu) -{ - if (vcpu->arch.hflags & BOOK3S_HFLAG_SPLIT_HACK) { - ulong pc = kvmppc_get_pc(vcpu); - ulong lr = kvmppc_get_lr(vcpu); - if ((pc & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS) - kvmppc_set_pc(vcpu, pc & ~SPLIT_HACK_MASK); - if ((lr & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS) - kvmppc_set_lr(vcpu, lr & ~SPLIT_HACK_MASK); - vcpu->arch.hflags &= ~BOOK3S_HFLAG_SPLIT_HACK; - } -} -EXPORT_SYMBOL_GPL(kvmppc_unfixup_split_real); - -static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu) -{ - if (!is_kvmppc_hv_enabled(vcpu->kvm)) - return to_book3s(vcpu)->hior; - return 0; -} - static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu, unsigned long pending_now, unsigned long old_pending) { @@ -134,11 +113,7 @@ static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu) void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags) { - kvmppc_unfixup_split_real(vcpu); - kvmppc_set_srr0(vcpu, kvmppc_get_pc(vcpu)); - kvmppc_set_srr1(vcpu, (kvmppc_get_msr(vcpu) & SRR1_MSR_BITS) | flags); - kvmppc_set_pc(vcpu, kvmppc_interrupt_offset(vcpu) + vec); - vcpu->arch.mmu.reset_msr(vcpu); + vcpu->kvm->arch.kvm_ops->inject_interrupt(vcpu, vec, flags); } static int kvmppc_book3s_vec2irqprio(unsigned int vec) diff --git a/arch/powerpc/kvm/book3s_32_mmu.c b/arch/powerpc/kvm/book3s_32_mmu.c index 18f244aad7aaa..f21e73492ce3d 100644 --- a/arch/powerpc/kvm/book3s_32_mmu.c +++ b/arch/powerpc/kvm/book3s_32_mmu.c @@ -90,11 +90,6 @@ static u64 kvmppc_mmu_book3s_32_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr, return (((u64)eaddr >> 12) & 0xffff) | (vsid << 16); } -static void kvmppc_mmu_book3s_32_reset_msr(struct kvm_vcpu *vcpu) -{ - kvmppc_set_msr(vcpu, 0); -} - static hva_t kvmppc_mmu_book3s_32_get_pteg(struct kvm_vcpu *vcpu, u32 sre, gva_t eaddr, bool primary) @@ -406,7 +401,6 @@ void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu) mmu->mtsrin = kvmppc_mmu_book3s_32_mtsrin; mmu->mfsrin = kvmppc_mmu_book3s_32_mfsrin; mmu->xlate = kvmppc_mmu_book3s_32_xlate; - mmu->reset_msr = kvmppc_mmu_book3s_32_reset_msr; mmu->tlbie = kvmppc_mmu_book3s_32_tlbie; mmu->esid_to_vsid = kvmppc_mmu_book3s_32_esid_to_vsid; mmu->ea_to_vp = kvmppc_mmu_book3s_32_ea_to_vp; diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c index 5f63a5f7f24f8..599133256a954 100644 --- a/arch/powerpc/kvm/book3s_64_mmu.c +++ b/arch/powerpc/kvm/book3s_64_mmu.c @@ -24,20 +24,6 @@ #define dprintk(X...) do { } while(0) #endif -static void kvmppc_mmu_book3s_64_reset_msr(struct kvm_vcpu *vcpu) -{ - unsigned long msr = vcpu->arch.intr_msr; - unsigned long cur_msr = kvmppc_get_msr(vcpu); - - /* If transactional, change to suspend mode on IRQ delivery */ - if (MSR_TM_TRANSACTIONAL(cur_msr)) - msr |= MSR_TS_S; - else - msr |= cur_msr & MSR_TS_MASK; - - kvmppc_set_msr(vcpu, msr); -} - static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe( struct kvm_vcpu *vcpu, gva_t eaddr) @@ -676,7 +662,6 @@ void kvmppc_mmu_book3s_64_init(struct kvm_vcpu *vcpu) mmu->slbie = kvmppc_mmu_book3s_64_slbie; mmu->slbia = kvmppc_mmu_book3s_64_slbia; mmu->xlate = kvmppc_mmu_book3s_64_xlate; - mmu->reset_msr = kvmppc_mmu_book3s_64_reset_msr; mmu->tlbie = kvmppc_mmu_book3s_64_tlbie; mmu->esid_to_vsid = kvmppc_mmu_book3s_64_esid_to_vsid; mmu->ea_to_vp = kvmppc_mmu_book3s_64_ea_to_vp; diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index f2b9aea43216e..4c37e97c75a1d 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -275,18 +275,6 @@ int kvmppc_mmu_hv_init(void) return 0; } -static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu) -{ - unsigned long msr = vcpu->arch.intr_msr; - - /* If transactional, change to suspend mode on IRQ delivery */ - if (MSR_TM_TRANSACTIONAL(vcpu->arch.shregs.msr)) - msr |= MSR_TS_S; - else - msr |= vcpu->arch.shregs.msr & MSR_TS_MASK; - kvmppc_set_msr(vcpu, msr); -} - static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags, long pte_index, unsigned long pteh, unsigned long ptel, unsigned long *pte_idx_ret) @@ -2162,7 +2150,6 @@ void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu) vcpu->arch.slb_nr = 32; /* POWER7/POWER8 */ mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate; - mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr; vcpu->arch.hflags |= BOOK3S_HFLAG_SLB; } diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 709cf1fd4cf46..94a0a9911b275 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -338,6 +338,27 @@ static void kvmppc_core_vcpu_put_hv(struct kvm_vcpu *vcpu) spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags); } +static void kvmppc_inject_interrupt_hv(struct kvm_vcpu *vcpu, int vec, u64 srr1_flags) +{ + unsigned long msr, pc, new_msr, new_pc; + + msr = kvmppc_get_msr(vcpu); + pc = kvmppc_get_pc(vcpu); + new_msr = vcpu->arch.intr_msr; + new_pc = vec; + + /* If transactional, change to suspend mode on IRQ delivery */ + if (MSR_TM_TRANSACTIONAL(msr)) + new_msr |= MSR_TS_S; + else + new_msr |= msr & MSR_TS_MASK; + + kvmppc_set_srr0(vcpu, pc); + kvmppc_set_srr1(vcpu, (msr & SRR1_MSR_BITS) | srr1_flags); + kvmppc_set_pc(vcpu, new_pc); + kvmppc_set_msr(vcpu, new_msr); +} + static void kvmppc_set_msr_hv(struct kvm_vcpu *vcpu, u64 msr) { /* @@ -5401,6 +5422,7 @@ static struct kvmppc_ops kvm_ops_hv = { .set_one_reg = kvmppc_set_one_reg_hv, .vcpu_load = kvmppc_core_vcpu_load_hv, .vcpu_put = kvmppc_core_vcpu_put_hv, + .inject_interrupt = kvmppc_inject_interrupt_hv, .set_msr = kvmppc_set_msr_hv, .vcpu_run = kvmppc_vcpu_run_hv, .vcpu_create = kvmppc_core_vcpu_create_hv, diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index cc65af8fe6f7e..ce4fcf76e53e9 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -90,7 +90,43 @@ static void kvmppc_fixup_split_real(struct kvm_vcpu *vcpu) kvmppc_set_pc(vcpu, pc | SPLIT_HACK_OFFS); } -void kvmppc_unfixup_split_real(struct kvm_vcpu *vcpu); +static void kvmppc_unfixup_split_real(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.hflags & BOOK3S_HFLAG_SPLIT_HACK) { + ulong pc = kvmppc_get_pc(vcpu); + ulong lr = kvmppc_get_lr(vcpu); + if ((pc & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS) + kvmppc_set_pc(vcpu, pc & ~SPLIT_HACK_MASK); + if ((lr & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS) + kvmppc_set_lr(vcpu, lr & ~SPLIT_HACK_MASK); + vcpu->arch.hflags &= ~BOOK3S_HFLAG_SPLIT_HACK; + } +} + +static void kvmppc_inject_interrupt_pr(struct kvm_vcpu *vcpu, int vec, u64 srr1_flags) +{ + unsigned long msr, pc, new_msr, new_pc; + + kvmppc_unfixup_split_real(vcpu); + + msr = kvmppc_get_msr(vcpu); + pc = kvmppc_get_pc(vcpu); + new_msr = vcpu->arch.intr_msr; + new_pc = to_book3s(vcpu)->hior + vec; + +#ifdef CONFIG_PPC_BOOK3S_64 + /* If transactional, change to suspend mode on IRQ delivery */ + if (MSR_TM_TRANSACTIONAL(msr)) + new_msr |= MSR_TS_S; + else + new_msr |= msr & MSR_TS_MASK; +#endif + + kvmppc_set_srr0(vcpu, pc); + kvmppc_set_srr1(vcpu, (msr & SRR1_MSR_BITS) | srr1_flags); + kvmppc_set_pc(vcpu, new_pc); + kvmppc_set_msr(vcpu, new_msr); +} static void kvmppc_core_vcpu_load_pr(struct kvm_vcpu *vcpu, int cpu) { @@ -1761,6 +1797,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_pr(struct kvm *kvm, #else /* default to book3s_32 (750) */ vcpu->arch.pvr = 0x84202; + vcpu->arch.intr_msr = 0; #endif kvmppc_set_pvr_pr(vcpu, vcpu->arch.pvr); vcpu->arch.slb_nr = 64; @@ -2058,6 +2095,7 @@ static struct kvmppc_ops kvm_ops_pr = { .set_one_reg = kvmppc_set_one_reg_pr, .vcpu_load = kvmppc_core_vcpu_load_pr, .vcpu_put = kvmppc_core_vcpu_put_pr, + .inject_interrupt = kvmppc_inject_interrupt_pr, .set_msr = kvmppc_set_msr_pr, .vcpu_run = kvmppc_vcpu_run_pr, .vcpu_create = kvmppc_core_vcpu_create_pr, From 268f4ef9954cec198cd6772caadf453bcaed3e5a Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 2 Oct 2019 16:00:23 +1000 Subject: [PATCH 11/13] KVM: PPC: Book3S HV: Reuse kvmppc_inject_interrupt for async guest delivery This consolidates the HV interrupt delivery logic into one place. Signed-off-by: Nicholas Piggin Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s.h | 3 ++ arch/powerpc/kvm/book3s_hv.c | 43 ------------------ arch/powerpc/kvm/book3s_hv_builtin.c | 67 ++++++++++++++++++++++------ 3 files changed, 56 insertions(+), 57 deletions(-) diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h index 2ef1311a2a13e..3a46139859496 100644 --- a/arch/powerpc/kvm/book3s.h +++ b/arch/powerpc/kvm/book3s.h @@ -32,4 +32,7 @@ extern void kvmppc_emulate_tabort(struct kvm_vcpu *vcpu, int ra_val); static inline void kvmppc_emulate_tabort(struct kvm_vcpu *vcpu, int ra_val) {} #endif +extern void kvmppc_set_msr_hv(struct kvm_vcpu *vcpu, u64 msr); +extern void kvmppc_inject_interrupt_hv(struct kvm_vcpu *vcpu, int vec, u64 srr1_flags); + #endif diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 94a0a9911b275..c340d416dce3e 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -133,7 +133,6 @@ static inline bool nesting_enabled(struct kvm *kvm) /* If set, the threads on each CPU core have to be in the same MMU mode */ static bool no_mixing_hpt_and_radix; -static void kvmppc_end_cede(struct kvm_vcpu *vcpu); static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); /* @@ -338,39 +337,6 @@ static void kvmppc_core_vcpu_put_hv(struct kvm_vcpu *vcpu) spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags); } -static void kvmppc_inject_interrupt_hv(struct kvm_vcpu *vcpu, int vec, u64 srr1_flags) -{ - unsigned long msr, pc, new_msr, new_pc; - - msr = kvmppc_get_msr(vcpu); - pc = kvmppc_get_pc(vcpu); - new_msr = vcpu->arch.intr_msr; - new_pc = vec; - - /* If transactional, change to suspend mode on IRQ delivery */ - if (MSR_TM_TRANSACTIONAL(msr)) - new_msr |= MSR_TS_S; - else - new_msr |= msr & MSR_TS_MASK; - - kvmppc_set_srr0(vcpu, pc); - kvmppc_set_srr1(vcpu, (msr & SRR1_MSR_BITS) | srr1_flags); - kvmppc_set_pc(vcpu, new_pc); - kvmppc_set_msr(vcpu, new_msr); -} - -static void kvmppc_set_msr_hv(struct kvm_vcpu *vcpu, u64 msr) -{ - /* - * Check for illegal transactional state bit combination - * and if we find it, force the TS field to a safe state. - */ - if ((msr & MSR_TS_MASK) == MSR_TS_MASK) - msr &= ~MSR_TS_MASK; - vcpu->arch.shregs.msr = msr; - kvmppc_end_cede(vcpu); -} - static void kvmppc_set_pvr_hv(struct kvm_vcpu *vcpu, u32 pvr) { vcpu->arch.pvr = pvr; @@ -2475,15 +2441,6 @@ static void kvmppc_set_timer(struct kvm_vcpu *vcpu) vcpu->arch.timer_running = 1; } -static void kvmppc_end_cede(struct kvm_vcpu *vcpu) -{ - vcpu->arch.ceded = 0; - if (vcpu->arch.timer_running) { - hrtimer_try_to_cancel(&vcpu->arch.dec_timer); - vcpu->arch.timer_running = 0; - } -} - extern int __kvmppc_vcore_entry(void); static void kvmppc_remove_runnable(struct kvmppc_vcore *vc, diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index 7c1909657b556..068bee941a71c 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -755,6 +755,56 @@ void kvmhv_p9_restore_lpcr(struct kvm_split_mode *sip) local_paca->kvm_hstate.kvm_split_mode = NULL; } +static void kvmppc_end_cede(struct kvm_vcpu *vcpu) +{ + vcpu->arch.ceded = 0; + if (vcpu->arch.timer_running) { + hrtimer_try_to_cancel(&vcpu->arch.dec_timer); + vcpu->arch.timer_running = 0; + } +} + +void kvmppc_set_msr_hv(struct kvm_vcpu *vcpu, u64 msr) +{ + /* + * Check for illegal transactional state bit combination + * and if we find it, force the TS field to a safe state. + */ + if ((msr & MSR_TS_MASK) == MSR_TS_MASK) + msr &= ~MSR_TS_MASK; + vcpu->arch.shregs.msr = msr; + kvmppc_end_cede(vcpu); +} +EXPORT_SYMBOL_GPL(kvmppc_set_msr_hv); + +static void inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 srr1_flags) +{ + unsigned long msr, pc, new_msr, new_pc; + + msr = kvmppc_get_msr(vcpu); + pc = kvmppc_get_pc(vcpu); + new_msr = vcpu->arch.intr_msr; + new_pc = vec; + + /* If transactional, change to suspend mode on IRQ delivery */ + if (MSR_TM_TRANSACTIONAL(msr)) + new_msr |= MSR_TS_S; + else + new_msr |= msr & MSR_TS_MASK; + + kvmppc_set_srr0(vcpu, pc); + kvmppc_set_srr1(vcpu, (msr & SRR1_MSR_BITS) | srr1_flags); + kvmppc_set_pc(vcpu, new_pc); + vcpu->arch.shregs.msr = new_msr; +} + +void kvmppc_inject_interrupt_hv(struct kvm_vcpu *vcpu, int vec, u64 srr1_flags) +{ + inject_interrupt(vcpu, vec, srr1_flags); + kvmppc_end_cede(vcpu); +} +EXPORT_SYMBOL_GPL(kvmppc_inject_interrupt_hv); + /* * Is there a PRIV_DOORBELL pending for the guest (on POWER9)? * Can we inject a Decrementer or a External interrupt? @@ -762,7 +812,6 @@ void kvmhv_p9_restore_lpcr(struct kvm_split_mode *sip) void kvmppc_guest_entry_inject_int(struct kvm_vcpu *vcpu) { int ext; - unsigned long vec = 0; unsigned long lpcr; /* Insert EXTERNAL bit into LPCR at the MER bit position */ @@ -774,26 +823,16 @@ void kvmppc_guest_entry_inject_int(struct kvm_vcpu *vcpu) if (vcpu->arch.shregs.msr & MSR_EE) { if (ext) { - vec = BOOK3S_INTERRUPT_EXTERNAL; + inject_interrupt(vcpu, BOOK3S_INTERRUPT_EXTERNAL, 0); } else { long int dec = mfspr(SPRN_DEC); if (!(lpcr & LPCR_LD)) dec = (int) dec; if (dec < 0) - vec = BOOK3S_INTERRUPT_DECREMENTER; + inject_interrupt(vcpu, + BOOK3S_INTERRUPT_DECREMENTER, 0); } } - if (vec) { - unsigned long msr, old_msr = vcpu->arch.shregs.msr; - - kvmppc_set_srr0(vcpu, kvmppc_get_pc(vcpu)); - kvmppc_set_srr1(vcpu, old_msr); - kvmppc_set_pc(vcpu, vec); - msr = vcpu->arch.intr_msr; - if (MSR_TM_ACTIVE(old_msr)) - msr |= MSR_TS_S; - vcpu->arch.shregs.msr = msr; - } if (vcpu->arch.doorbell_request) { mtspr(SPRN_DPDES, 1); From 6a13cb0c376abb436d060b989018257963656d0c Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 2 Oct 2019 16:00:24 +1000 Subject: [PATCH 12/13] KVM: PPC: Book3S HV: Implement LPCR[AIL]=3 mode for injected interrupts kvmppc_inject_interrupt does not implement LPCR[AIL]!=0 modes, which can result in the guest receiving interrupts as if LPCR[AIL]=0 contrary to the ISA. In practice, Linux guests cope with this deviation, but it should be fixed. Signed-off-by: Nicholas Piggin Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_builtin.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index 068bee941a71c..7cd3cf3d366b7 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -792,6 +792,21 @@ static void inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 srr1_flags) else new_msr |= msr & MSR_TS_MASK; + /* + * Perform MSR and PC adjustment for LPCR[AIL]=3 if it is set and + * applicable. AIL=2 is not supported. + * + * AIL does not apply to SRESET, MCE, or HMI (which is never + * delivered to the guest), and does not apply if IR=0 or DR=0. + */ + if (vec != BOOK3S_INTERRUPT_SYSTEM_RESET && + vec != BOOK3S_INTERRUPT_MACHINE_CHECK && + (vcpu->arch.vcore->lpcr & LPCR_AIL) == LPCR_AIL_3 && + (msr & (MSR_IR|MSR_DR)) == (MSR_IR|MSR_DR) ) { + new_msr |= MSR_IR | MSR_DR; + new_pc += 0xC000000000004000ULL; + } + kvmppc_set_srr0(vcpu, pc); kvmppc_set_srr1(vcpu, (msr & SRR1_MSR_BITS) | srr1_flags); kvmppc_set_pc(vcpu, new_pc); From 55d7004299eb917767761f01a208d50afad4f535 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 2 Oct 2019 16:00:25 +1000 Subject: [PATCH 13/13] KVM: PPC: Book3S HV: Reject mflags=2 (LPCR[AIL]=2) ADDR_TRANS_MODE mode AIL=2 mode has no known users, so is not well tested or supported. Disallow guests from selecting this mode because it may become deprecated in future versions of the architecture. This policy decision is not left to QEMU because KVM support is required for AIL=2 (when injecting interrupts). Signed-off-by: Nicholas Piggin Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index c340d416dce3e..ec5c0379296a0 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -779,6 +779,11 @@ static int kvmppc_h_set_mode(struct kvm_vcpu *vcpu, unsigned long mflags, vcpu->arch.dawr = value1; vcpu->arch.dawrx = value2; return H_SUCCESS; + case H_SET_MODE_RESOURCE_ADDR_TRANS_MODE: + /* KVM does not support mflags=2 (AIL=2) */ + if (mflags != 0 && mflags != 3) + return H_UNSUPPORTED_FLAG_START; + return H_TOO_HARD; default: return H_TOO_HARD; }