Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm fixes from Paolo Bonzini:
 "ARM:

   - Correctly clean the BSS to the PoC before allowing EL2 to access it
     on nVHE/hVHE/protected configurations

   - Propagate ownership of debug registers in protected mode after the
     rework that landed in 6.14-rc1

   - Stop pretending that we can run the protected mode without a GICv3
     being present on the host

   - Fix a use-after-free situation that can occur if a vcpu fails to
     initialise the NV shadow S2 MMU contexts

   - Always evaluate the need to arm a background timer for fully
     emulated guest timers

   - Fix the emulation of EL1 timers in the absence of FEAT_ECV

   - Correctly handle the EL2 virtual timer, especially when HCR_EL2.E2H==0

  s390:

   - move some of the guest page table (gmap) logic into KVM itself,
     inching towards the final goal of completely removing gmap from the
     non-kvm memory management code.

     As an initial set of cleanups, move some code from mm/gmap into kvm
     and start using __kvm_faultin_pfn() to fault-in pages as needed;
     but especially stop abusing page->index and page->lru to aid in the
     pgdesc conversion.

  x86:

   - Add missing check in the fix to defer starting the huge page
     recovery vhost_task

   - SRSO_USER_KERNEL_NO does not need SYNTHESIZED_F"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (31 commits)
  KVM: x86/mmu: Ensure NX huge page recovery thread is alive before waking
  KVM: remove kvm_arch_post_init_vm
  KVM: selftests: Fix spelling mistake "initally" -> "initially"
  kvm: x86: SRSO_USER_KERNEL_NO is not synthesized
  KVM: arm64: timer: Don't adjust the EL2 virtual timer offset
  KVM: arm64: timer: Correctly handle EL1 timer emulation when !FEAT_ECV
  KVM: arm64: timer: Always evaluate the need for a soft timer
  KVM: arm64: Fix nested S2 MMU structures reallocation
  KVM: arm64: Fail protected mode init if no vgic hardware is present
  KVM: arm64: Flush/sync debug state in protected mode
  KVM: s390: selftests: Streamline uc_skey test to issue iske after sske
  KVM: s390: remove the last user of page->index
  KVM: s390: move PGSTE softbits
  KVM: s390: remove useless page->index usage
  KVM: s390: move gmap_shadow_pgt_lookup() into kvm
  KVM: s390: stop using lists to keep track of used dat tables
  KVM: s390: stop using page->index for non-shadow gmaps
  KVM: s390: move some gmap shadowing functions away from mm/gmap.c
  KVM: s390: get rid of gmap_translate()
  KVM: s390: get rid of gmap_fault()
  ...
Linus Torvalds committed Feb 9, 2025
2 parents 9946eaf + 43fb96a commit 954a209
Showing 31 changed files with 1,093 additions and 1,007 deletions.
2 changes: 1 addition & 1 deletion Documentation/virt/kvm/api.rst
@@ -1419,7 +1419,7 @@ fetch) is injected in the guest.
 S390:
 ^^^^^
 
-Returns -EINVAL if the VM has the KVM_VM_S390_UCONTROL flag set.
+Returns -EINVAL or -EEXIST if the VM has the KVM_VM_S390_UCONTROL flag set.
 Returns -EINVAL if called on a protected VM.
 
 4.36 KVM_SET_TSS_ADDR
49 changes: 11 additions & 38 deletions arch/arm64/kvm/arch_timer.c
@@ -471,10 +471,8 @@ static void timer_emulate(struct arch_timer_context *ctx)
 
 	trace_kvm_timer_emulate(ctx, should_fire);
 
-	if (should_fire != ctx->irq.level) {
+	if (should_fire != ctx->irq.level)
 		kvm_timer_update_irq(ctx->vcpu, should_fire, ctx);
-		return;
-	}
 
 	kvm_timer_update_status(ctx, should_fire);
 
@@ -761,21 +759,6 @@ static void kvm_timer_vcpu_load_nested_switch(struct kvm_vcpu *vcpu,
 					    timer_irq(map->direct_ptimer),
 					    &arch_timer_irq_ops);
 		WARN_ON_ONCE(ret);
-
-		/*
-		 * The virtual offset behaviour is "interesting", as it
-		 * always applies when HCR_EL2.E2H==0, but only when
-		 * accessed from EL1 when HCR_EL2.E2H==1. So make sure we
-		 * track E2H when putting the HV timer in "direct" mode.
-		 */
-		if (map->direct_vtimer == vcpu_hvtimer(vcpu)) {
-			struct arch_timer_offset *offs = &map->direct_vtimer->offset;
-
-			if (vcpu_el2_e2h_is_set(vcpu))
-				offs->vcpu_offset = NULL;
-			else
-				offs->vcpu_offset = &__vcpu_sys_reg(vcpu, CNTVOFF_EL2);
-		}
 	}
 }
 
@@ -976,31 +959,21 @@ void kvm_timer_sync_nested(struct kvm_vcpu *vcpu)
 	 * which allows trapping of the timer registers even with NV2.
 	 * Still, this is still worse than FEAT_NV on its own. Meh.
 	 */
-	if (!vcpu_el2_e2h_is_set(vcpu)) {
-		if (cpus_have_final_cap(ARM64_HAS_ECV))
-			return;
-
-		/*
-		 * A non-VHE guest hypervisor doesn't have any direct access
-		 * to its timers: the EL2 registers trap (and the HW is
-		 * fully emulated), while the EL0 registers access memory
-		 * despite the access being notionally direct. Boo.
-		 *
-		 * We update the hardware timer registers with the
-		 * latest value written by the guest to the VNCR page
-		 * and let the hardware take care of the rest.
-		 */
-		write_sysreg_el0(__vcpu_sys_reg(vcpu, CNTV_CTL_EL0), SYS_CNTV_CTL);
-		write_sysreg_el0(__vcpu_sys_reg(vcpu, CNTV_CVAL_EL0), SYS_CNTV_CVAL);
-		write_sysreg_el0(__vcpu_sys_reg(vcpu, CNTP_CTL_EL0), SYS_CNTP_CTL);
-		write_sysreg_el0(__vcpu_sys_reg(vcpu, CNTP_CVAL_EL0), SYS_CNTP_CVAL);
-	} else {
-		if (!cpus_have_final_cap(ARM64_HAS_ECV)) {
+	if (!cpus_have_final_cap(ARM64_HAS_ECV)) {
 		/*
 		 * For a VHE guest hypervisor, the EL2 state is directly
-		 * stored in the host EL1 timers, while the emulated EL0
+		 * stored in the host EL1 timers, while the emulated EL1
 		 * state is stored in the VNCR page. The latter could have
 		 * been updated behind our back, and we must reset the
 		 * emulation of the timers.
+		 *
+		 * A non-VHE guest hypervisor doesn't have any direct access
+		 * to its timers: the EL2 registers trap despite being
+		 * notionally direct (we use the EL1 HW, as for VHE), while
+		 * the EL1 registers access memory.
+		 *
+		 * In both cases, process the emulated timers on each guest
+		 * exit. Boo.
+		 */
 		struct timer_map map;
 		get_timer_map(vcpu, &map);
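
The comment deleted above still describes the architectural rule behind the fix: CNTVOFF_EL2 applies to the virtual counter whenever HCR_EL2.E2H==0, but only to EL1 accesses when E2H==1. A minimal sketch of that rule, with hypothetical names (not code from this commit):

/*
 * Sketch only, not from this commit: how the virtual counter offset is
 * applied depending on HCR_EL2.E2H and on the exception level doing the
 * read. All names are illustrative.
 */
static u64 virtual_count(u64 physical_count, u64 cntvoff, bool e2h, bool from_el1)
{
	/* E2H==0: the offset always applies; E2H==1: only for EL1 accesses. */
	if (!e2h || from_el1)
		return physical_count - cntvoff;

	return physical_count;
}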
20 changes: 20 additions & 0 deletions arch/arm64/kvm/arm.c
@@ -2290,6 +2290,19 @@ static int __init init_subsystems(void)
 		break;
 	case -ENODEV:
 	case -ENXIO:
+		/*
+		 * No VGIC? No pKVM for you.
+		 *
+		 * Protected mode assumes that VGICv3 is present, so no point
+		 * in trying to hobble along if vgic initialization fails.
+		 */
+		if (is_protected_kvm_enabled())
+			goto out;
+
+		/*
+		 * Otherwise, userspace could choose to implement a GIC for its
+		 * guest on non-cooperative hardware.
+		 */
 		vgic_present = false;
 		err = 0;
 		break;
@@ -2400,6 +2413,13 @@ static void kvm_hyp_init_symbols(void)
 	kvm_nvhe_sym(id_aa64smfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64SMFR0_EL1);
 	kvm_nvhe_sym(__icache_flags) = __icache_flags;
 	kvm_nvhe_sym(kvm_arm_vmid_bits) = kvm_arm_vmid_bits;
+
+	/*
+	 * Flush entire BSS since part of its data containing init symbols is read
+	 * while the MMU is off.
+	 */
+	kvm_flush_dcache_to_poc(kvm_ksym_ref(__hyp_bss_start),
+				kvm_ksym_ref(__hyp_bss_end) - kvm_ksym_ref(__hyp_bss_start));
 }
 
 static int __init kvm_hyp_init_protection(u32 hyp_va_bits)
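
The new flush follows a general pattern: data written while the MMU and caches are on, but read later with the MMU off (as EL2 does during early init), must first be cleaned to the Point of Coherency. A sketch of that pattern under assumed names (early_cfg and publish_early_cfg() are hypothetical; kvm_flush_dcache_to_poc() is the helper used above):

/*
 * Illustrative only: "publish" data for an MMU-off reader by cleaning
 * it to the PoC after the last cached write.
 */
static struct {
	u64 feature_mask;
	u64 vmid_bits;
} early_cfg;

static void publish_early_cfg(u64 features, u64 vmid_bits)
{
	early_cfg.feature_mask = features;
	early_cfg.vmid_bits = vmid_bits;

	/* A non-cacheable reader cannot see these writes until they reach the PoC. */
	kvm_flush_dcache_to_poc(&early_cfg, sizeof(early_cfg));
}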
24 changes: 24 additions & 0 deletions arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -91,11 +91,34 @@ static void fpsimd_sve_sync(struct kvm_vcpu *vcpu)
 	*host_data_ptr(fp_owner) = FP_STATE_HOST_OWNED;
 }
 
+static void flush_debug_state(struct pkvm_hyp_vcpu *hyp_vcpu)
+{
+	struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu;
+
+	hyp_vcpu->vcpu.arch.debug_owner = host_vcpu->arch.debug_owner;
+
+	if (kvm_guest_owns_debug_regs(&hyp_vcpu->vcpu))
+		hyp_vcpu->vcpu.arch.vcpu_debug_state = host_vcpu->arch.vcpu_debug_state;
+	else if (kvm_host_owns_debug_regs(&hyp_vcpu->vcpu))
+		hyp_vcpu->vcpu.arch.external_debug_state = host_vcpu->arch.external_debug_state;
+}
+
+static void sync_debug_state(struct pkvm_hyp_vcpu *hyp_vcpu)
+{
+	struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu;
+
+	if (kvm_guest_owns_debug_regs(&hyp_vcpu->vcpu))
+		host_vcpu->arch.vcpu_debug_state = hyp_vcpu->vcpu.arch.vcpu_debug_state;
+	else if (kvm_host_owns_debug_regs(&hyp_vcpu->vcpu))
+		host_vcpu->arch.external_debug_state = hyp_vcpu->vcpu.arch.external_debug_state;
+}
+
 static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu)
 {
 	struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu;
 
 	fpsimd_sve_flush();
+	flush_debug_state(hyp_vcpu);
 
 	hyp_vcpu->vcpu.arch.ctxt = host_vcpu->arch.ctxt;
 
@@ -123,6 +146,7 @@ static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu)
 	unsigned int i;
 
 	fpsimd_sve_sync(&hyp_vcpu->vcpu);
+	sync_debug_state(hyp_vcpu);
 
 	host_vcpu->arch.ctxt = hyp_vcpu->vcpu.arch.ctxt;
 
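
The new helpers mirror the FP/SVE pair just above them: flush copies host state into the hypervisor's private vcpu before the guest runs, sync copies it back out afterwards. A sketch of where the pair sits (run_protected_vcpu() is a hypothetical wrapper for illustration; in the real source the calls live in the vcpu-run handler):

/*
 * Sketch only: the flush/sync pair brackets a protected-mode vcpu run,
 * now carrying the debug register state as well.
 */
static int run_protected_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu)
{
	int ret;

	flush_hyp_vcpu(hyp_vcpu);		/* host -> hyp, incl. debug state */
	ret = __kvm_vcpu_run(&hyp_vcpu->vcpu);
	sync_hyp_vcpu(hyp_vcpu);		/* hyp -> host, incl. debug state */

	return ret;
}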
9 changes: 5 additions & 4 deletions arch/arm64/kvm/nested.c
@@ -67,26 +67,27 @@ int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu)
 	if (!tmp)
 		return -ENOMEM;
 
+	swap(kvm->arch.nested_mmus, tmp);
+
 	/*
 	 * If we went through a realocation, adjust the MMU back-pointers in
 	 * the previously initialised kvm_pgtable structures.
 	 */
 	if (kvm->arch.nested_mmus != tmp)
 		for (int i = 0; i < kvm->arch.nested_mmus_size; i++)
-			tmp[i].pgt->mmu = &tmp[i];
+			kvm->arch.nested_mmus[i].pgt->mmu = &kvm->arch.nested_mmus[i];
 
 	for (int i = kvm->arch.nested_mmus_size; !ret && i < num_mmus; i++)
-		ret = init_nested_s2_mmu(kvm, &tmp[i]);
+		ret = init_nested_s2_mmu(kvm, &kvm->arch.nested_mmus[i]);
 
 	if (ret) {
 		for (int i = kvm->arch.nested_mmus_size; i < num_mmus; i++)
-			kvm_free_stage2_pgd(&tmp[i]);
+			kvm_free_stage2_pgd(&kvm->arch.nested_mmus[i]);
 
 		return ret;
 	}
 
 	kvm->arch.nested_mmus_size = num_mmus;
-	kvm->arch.nested_mmus = tmp;
 
 	return 0;
 }
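
The bug class fixed here is generic: when a resizable array holds objects that point back into the array, a reallocation that moves the block leaves those back-pointers dangling unless they are rebuilt against the new base. A small standalone illustration of the pattern (plain C, hypothetical names, not kernel code):

#include <stdlib.h>

/* Each element carries a back-pointer into the array that owns it. */
struct node {
	struct node *self;
};

/*
 * Grow the array from old_n to new_n elements. If realloc() moved the
 * block, every existing back-pointer must be refreshed, otherwise it
 * still references the freed allocation (a use-after-free).
 */
static struct node *grow(struct node *arr, size_t old_n, size_t new_n)
{
	struct node *tmp = realloc(arr, new_n * sizeof(*tmp));

	if (!tmp)
		return NULL;

	for (size_t i = 0; i < old_n; i++)
		tmp[i].self = &tmp[i];

	return tmp;
}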
16 changes: 13 additions & 3 deletions arch/arm64/kvm/sys_regs.c
@@ -1452,6 +1452,16 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu,
 	return true;
 }
 
+static bool access_hv_timer(struct kvm_vcpu *vcpu,
+			    struct sys_reg_params *p,
+			    const struct sys_reg_desc *r)
+{
+	if (!vcpu_el2_e2h_is_set(vcpu))
+		return undef_access(vcpu, p, r);
+
+	return access_arch_timer(vcpu, p, r);
+}
+
 static s64 kvm_arm64_ftr_safe_value(u32 id, const struct arm64_ftr_bits *ftrp,
 				    s64 new, s64 cur)
 {
@@ -3103,9 +3113,9 @@ static const struct sys_reg_desc sys_reg_descs[] = {
 	EL2_REG(CNTHP_CTL_EL2, access_arch_timer, reset_val, 0),
 	EL2_REG(CNTHP_CVAL_EL2, access_arch_timer, reset_val, 0),
 
-	{ SYS_DESC(SYS_CNTHV_TVAL_EL2), access_arch_timer },
-	EL2_REG(CNTHV_CTL_EL2, access_arch_timer, reset_val, 0),
-	EL2_REG(CNTHV_CVAL_EL2, access_arch_timer, reset_val, 0),
+	{ SYS_DESC(SYS_CNTHV_TVAL_EL2), access_hv_timer },
+	EL2_REG(CNTHV_CTL_EL2, access_hv_timer, reset_val, 0),
+	EL2_REG(CNTHV_CVAL_EL2, access_hv_timer, reset_val, 0),
 
 	{ SYS_DESC(SYS_CNTKCTL_EL12), access_cntkctl_el12 },
 
20 changes: 6 additions & 14 deletions arch/s390/include/asm/gmap.h
@@ -23,7 +23,6 @@
 /**
  * struct gmap_struct - guest address space
  * @list: list head for the mm->context gmap list
- * @crst_list: list of all crst tables used in the guest address space
  * @mm: pointer to the parent mm_struct
  * @guest_to_host: radix tree with guest to host address translation
  * @host_to_guest: radix tree with pointer to segment table entries
@@ -35,7 +34,6 @@
  * @guest_handle: protected virtual machine handle for the ultravisor
  * @host_to_rmap: radix tree with gmap_rmap lists
  * @children: list of shadow gmap structures
- * @pt_list: list of all page tables used in the shadow guest address space
  * @shadow_lock: spinlock to protect the shadow gmap list
  * @parent: pointer to the parent gmap for shadow guest address spaces
  * @orig_asce: ASCE for which the shadow page table has been created
@@ -45,7 +43,6 @@
  */
 struct gmap {
 	struct list_head list;
-	struct list_head crst_list;
 	struct mm_struct *mm;
 	struct radix_tree_root guest_to_host;
 	struct radix_tree_root host_to_guest;
@@ -61,7 +58,6 @@ struct gmap {
 	/* Additional data for shadow guest address spaces */
 	struct radix_tree_root host_to_rmap;
 	struct list_head children;
-	struct list_head pt_list;
 	spinlock_t shadow_lock;
 	struct gmap *parent;
 	unsigned long orig_asce;
@@ -106,23 +102,21 @@ struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit);
 void gmap_remove(struct gmap *gmap);
 struct gmap *gmap_get(struct gmap *gmap);
 void gmap_put(struct gmap *gmap);
+void gmap_free(struct gmap *gmap);
+struct gmap *gmap_alloc(unsigned long limit);
 
 int gmap_map_segment(struct gmap *gmap, unsigned long from,
 		     unsigned long to, unsigned long len);
 int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len);
 unsigned long __gmap_translate(struct gmap *, unsigned long gaddr);
-unsigned long gmap_translate(struct gmap *, unsigned long gaddr);
 int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr);
-int gmap_fault(struct gmap *, unsigned long gaddr, unsigned int fault_flags);
 void gmap_discard(struct gmap *, unsigned long from, unsigned long to);
 void __gmap_zap(struct gmap *, unsigned long gaddr);
 void gmap_unlink(struct mm_struct *, unsigned long *table, unsigned long vmaddr);
 
 int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val);
 
-struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
-			 int edat_level);
-int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level);
+void gmap_unshadow(struct gmap *sg);
 int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
 		    int fake);
 int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
@@ -131,24 +125,22 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
 		    int fake);
 int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
 		    int fake);
-int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr,
-			   unsigned long *pgt, int *dat_protection, int *fake);
 int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte);
 
 void gmap_register_pte_notifier(struct gmap_notifier *);
 void gmap_unregister_pte_notifier(struct gmap_notifier *);
 
-int gmap_mprotect_notify(struct gmap *, unsigned long start,
-			 unsigned long len, int prot);
+int gmap_protect_one(struct gmap *gmap, unsigned long gaddr, int prot, unsigned long bits);
 
 void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long dirty_bitmap[4],
 			     unsigned long gaddr, unsigned long vmaddr);
 int s390_disable_cow_sharing(void);
-void s390_unlist_old_asce(struct gmap *gmap);
 int s390_replace_asce(struct gmap *gmap);
 void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns);
 int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start,
 			    unsigned long end, bool interruptible);
+int kvm_s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio, bool split);
+unsigned long *gmap_table_walk(struct gmap *gmap, unsigned long gaddr, int level);
 
 /**
  * s390_uv_destroy_range - Destroy a range of pages in the given mm.
6 changes: 5 additions & 1 deletion arch/s390/include/asm/kvm_host.h
@@ -30,6 +30,8 @@
 #define KVM_S390_ESCA_CPU_SLOTS 248
 #define KVM_MAX_VCPUS 255
 
+#define KVM_INTERNAL_MEM_SLOTS 1
+
 /*
  * These seem to be used for allocating ->chip in the routing table, which we
  * don't use. 1 is as small as we can get to reduce the needed memory. If we
@@ -931,12 +933,14 @@ struct sie_page2 {
 	u8 reserved928[0x1000 - 0x928];			/* 0x0928 */
 };
 
+struct vsie_page;
+
 struct kvm_s390_vsie {
 	struct mutex mutex;
 	struct radix_tree_root addr_to_page;
 	int page_count;
 	int next;
-	struct page *pages[KVM_MAX_VCPUS];
+	struct vsie_page *pages[KVM_MAX_VCPUS];
 };
 
 struct kvm_s390_gisa_iam {
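
Storing struct vsie_page pointers instead of struct page pointers is part of the series' move away from stashing metadata in struct page fields. The general shape of that change, as an illustrative sketch (hypothetical names, not the actual s390 layout):

/*
 * Sketch only: instead of borrowing page->index and page->lru to
 * describe an object, give the object its own descriptor that carries
 * the metadata alongside the backing page.
 */
struct obj_desc {
	struct page *page;		/* backing storage */
	unsigned long tag;		/* what page->index used to smuggle */
	struct list_head lru;		/* what page->lru used to provide */
};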
21 changes: 18 additions & 3 deletions arch/s390/include/asm/pgtable.h
@@ -420,9 +420,10 @@ void setup_protection_map(void);
 #define PGSTE_HC_BIT	0x0020000000000000UL
 #define PGSTE_GR_BIT	0x0004000000000000UL
 #define PGSTE_GC_BIT	0x0002000000000000UL
-#define PGSTE_UC_BIT	0x0000800000000000UL	/* user dirty (migration) */
-#define PGSTE_IN_BIT	0x0000400000000000UL	/* IPTE notify bit */
-#define PGSTE_VSIE_BIT	0x0000200000000000UL	/* ref'd in a shadow table */
+#define PGSTE_ST2_MASK	0x0000ffff00000000UL
+#define PGSTE_UC_BIT	0x0000000000008000UL	/* user dirty (migration) */
+#define PGSTE_IN_BIT	0x0000000000004000UL	/* IPTE notify bit */
+#define PGSTE_VSIE_BIT	0x0000000000002000UL	/* ref'd in a shadow table */
 
 /* Guest Page State used for virtualization */
 #define _PGSTE_GPS_ZERO			0x0000000080000000UL
@@ -2007,4 +2008,18 @@ extern void s390_reset_cmma(struct mm_struct *mm);
 #define pmd_pgtable(pmd) \
 	((pgtable_t)__va(pmd_val(pmd) & -sizeof(pte_t)*PTRS_PER_PTE))
 
+static inline unsigned long gmap_pgste_get_pgt_addr(unsigned long *pgt)
+{
+	unsigned long *pgstes, res;
+
+	pgstes = pgt + _PAGE_ENTRIES;
+
+	res = (pgstes[0] & PGSTE_ST2_MASK) << 16;
+	res |= pgstes[1] & PGSTE_ST2_MASK;
+	res |= (pgstes[2] & PGSTE_ST2_MASK) >> 16;
+	res |= (pgstes[3] & PGSTE_ST2_MASK) >> 32;
+
+	return res;
+}
+
 #endif /* _S390_PAGE_H */
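
The new getter reassembles a 64-bit value that has been scattered, 16 bits at a time, across the ST2 field (PGSTE_ST2_MASK, bits 32-47) of the first four PGSTEs. The inverse direction makes the layout explicit; the sketch below is for illustration only, and gmap_pgste_set_pgt_addr() is a hypothetical name, not part of this commit:

/*
 * Sketch only: scatter a 64-bit address across the ST2 fields of four
 * PGSTEs, 16 bits each, so that gmap_pgste_get_pgt_addr() above can
 * reassemble it. Assumes the ST2 fields start out clear.
 */
static inline void gmap_pgste_set_pgt_addr(unsigned long *pgt, unsigned long addr)
{
	unsigned long *pgstes = pgt + _PAGE_ENTRIES;

	pgstes[0] |= (addr >> 16) & PGSTE_ST2_MASK;	/* bits 63:48 */
	pgstes[1] |= addr & PGSTE_ST2_MASK;		/* bits 47:32 */
	pgstes[2] |= (addr << 16) & PGSTE_ST2_MASK;	/* bits 31:16 */
	pgstes[3] |= (addr << 32) & PGSTE_ST2_MASK;	/* bits 15:0 */
}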
6 changes: 3 additions & 3 deletions arch/s390/include/asm/uv.h
@@ -628,12 +628,12 @@ static inline int is_prot_virt_host(void)
 }
 
 int uv_pin_shared(unsigned long paddr);
-int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb);
-int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr);
 int uv_destroy_folio(struct folio *folio);
 int uv_destroy_pte(pte_t pte);
 int uv_convert_from_secure_pte(pte_t pte);
-int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr);
+int make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb);
+int uv_convert_from_secure(unsigned long paddr);
+int uv_convert_from_secure_folio(struct folio *folio);
 
 void setup_uv(void);
 
