Skip to content

Commit

Permalink
Merge branch kvm-arm64/nv-timers into kvmarm-master/next
Browse files Browse the repository at this point in the history
* kvm-arm64/nv-timers:
  : .
  : Nested Virt support for the EL2 timers. From the initial cover letter:
  :
  : "Here's another batch of NV-related patches, this time bringing in most
  : of the timer support for EL2 as well as nested guests.
  :
  : The code is pretty convoluted for a bunch of reasons:
  :
  : - FEAT_NV2 breaks the timer semantics by redirecting HW controls to
  :   memory, meaning that a guest could setup a timer and never see it
  :   firing until the next exit
  :
  : - We go try hard to reflect the timer state in memory, but that's not
  :   great.
  :
  : - With FEAT_ECV, we can finally correctly emulate the virtual timer,
  :   but this emulation is pretty costly
  :
  : - As a way to make things suck less, we handle timer reads as early as
  :   possible, and only defer writes to the normal trap handling
  :
  : - Finally, some implementations are badly broken, and require some
  :   hand-holding, irrespective of NV support. So we try and reuse the NV
  :   infrastructure to make them usable. This could be further optimised,
  :   but I'm running out of patience for this sort of HW.
  :
  : [...]"
  : .
  KVM: arm64: nv: Fix doc header layout for timers
  KVM: arm64: nv: Document EL2 timer API
  KVM: arm64: Work around x1e's CNTVOFF_EL2 bogosity
  KVM: arm64: nv: Sanitise CNTHCTL_EL2
  KVM: arm64: nv: Propagate CNTHCTL_EL2.EL1NV{P,V}CT bits
  KVM: arm64: nv: Add trap routing for CNTHCTL_EL2.EL1{NVPCT,NVVCT,TVT,TVCT}
  KVM: arm64: Handle counter access early in non-HYP context
  KVM: arm64: nv: Accelerate EL0 counter accesses from hypervisor context
  KVM: arm64: nv: Accelerate EL0 timer read accesses when FEAT_ECV in use
  KVM: arm64: nv: Use FEAT_ECV to trap access to EL0 timers
  KVM: arm64: nv: Publish emulated timer interrupt state in the in-memory state
  KVM: arm64: nv: Sync nested timer state with FEAT_NV2
  KVM: arm64: nv: Add handling of EL2-specific timer registers

Signed-off-by: Marc Zyngier <maz@kernel.org>
  • Loading branch information
Marc Zyngier committed Jan 17, 2025
2 parents e880b16 + 5447863 commit 080612b
Show file tree
Hide file tree
Showing 17 changed files with 579 additions and 47 deletions.
14 changes: 8 additions & 6 deletions Documentation/virt/kvm/devices/vcpu.rst
Original file line number Diff line number Diff line change
Expand Up @@ -142,8 +142,8 @@ the cpu field to the processor id.

:Architectures: ARM64

2.1. ATTRIBUTES: KVM_ARM_VCPU_TIMER_IRQ_VTIMER, KVM_ARM_VCPU_TIMER_IRQ_PTIMER
-----------------------------------------------------------------------------
2.1. ATTRIBUTES: KVM_ARM_VCPU_TIMER_IRQ_{VTIMER,PTIMER,HVTIMER,HPTIMER}
-----------------------------------------------------------------------

:Parameters: in kvm_device_attr.addr the address for the timer interrupt is a
pointer to an int
Expand All @@ -159,10 +159,12 @@ A value describing the architected timer interrupt number when connected to an
in-kernel virtual GIC. These must be a PPI (16 <= intid < 32). Setting the
attribute overrides the default values (see below).

============================= ==========================================
KVM_ARM_VCPU_TIMER_IRQ_VTIMER The EL1 virtual timer intid (default: 27)
KVM_ARM_VCPU_TIMER_IRQ_PTIMER The EL1 physical timer intid (default: 30)
============================= ==========================================
============================== ==========================================
KVM_ARM_VCPU_TIMER_IRQ_VTIMER The EL1 virtual timer intid (default: 27)
KVM_ARM_VCPU_TIMER_IRQ_PTIMER The EL1 physical timer intid (default: 30)
KVM_ARM_VCPU_TIMER_IRQ_HVTIMER The EL2 virtual timer intid (default: 28)
KVM_ARM_VCPU_TIMER_IRQ_HPTIMER The EL2 physical timer intid (default: 26)
============================== ==========================================

Setting the same PPI for different timers will prevent the VCPUs from running.
Setting the interrupt number on a VCPU configures all VCPUs created at that
Expand Down
2 changes: 2 additions & 0 deletions arch/arm64/include/asm/cputype.h
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@
#define QCOM_CPU_PART_KRYO_3XX_SILVER 0x803
#define QCOM_CPU_PART_KRYO_4XX_GOLD 0x804
#define QCOM_CPU_PART_KRYO_4XX_SILVER 0x805
#define QCOM_CPU_PART_ORYON_X1 0x001

#define NVIDIA_CPU_PART_DENVER 0x003
#define NVIDIA_CPU_PART_CARMEL 0x004
Expand Down Expand Up @@ -198,6 +199,7 @@
#define MIDR_QCOM_KRYO_3XX_SILVER MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_3XX_SILVER)
#define MIDR_QCOM_KRYO_4XX_GOLD MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_4XX_GOLD)
#define MIDR_QCOM_KRYO_4XX_SILVER MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_4XX_SILVER)
#define MIDR_QCOM_ORYON_X1 MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_ORYON_X1)
#define MIDR_NVIDIA_DENVER MIDR_CPU_MODEL(ARM_CPU_IMP_NVIDIA, NVIDIA_CPU_PART_DENVER)
#define MIDR_NVIDIA_CARMEL MIDR_CPU_MODEL(ARM_CPU_IMP_NVIDIA, NVIDIA_CPU_PART_CARMEL)
#define MIDR_FUJITSU_A64FX MIDR_CPU_MODEL(ARM_CPU_IMP_FUJITSU, FUJITSU_CPU_PART_A64FX)
Expand Down
2 changes: 1 addition & 1 deletion arch/arm64/include/asm/kvm_host.h
Original file line number Diff line number Diff line change
Expand Up @@ -493,7 +493,6 @@ enum vcpu_sysreg {
VBAR_EL2, /* Vector Base Address Register (EL2) */
RVBAR_EL2, /* Reset Vector Base Address Register */
CONTEXTIDR_EL2, /* Context ID Register (EL2) */
CNTHCTL_EL2, /* Counter-timer Hypervisor Control register */
SP_EL2, /* EL2 Stack Pointer */
CNTHP_CTL_EL2,
CNTHP_CVAL_EL2,
Expand All @@ -504,6 +503,7 @@ enum vcpu_sysreg {
MARKER(__SANITISED_REG_START__),
TCR2_EL2, /* Extended Translation Control Register (EL2) */
MDCR_EL2, /* Monitor Debug Configuration Register (EL2) */
CNTHCTL_EL2, /* Counter-timer Hypervisor Control register */

/* Any VNCR-capable reg goes after this point */
MARKER(__VNCR_START__),
Expand Down
4 changes: 4 additions & 0 deletions arch/arm64/include/asm/sysreg.h
Original file line number Diff line number Diff line change
Expand Up @@ -477,21 +477,25 @@
#define SYS_CNTFRQ_EL0 sys_reg(3, 3, 14, 0, 0)

#define SYS_CNTPCT_EL0 sys_reg(3, 3, 14, 0, 1)
#define SYS_CNTVCT_EL0 sys_reg(3, 3, 14, 0, 2)
#define SYS_CNTPCTSS_EL0 sys_reg(3, 3, 14, 0, 5)
#define SYS_CNTVCTSS_EL0 sys_reg(3, 3, 14, 0, 6)

#define SYS_CNTP_TVAL_EL0 sys_reg(3, 3, 14, 2, 0)
#define SYS_CNTP_CTL_EL0 sys_reg(3, 3, 14, 2, 1)
#define SYS_CNTP_CVAL_EL0 sys_reg(3, 3, 14, 2, 2)

#define SYS_CNTV_TVAL_EL0 sys_reg(3, 3, 14, 3, 0)
#define SYS_CNTV_CTL_EL0 sys_reg(3, 3, 14, 3, 1)
#define SYS_CNTV_CVAL_EL0 sys_reg(3, 3, 14, 3, 2)

#define SYS_AARCH32_CNTP_TVAL sys_reg(0, 0, 14, 2, 0)
#define SYS_AARCH32_CNTP_CTL sys_reg(0, 0, 14, 2, 1)
#define SYS_AARCH32_CNTPCT sys_reg(0, 0, 0, 14, 0)
#define SYS_AARCH32_CNTVCT sys_reg(0, 1, 0, 14, 0)
#define SYS_AARCH32_CNTP_CVAL sys_reg(0, 2, 0, 14, 0)
#define SYS_AARCH32_CNTPCTSS sys_reg(0, 8, 0, 14, 0)
#define SYS_AARCH32_CNTVCTSS sys_reg(0, 9, 0, 14, 0)

#define __PMEV_op2(n) ((n) & 0x7)
#define __CNTR_CRm(n) (0x8 | (((n) >> 3) & 0x3))
Expand Down
8 changes: 8 additions & 0 deletions arch/arm64/kernel/cpu_errata.c
Original file line number Diff line number Diff line change
Expand Up @@ -786,6 +786,14 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
ERRATA_MIDR_RANGE_LIST(erratum_ac03_cpu_38_list),
},
#endif
{
.desc = "Broken CNTVOFF_EL2",
.capability = ARM64_WORKAROUND_QCOM_ORYON_CNTVOFF,
ERRATA_MIDR_RANGE_LIST(((const struct midr_range[]) {
MIDR_ALL_VERSIONS(MIDR_QCOM_ORYON_X1),
{}
})),
},
{
}
};
3 changes: 3 additions & 0 deletions arch/arm64/kernel/image-vars.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,9 @@ KVM_NVHE_ALIAS(__hyp_stub_vectors);
KVM_NVHE_ALIAS(vgic_v2_cpuif_trap);
KVM_NVHE_ALIAS(vgic_v3_cpuif_trap);

/* Static key which is set if CNTVOFF_EL2 is unusable */
KVM_NVHE_ALIAS(broken_cntvoff_key);

/* EL2 exception handling */
KVM_NVHE_ALIAS(__start___kvm_ex_table);
KVM_NVHE_ALIAS(__stop___kvm_ex_table);
Expand Down
179 changes: 159 additions & 20 deletions arch/arm64/kvm/arch_timer.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ static u32 host_vtimer_irq_flags;
static u32 host_ptimer_irq_flags;

static DEFINE_STATIC_KEY_FALSE(has_gic_active_state);
DEFINE_STATIC_KEY_FALSE(broken_cntvoff_key);

static const u8 default_ppi[] = {
[TIMER_PTIMER] = 30,
Expand Down Expand Up @@ -101,21 +102,6 @@ u64 timer_get_cval(struct arch_timer_context *ctxt)
}
}

static u64 timer_get_offset(struct arch_timer_context *ctxt)
{
u64 offset = 0;

if (!ctxt)
return 0;

if (ctxt->offset.vm_offset)
offset += *ctxt->offset.vm_offset;
if (ctxt->offset.vcpu_offset)
offset += *ctxt->offset.vcpu_offset;

return offset;
}

static void timer_set_ctl(struct arch_timer_context *ctxt, u32 ctl)
{
struct kvm_vcpu *vcpu = ctxt->vcpu;
Expand Down Expand Up @@ -441,11 +427,30 @@ void kvm_timer_update_run(struct kvm_vcpu *vcpu)
regs->device_irq_level |= KVM_ARM_DEV_EL1_PTIMER;
}

static void kvm_timer_update_status(struct arch_timer_context *ctx, bool level)
{
/*
* Paper over NV2 brokenness by publishing the interrupt status
* bit. This still results in a poor quality of emulation (guest
* writes will have no effect until the next exit).
*
* But hey, it's fast, right?
*/
if (is_hyp_ctxt(ctx->vcpu) &&
(ctx == vcpu_vtimer(ctx->vcpu) || ctx == vcpu_ptimer(ctx->vcpu))) {
unsigned long val = timer_get_ctl(ctx);
__assign_bit(__ffs(ARCH_TIMER_CTRL_IT_STAT), &val, level);
timer_set_ctl(ctx, val);
}
}

static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level,
struct arch_timer_context *timer_ctx)
{
int ret;

kvm_timer_update_status(timer_ctx, new_level);

timer_ctx->irq.level = new_level;
trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_irq(timer_ctx),
timer_ctx->irq.level);
Expand All @@ -471,6 +476,8 @@ static void timer_emulate(struct arch_timer_context *ctx)
return;
}

kvm_timer_update_status(ctx, should_fire);

/*
* If the timer can fire now, we don't need to have a soft timer
* scheduled for the future. If the timer cannot fire at all,
Expand Down Expand Up @@ -513,7 +520,12 @@ static void timer_save_state(struct arch_timer_context *ctx)
case TIMER_VTIMER:
case TIMER_HVTIMER:
timer_set_ctl(ctx, read_sysreg_el0(SYS_CNTV_CTL));
timer_set_cval(ctx, read_sysreg_el0(SYS_CNTV_CVAL));
cval = read_sysreg_el0(SYS_CNTV_CVAL);

if (has_broken_cntvoff())
cval -= timer_get_offset(ctx);

timer_set_cval(ctx, cval);

/* Disable the timer */
write_sysreg_el0(0, SYS_CNTV_CTL);
Expand Down Expand Up @@ -618,8 +630,15 @@ static void timer_restore_state(struct arch_timer_context *ctx)

case TIMER_VTIMER:
case TIMER_HVTIMER:
set_cntvoff(timer_get_offset(ctx));
write_sysreg_el0(timer_get_cval(ctx), SYS_CNTV_CVAL);
cval = timer_get_cval(ctx);
offset = timer_get_offset(ctx);
if (has_broken_cntvoff()) {
set_cntvoff(0);
cval += offset;
} else {
set_cntvoff(offset);
}
write_sysreg_el0(cval, SYS_CNTV_CVAL);
isb();
write_sysreg_el0(timer_get_ctl(ctx), SYS_CNTV_CTL);
break;
Expand Down Expand Up @@ -762,7 +781,7 @@ static void kvm_timer_vcpu_load_nested_switch(struct kvm_vcpu *vcpu,

static void timer_set_traps(struct kvm_vcpu *vcpu, struct timer_map *map)
{
bool tpt, tpc;
bool tvt, tpt, tvc, tpc, tvt02, tpt02;
u64 clr, set;

/*
Expand All @@ -777,7 +796,29 @@ static void timer_set_traps(struct kvm_vcpu *vcpu, struct timer_map *map)
* within this function, reality kicks in and we start adding
* traps based on emulation requirements.
*/
tpt = tpc = false;
tvt = tpt = tvc = tpc = false;
tvt02 = tpt02 = false;

/*
* NV2 badly breaks the timer semantics by redirecting accesses to
* the EL1 timer state to memory, so let's call ECV to the rescue if
* available: we trap all CNT{P,V}_{CTL,CVAL,TVAL}_EL0 accesses.
*
* The treatment slightly varies depending whether we run a nVHE or
* VHE guest: nVHE will use the _EL0 registers directly, while VHE
* will use the _EL02 accessors. This translates in different trap
* bits.
*
* None of the trapping is required when running in non-HYP context,
* unless required by the L1 hypervisor settings once we advertise
* ECV+NV in the guest, or that we need trapping for other reasons.
*/
if (cpus_have_final_cap(ARM64_HAS_ECV) && is_hyp_ctxt(vcpu)) {
if (vcpu_el2_e2h_is_set(vcpu))
tvt02 = tpt02 = true;
else
tvt = tpt = true;
}

/*
* We have two possibility to deal with a physical offset:
Expand All @@ -792,10 +833,21 @@ static void timer_set_traps(struct kvm_vcpu *vcpu, struct timer_map *map)
if (!has_cntpoff() && timer_get_offset(map->direct_ptimer))
tpt = tpc = true;

/*
* For the poor sods that could not correctly substract one value
* from another, trap the full virtual timer and counter.
*/
if (has_broken_cntvoff() && timer_get_offset(map->direct_vtimer))
tvt = tvc = true;

/*
* Apply the enable bits that the guest hypervisor has requested for
* its own guest. We can only add traps that wouldn't have been set
* above.
* Implementation choices: we do not support NV when E2H=0 in the
* guest, and we don't support configuration where E2H is writable
* by the guest (either FEAT_VHE or FEAT_E2H0 is implemented, but
* not both). This simplifies the handling of the EL1NV* bits.
*/
if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) {
u64 val = __vcpu_sys_reg(vcpu, CNTHCTL_EL2);
Expand All @@ -806,6 +858,9 @@ static void timer_set_traps(struct kvm_vcpu *vcpu, struct timer_map *map)

tpt |= !(val & (CNTHCTL_EL1PCEN << 10));
tpc |= !(val & (CNTHCTL_EL1PCTEN << 10));

tpt02 |= (val & CNTHCTL_EL1NVPCT);
tvt02 |= (val & CNTHCTL_EL1NVVCT);
}

/*
Expand All @@ -817,6 +872,10 @@ static void timer_set_traps(struct kvm_vcpu *vcpu, struct timer_map *map)

assign_clear_set_bit(tpt, CNTHCTL_EL1PCEN << 10, set, clr);
assign_clear_set_bit(tpc, CNTHCTL_EL1PCTEN << 10, set, clr);
assign_clear_set_bit(tvt, CNTHCTL_EL1TVT, clr, set);
assign_clear_set_bit(tvc, CNTHCTL_EL1TVCT, clr, set);
assign_clear_set_bit(tvt02, CNTHCTL_EL1NVVCT, clr, set);
assign_clear_set_bit(tpt02, CNTHCTL_EL1NVPCT, clr, set);

/* This only happens on VHE, so use the CNTHCTL_EL2 accessor. */
sysreg_clear_set(cnthctl_el2, clr, set);
Expand Down Expand Up @@ -905,6 +964,54 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu)
kvm_timer_blocking(vcpu);
}

void kvm_timer_sync_nested(struct kvm_vcpu *vcpu)
{
/*
* When NV2 is on, guest hypervisors have their EL1 timer register
* accesses redirected to the VNCR page. Any guest action taken on
* the timer is postponed until the next exit, leading to a very
* poor quality of emulation.
*
* This is an unmitigated disaster, only papered over by FEAT_ECV,
* which allows trapping of the timer registers even with NV2.
* Still, this is still worse than FEAT_NV on its own. Meh.
*/
if (!vcpu_el2_e2h_is_set(vcpu)) {
if (cpus_have_final_cap(ARM64_HAS_ECV))
return;

/*
* A non-VHE guest hypervisor doesn't have any direct access
* to its timers: the EL2 registers trap (and the HW is
* fully emulated), while the EL0 registers access memory
* despite the access being notionally direct. Boo.
*
* We update the hardware timer registers with the
* latest value written by the guest to the VNCR page
* and let the hardware take care of the rest.
*/
write_sysreg_el0(__vcpu_sys_reg(vcpu, CNTV_CTL_EL0), SYS_CNTV_CTL);
write_sysreg_el0(__vcpu_sys_reg(vcpu, CNTV_CVAL_EL0), SYS_CNTV_CVAL);
write_sysreg_el0(__vcpu_sys_reg(vcpu, CNTP_CTL_EL0), SYS_CNTP_CTL);
write_sysreg_el0(__vcpu_sys_reg(vcpu, CNTP_CVAL_EL0), SYS_CNTP_CVAL);
} else {
/*
* For a VHE guest hypervisor, the EL2 state is directly
* stored in the host EL1 timers, while the emulated EL0
* state is stored in the VNCR page. The latter could have
* been updated behind our back, and we must reset the
* emulation of the timers.
*/
struct timer_map map;
get_timer_map(vcpu, &map);

soft_timer_cancel(&map.emul_vtimer->hrtimer);
soft_timer_cancel(&map.emul_ptimer->hrtimer);
timer_emulate(map.emul_vtimer);
timer_emulate(map.emul_ptimer);
}
}

/*
* With a userspace irqchip we have to check if the guest de-asserted the
* timer and if so, unmask the timer irq signal on the host interrupt
Expand Down Expand Up @@ -1363,6 +1470,37 @@ static int kvm_irq_init(struct arch_timer_kvm_info *info)
return 0;
}

static void kvm_timer_handle_errata(void)
{
u64 mmfr0, mmfr1, mmfr4;

/*
* CNTVOFF_EL2 is broken on some implementations. For those, we trap
* all virtual timer/counter accesses, requiring FEAT_ECV.
*
* However, a hypervisor supporting nesting is likely to mitigate the
* erratum at L0, and not require other levels to mitigate it (which
* would otherwise be a terrible performance sink due to trap
* amplification).
*
* Given that the affected HW implements both FEAT_VHE and FEAT_E2H0,
* and that NV is likely not to (because of limitations of the
* architecture), only enable the workaround when FEAT_VHE and
* FEAT_E2H0 are both detected. Time will tell if this actually holds.
*/
mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
mmfr4 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR4_EL1);
if (SYS_FIELD_GET(ID_AA64MMFR1_EL1, VH, mmfr1) &&
!SYS_FIELD_GET(ID_AA64MMFR4_EL1, E2H0, mmfr4) &&
SYS_FIELD_GET(ID_AA64MMFR0_EL1, ECV, mmfr0) &&
(has_vhe() || has_hvhe()) &&
cpus_have_final_cap(ARM64_WORKAROUND_QCOM_ORYON_CNTVOFF)) {
static_branch_enable(&broken_cntvoff_key);
kvm_info("Broken CNTVOFF_EL2, trapping virtual timer\n");
}
}

int __init kvm_timer_hyp_init(bool has_gic)
{
struct arch_timer_kvm_info *info;
Expand Down Expand Up @@ -1431,6 +1569,7 @@ int __init kvm_timer_hyp_init(bool has_gic)
goto out_free_vtimer_irq;
}

kvm_timer_handle_errata();
return 0;

out_free_ptimer_irq:
Expand Down
Loading

0 comments on commit 080612b

Please sign in to comment.