Skip to content

Commit

Permalink
KVM: Register cpuhp and syscore callbacks when enabling hardware
Browse files Browse the repository at this point in the history
Register KVM's cpuhp and syscore callback when enabling virtualization
in hardware instead of registering the callbacks during initialization,
and let the CPU up/down framework invoke the inner enable/disable
functions.  Registering the callbacks during initialization makes things
more complex than they need to be, as KVM needs to be very careful about
handling races between enabling CPUs being onlined/offlined and hardware
being enabled/disabled.

Intel TDX support will require KVM to enable virtualization during KVM
initialization, i.e. will add another wrinkle to things, at which point
sorting out the potential races with kvm_usage_count would become even
more complex.

Note, using the cpuhp framework has a subtle behavioral change: enabling
will be done serially across all CPUs, whereas KVM currently sends an IPI
to all CPUs in parallel.  While serializing virtualization enabling could
create undesirable latency, the issue is limited to the 0=>1 transition of
VM creation.  And even that can be mitigated, e.g. by letting userspace
force virtualization to be enabled when KVM is initialized.

Cc: Chao Gao <chao.gao@intel.com>
Reviewed-by: Kai Huang <kai.huang@intel.com>
Acked-by: Kai Huang <kai.huang@intel.com>
Tested-by: Farrah Chen <farrah.chen@intel.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-ID: <20240830043600.127750-3-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
  • Loading branch information
Sean Christopherson authored and Paolo Bonzini committed Sep 4, 2024
1 parent 44d1745 commit 9a798b1
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 117 deletions.
9 changes: 5 additions & 4 deletions Documentation/virt/kvm/locking.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@ KVM Lock Overview

The acquisition orders for mutexes are as follows:

- cpus_read_lock() is taken outside kvm_lock and kvm_usage_lock
- cpus_read_lock() is taken outside kvm_lock

- kvm_usage_lock is taken outside cpus_read_lock()

- kvm->lock is taken outside vcpu->mutex

Expand Down Expand Up @@ -241,9 +243,8 @@ time it will be set using the Dirty tracking mechanism described above.
:Arch: any
:Protects: - kvm_usage_count
- hardware virtualization enable/disable
:Comment: Exists because using kvm_lock leads to deadlock (see earlier comment
on cpus_read_lock() vs kvm_lock). Note, KVM also disables CPU hotplug via
cpus_read_lock() when enabling/disabling virtualization.
:Comment: Exists to allow taking cpus_read_lock() while kvm_usage_count is
protected, which simplifies the virtualization enabling logic.

``kvm->mn_invalidate_lock``
^^^^^^^^^^^^^^^^^^^^^^^^^^^
Expand Down
174 changes: 61 additions & 113 deletions virt/kvm/kvm_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -5578,7 +5578,7 @@ static DEFINE_PER_CPU(bool, hardware_enabled);
static DEFINE_MUTEX(kvm_usage_lock);
static int kvm_usage_count;

static int __hardware_enable_nolock(void)
static int hardware_enable_nolock(void)
{
if (__this_cpu_read(hardware_enabled))
return 0;
Expand All @@ -5593,34 +5593,18 @@ static int __hardware_enable_nolock(void)
return 0;
}

static void hardware_enable_nolock(void *failed)
{
if (__hardware_enable_nolock())
atomic_inc(failed);
}

static int kvm_online_cpu(unsigned int cpu)
{
int ret = 0;

/*
* Abort the CPU online process if hardware virtualization cannot
* be enabled. Otherwise running VMs would encounter unrecoverable
* errors when scheduled to this CPU.
*/
mutex_lock(&kvm_usage_lock);
if (kvm_usage_count)
ret = __hardware_enable_nolock();
mutex_unlock(&kvm_usage_lock);
return ret;
return hardware_enable_nolock();
}

static void hardware_disable_nolock(void *junk)
{
/*
* Note, hardware_disable_all_nolock() tells all online CPUs to disable
* hardware, not just CPUs that successfully enabled hardware!
*/
if (!__this_cpu_read(hardware_enabled))
return;

Expand All @@ -5631,78 +5615,10 @@ static void hardware_disable_nolock(void *junk)

static int kvm_offline_cpu(unsigned int cpu)
{
mutex_lock(&kvm_usage_lock);
if (kvm_usage_count)
hardware_disable_nolock(NULL);
mutex_unlock(&kvm_usage_lock);
hardware_disable_nolock(NULL);
return 0;
}

static void hardware_disable_all_nolock(void)
{
BUG_ON(!kvm_usage_count);

kvm_usage_count--;
if (!kvm_usage_count)
on_each_cpu(hardware_disable_nolock, NULL, 1);
}

static void hardware_disable_all(void)
{
cpus_read_lock();
mutex_lock(&kvm_usage_lock);
hardware_disable_all_nolock();
mutex_unlock(&kvm_usage_lock);
cpus_read_unlock();
}

static int hardware_enable_all(void)
{
atomic_t failed = ATOMIC_INIT(0);
int r;

/*
* Do not enable hardware virtualization if the system is going down.
* If userspace initiated a forced reboot, e.g. reboot -f, then it's
* possible for an in-flight KVM_CREATE_VM to trigger hardware enabling
* after kvm_reboot() is called. Note, this relies on system_state
* being set _before_ kvm_reboot(), which is why KVM uses a syscore ops
* hook instead of registering a dedicated reboot notifier (the latter
* runs before system_state is updated).
*/
if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ||
system_state == SYSTEM_RESTART)
return -EBUSY;

/*
* When onlining a CPU, cpu_online_mask is set before kvm_online_cpu()
* is called, and so on_each_cpu() between them includes the CPU that
* is being onlined. As a result, hardware_enable_nolock() may get
* invoked before kvm_online_cpu(), which also enables hardware if the
* usage count is non-zero. Disable CPU hotplug to avoid attempting to
* enable hardware multiple times.
*/
cpus_read_lock();
mutex_lock(&kvm_usage_lock);

r = 0;

kvm_usage_count++;
if (kvm_usage_count == 1) {
on_each_cpu(hardware_enable_nolock, &failed, 1);

if (atomic_read(&failed)) {
hardware_disable_all_nolock();
r = -EBUSY;
}
}

mutex_unlock(&kvm_usage_lock);
cpus_read_unlock();

return r;
}

static void kvm_shutdown(void)
{
/*
Expand Down Expand Up @@ -5734,8 +5650,7 @@ static int kvm_suspend(void)
lockdep_assert_not_held(&kvm_usage_lock);
lockdep_assert_irqs_disabled();

if (kvm_usage_count)
hardware_disable_nolock(NULL);
hardware_disable_nolock(NULL);
return 0;
}

Expand All @@ -5744,15 +5659,68 @@ static void kvm_resume(void)
lockdep_assert_not_held(&kvm_usage_lock);
lockdep_assert_irqs_disabled();

if (kvm_usage_count)
WARN_ON_ONCE(__hardware_enable_nolock());
WARN_ON_ONCE(hardware_enable_nolock());
}

static struct syscore_ops kvm_syscore_ops = {
.suspend = kvm_suspend,
.resume = kvm_resume,
.shutdown = kvm_shutdown,
};

static int hardware_enable_all(void)
{
int r;

guard(mutex)(&kvm_usage_lock);

if (kvm_usage_count++)
return 0;

r = cpuhp_setup_state(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
kvm_online_cpu, kvm_offline_cpu);
if (r)
goto err_cpuhp;

register_syscore_ops(&kvm_syscore_ops);

/*
* Undo virtualization enabling and bail if the system is going down.
* If userspace initiated a forced reboot, e.g. reboot -f, then it's
* possible for an in-flight operation to enable virtualization after
* syscore_shutdown() is called, i.e. without kvm_shutdown() being
* invoked. Note, this relies on system_state being set _before_
* kvm_shutdown(), e.g. to ensure either kvm_shutdown() is invoked
* or this CPU observes the impending shutdown. Which is why KVM uses
* a syscore ops hook instead of registering a dedicated reboot
* notifier (the latter runs before system_state is updated).
*/
if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ||
system_state == SYSTEM_RESTART) {
r = -EBUSY;
goto err_rebooting;
}

return 0;

err_rebooting:
unregister_syscore_ops(&kvm_syscore_ops);
cpuhp_remove_state(CPUHP_AP_KVM_ONLINE);
err_cpuhp:
--kvm_usage_count;
return r;
}

static void hardware_disable_all(void)
{
guard(mutex)(&kvm_usage_lock);

if (--kvm_usage_count)
return;

unregister_syscore_ops(&kvm_syscore_ops);
cpuhp_remove_state(CPUHP_AP_KVM_ONLINE);
}
#else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
static int hardware_enable_all(void)
{
Expand Down Expand Up @@ -6461,15 +6429,6 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
int r;
int cpu;

#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
kvm_online_cpu, kvm_offline_cpu);
if (r)
return r;

register_syscore_ops(&kvm_syscore_ops);
#endif

/* A kmem cache lets us meet the alignment requirements of fx_save. */
if (!vcpu_align)
vcpu_align = __alignof__(struct kvm_vcpu);
Expand All @@ -6480,10 +6439,8 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
offsetofend(struct kvm_vcpu, stats_id)
- offsetof(struct kvm_vcpu, arch),
NULL);
if (!kvm_vcpu_cache) {
r = -ENOMEM;
goto err_vcpu_cache;
}
if (!kvm_vcpu_cache)
return -ENOMEM;

for_each_possible_cpu(cpu) {
if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu),
Expand Down Expand Up @@ -6540,11 +6497,6 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
for_each_possible_cpu(cpu)
free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
kmem_cache_destroy(kvm_vcpu_cache);
err_vcpu_cache:
#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
unregister_syscore_ops(&kvm_syscore_ops);
cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
#endif
return r;
}
EXPORT_SYMBOL_GPL(kvm_init);
Expand All @@ -6566,10 +6518,6 @@ void kvm_exit(void)
kmem_cache_destroy(kvm_vcpu_cache);
kvm_vfio_ops_exit();
kvm_async_pf_deinit();
#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
unregister_syscore_ops(&kvm_syscore_ops);
cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
#endif
kvm_irqfd_exit();
}
EXPORT_SYMBOL_GPL(kvm_exit);
Expand Down

0 comments on commit 9a798b1

Please sign in to comment.