KVM: x86/mmu: Zap invalidated roots via asynchronous worker
Use the system worker threads to zap the roots invalidated
by the TDP MMU's "fast zap" mechanism, implemented by
kvm_tdp_mmu_invalidate_all_roots().

At this point, apart from allowing some parallelism in the zapping of
roots, the workqueue is a glorified linked list: work items are added and
flushed entirely within a single kvm->slots_lock critical section.  However,
the workqueue fixes a latent issue where kvm_tdp_mmu_zap_invalidated_roots()
assumes that it owns a reference to all invalid roots; therefore, no
one can set the invalid bit outside kvm_mmu_zap_all_fast().  Putting the
invalidated roots on a workqueue rather than on a linked list ensures that
tdp_mmu_zap_root_work() puts back only those extra references that
kvm_tdp_mmu_invalidate_all_roots() had gifted to it.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
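
[Editor's note: for readers unfamiliar with the workqueue API this patch leans on, below is a minimal, hypothetical out-of-tree module — all demo_* names are illustrative, not KVM symbols — sketching the queue-then-flush pattern: every item handed to queue_work() is guaranteed to have finished by the time flush_workqueue() returns, which is the property the reworked kvm_tdp_mmu_zap_invalidated_roots() relies on.]

#include <linux/init.h>
#include <linux/module.h>
#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

/* Hypothetical names for illustration only; these are not KVM symbols. */
struct demo_root {
	struct work_struct work;
	int id;
};

static struct workqueue_struct *demo_zap_wq;

static void demo_zap_work(struct work_struct *work)
{
	struct demo_root *root = container_of(work, struct demo_root, work);

	/* Stand-in for the deferred teardown done by the real worker. */
	pr_info("zapping root %d\n", root->id);
	kfree(root);
}

static int __init demo_init(void)
{
	struct demo_root *root;
	int i;

	demo_zap_wq = alloc_workqueue("demo_zap", WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
	if (!demo_zap_wq)
		return -ENOMEM;

	/* "Invalidate" a few roots: hand each one to the worker. */
	for (i = 0; i < 3; i++) {
		root = kzalloc(sizeof(*root), GFP_KERNEL);
		if (!root)
			break;
		root->id = i;
		INIT_WORK(&root->work, demo_zap_work);
		queue_work(demo_zap_wq, &root->work);
	}

	/*
	 * Like the "fast zap" path after this patch: waiting on the workqueue
	 * guarantees every queued item has run, with no separate list to walk.
	 */
	flush_workqueue(demo_zap_wq);
	return 0;
}

static void __exit demo_exit(void)
{
	destroy_workqueue(demo_zap_wq);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

In the patch itself no allocation is needed: the work item is embedded in struct kvm_mmu_page, in a union with unsync_child_bitmap, which TDP MMU pages do not use.
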
Paolo Bonzini committed Mar 8, 2022
1 parent bb95dfb commit 22b94c4
Showing 4 changed files with 76 additions and 87 deletions.
2 changes: 2 additions & 0 deletions arch/x86/include/asm/kvm_host.h
@@ -15,6 +15,7 @@
#include <linux/cpumask.h>
#include <linux/irq_work.h>
#include <linux/irq.h>
#include <linux/workqueue.h>

#include <linux/kvm.h>
#include <linux/kvm_para.h>
@@ -1217,6 +1218,7 @@ struct kvm_arch {
* the thread holds the MMU lock in write mode.
*/
spinlock_t tdp_mmu_pages_lock;
struct workqueue_struct *tdp_mmu_zap_wq;
#endif /* CONFIG_X86_64 */

/*
5 changes: 1 addition & 4 deletions arch/x86/kvm/mmu/mmu.c
@@ -5752,11 +5752,8 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
* Deferring the zap until the final reference to the root is put would
* lead to use-after-free.
*/
-	if (is_tdp_mmu_enabled(kvm)) {
-		read_lock(&kvm->mmu_lock);
+	if (is_tdp_mmu_enabled(kvm))
 		kvm_tdp_mmu_zap_invalidated_roots(kvm);
-		read_unlock(&kvm->mmu_lock);
-	}
}

static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
8 changes: 7 additions & 1 deletion arch/x86/kvm/mmu/mmu_internal.h
@@ -65,7 +65,13 @@ struct kvm_mmu_page {
struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
tdp_ptep_t ptep;
};
-	DECLARE_BITMAP(unsync_child_bitmap, 512);
+	union {
+		DECLARE_BITMAP(unsync_child_bitmap, 512);
+		struct {
+			struct work_struct tdp_mmu_async_work;
+			void *tdp_mmu_async_data;
+		};
+	};

struct list_head lpage_disallowed_link;
#ifdef CONFIG_X86_32
148 changes: 66 additions & 82 deletions arch/x86/kvm/mmu/tdp_mmu.c
@@ -25,6 +25,8 @@ bool kvm_mmu_init_tdp_mmu(struct kvm *kvm)
INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
kvm->arch.tdp_mmu_zap_wq =
alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0);

return true;
}
@@ -46,12 +48,16 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
if (!kvm->arch.tdp_mmu_enabled)
return;

flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);

WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));

/*
* Ensure that all the outstanding RCU callbacks to free shadow pages
- * can run before the VM is torn down.
+ * can run before the VM is torn down.  Work items on tdp_mmu_zap_wq
+ * can call kvm_tdp_mmu_put_root and create new callbacks.
*/
rcu_barrier();
}
@@ -81,6 +87,43 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
bool shared);

static void tdp_mmu_zap_root_work(struct work_struct *work)
{
struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page,
tdp_mmu_async_work);
struct kvm *kvm = root->tdp_mmu_async_data;

read_lock(&kvm->mmu_lock);

/*
* A TLB flush is not necessary as KVM performs a local TLB flush when
* allocating a new root (see kvm_mmu_load()), and when migrating vCPU
* to a different pCPU. Note, the local TLB flush on reuse also
* invalidates any paging-structure-cache entries, i.e. TLB entries for
* intermediate paging structures, that may be zapped, as such entries
* are associated with the ASID on both VMX and SVM.
*/
tdp_mmu_zap_root(kvm, root, true);

/*
* Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for
* avoiding an infinite loop. By design, the root is reachable while
* it's being asynchronously zapped, thus a different task can put its
* last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an
* asynchronously zapped root is unavoidable.
*/
kvm_tdp_mmu_put_root(kvm, root, true);

read_unlock(&kvm->mmu_lock);
}

static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
root->tdp_mmu_async_data = kvm;
INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work);
queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work);
}

void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
bool shared)
{
@@ -892,6 +935,13 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm)
int i;

/*
* Zap all roots, including invalid roots, as all SPTEs must be dropped
* before returning to the caller. Zap directly even if the root is
* also being zapped by a worker. Walking zapped top-level SPTEs isn't
* all that expensive and mmu_lock is already held, which means the
* worker has yielded, i.e. flushing the work instead of zapping here
* isn't guaranteed to be any faster.
*
* A TLB flush is unnecessary, KVM zaps everything if and only if the VM
* is being destroyed or the userspace VMM has exited. In both cases,
* KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
@@ -902,96 +952,28 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm)
}
}

-static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
-						  struct kvm_mmu_page *prev_root)
-{
-	struct kvm_mmu_page *next_root;
-
-	if (prev_root)
-		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
-						  &prev_root->link,
-						  typeof(*prev_root), link);
-	else
-		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
-						   typeof(*next_root), link);
-
-	while (next_root && !(next_root->role.invalid &&
-			      refcount_read(&next_root->tdp_mmu_root_count)))
-		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
-						  &next_root->link,
-						  typeof(*next_root), link);
-
-	return next_root;
-}
-
/*
* Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
* zap" completes. Since kvm_tdp_mmu_invalidate_all_roots() has acquired a
* reference to each invalidated root, roots will not be freed until after this
* function drops the gifted reference, e.g. so that vCPUs don't get stuck with
* tearing down paging structures.
* zap" completes.
*/
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
{
-	struct kvm_mmu_page *next_root;
-	struct kvm_mmu_page *root;
-
-	lockdep_assert_held_read(&kvm->mmu_lock);
-
-	rcu_read_lock();
-
-	root = next_invalidated_root(kvm, NULL);
-
-	while (root) {
-		next_root = next_invalidated_root(kvm, root);
-
-		rcu_read_unlock();
-
-		/*
-		 * A TLB flush is unnecessary, invalidated roots are guaranteed
-		 * to be unreachable by the guest (see kvm_tdp_mmu_put_root()
-		 * for more details), and unlike the legacy MMU, no vCPU kick
-		 * is needed to play nice with lockless shadow walks as the TDP
-		 * MMU protects its paging structures via RCU.  Note, zapping
-		 * will still flush on yield, but that's a minor performance
-		 * blip and not a functional issue.
-		 */
-		tdp_mmu_zap_root(kvm, root, true);
-
-		/*
-		 * Put the reference acquired in
-		 * kvm_tdp_mmu_invalidate_roots
-		 */
-		kvm_tdp_mmu_put_root(kvm, root, true);
-
-		root = next_root;
-
-		rcu_read_lock();
-	}
-
-	rcu_read_unlock();
+	flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
}

/*
* Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
- * is about to be zapped, e.g. in response to a memslots update.  The caller is
- * responsible for invoking kvm_tdp_mmu_zap_invalidated_roots() to do the actual
- * zapping.
+ * is about to be zapped, e.g. in response to a memslots update.  The actual
+ * zapping is performed asynchronously, so a reference is taken on all roots.
+ * Using a separate workqueue makes it easy to ensure that the destruction is
+ * performed before the "fast zap" completes, without keeping a separate list
+ * of invalidated roots; the list is effectively the list of work items in
+ * the workqueue.
*
- * Take a reference on all roots to prevent the root from being freed before it
- * is zapped by this thread.  Freeing a root is not a correctness issue, but if
- * a vCPU drops the last reference to a root prior to the root being zapped, it
- * will get stuck with tearing down the entire paging structure.
- *
- * Get a reference even if the root is already invalid,
- * kvm_tdp_mmu_zap_invalidated_roots() assumes it was gifted a reference to all
- * invalid roots, e.g. there's no epoch to identify roots that were invalidated
- * by a previous call.  Roots stay on the list until the last reference is
- * dropped, so even though all invalid roots are zapped, a root may not go away
- * for quite some time, e.g. if a vCPU blocks across multiple memslot updates.
- *
- * Because mmu_lock is held for write, it should be impossible to observe a
- * root with zero refcount, i.e. the list of roots cannot be stale.
+ * Get a reference even if the root is already invalid, the asynchronous worker
+ * assumes it was gifted a reference to the root it processes.  Because mmu_lock
+ * is held for write, it should be impossible to observe a root with zero refcount,
+ * i.e. the list of roots cannot be stale.
*
* This has essentially the same effect for the TDP MMU
* as updating mmu_valid_gen does for the shadow MMU.
@@ -1002,8 +984,10 @@ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)

lockdep_assert_held_write(&kvm->mmu_lock);
list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
-		if (!WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root)))
+		if (!WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) {
 			root->role.invalid = true;
+			tdp_mmu_schedule_zap_root(kvm, root);
+		}
}
}
